[dpdk-dev] [21.08 PATCH v3 1/1] power: refactor pstate sysfs handling
Currently, pstate sysfs handling code is a bit of an unmaintainable mess, which has contributed to various errors leading to bugs. Refactor the code in a way that makes it more maintainable and less error prone. Signed-off-by: Anatoly Burakov --- lib/power/meson.build| 7 + lib/power/power_pstate_cpufreq.c | 357 --- 2 files changed, 191 insertions(+), 173 deletions(-) diff --git a/lib/power/meson.build b/lib/power/meson.build index a2cc9fe2ef..85324d48d2 100644 --- a/lib/power/meson.build +++ b/lib/power/meson.build @@ -5,6 +5,13 @@ if not is_linux build = false reason = 'only supported on Linux' endif + +# we do some snprintf magic so silence format-nonliteral +flag_nonliteral = '-Wno-format-nonliteral' +if cc.has_argument(flag_nonliteral) + cflags += flag_nonliteral +endif + sources = files( 'guest_channel.c', 'power_acpi_cpufreq.c', diff --git a/lib/power/power_pstate_cpufreq.c b/lib/power/power_pstate_cpufreq.c index 2cfc54acf3..4357ac4920 100644 --- a/lib/power/power_pstate_cpufreq.c +++ b/lib/power/power_pstate_cpufreq.c @@ -37,6 +37,13 @@ } \ } while (0) +#define FOPEN_OR_ERR_GOTO(f, label) do { \ + if ((f) == NULL) { \ + RTE_LOG(ERR, POWER, "File not opened\n"); \ + goto label; \ + } \ +} while (0) + #define FOPS_OR_NULL_GOTO(ret, label) do { \ if ((ret) == NULL) { \ RTE_LOG(ERR, POWER, "fgets returns nothing\n"); \ @@ -148,97 +155,145 @@ out: close(fd); return ret; } +static int +open_core_sysfs_file(const char *template, unsigned int core, const char *mode, + FILE **f) +{ + char fullpath[PATH_MAX]; + FILE *tmpf; + + /* silenced -Wformat-nonliteral here */ + snprintf(fullpath, sizeof(fullpath), template, core); + tmpf = fopen(fullpath, mode); + if (tmpf == NULL) + return -1; + *f = tmpf; + + return 0; +} + +static int +read_core_sysfs_u32(FILE *f, uint32_t *val) +{ + char buf[BUFSIZ]; + uint32_t fval; + char *s; + + s = fgets(buf, sizeof(buf), f); + if (s == NULL) + return -1; + + /* fgets puts null terminator in, but do this just in case */ + buf[BUFSIZ - 
1] = '\0'; + + /* strip off any terminating newlines */ + *strchrnul(buf, '\n') = '\0'; + + fval = strtoul(buf, NULL, POWER_CONVERT_TO_DECIMAL); + + /* write the value */ + *val = fval; + + return 0; +} + +static int +read_core_sysfs_s(FILE *f, char *buf, unsigned int len) +{ + char *s; + + s = fgets(buf, len, f); + if (s == NULL) + return -1; + + /* fgets puts null terminator in, but do this just in case */ + buf[len - 1] = '\0'; + + /* strip off any terminating newlines */ + *strchrnul(buf, '\n') = '\0'; + + return 0; +} + +static int +write_core_sysfs_s(FILE *f, const char *str) +{ + int ret; + + ret = fseek(f, 0, SEEK_SET); + if (ret != 0) + return -1; + + ret = fputs(str, f); + if (ret != 0) + return -1; + + /* flush the output */ + ret = fflush(f); + if (ret != 0) + return -1; + + return 0; +} + /** * It is to fopen the sys file for the future setting the lcore frequency. */ static int power_init_for_setting_freq(struct pstate_power_info *pi) { - FILE *f_min, *f_max, *f_base = NULL, *f_base_max; - char fullpath_min[PATH_MAX]; - char fullpath_max[PATH_MAX]; - char fullpath_base[PATH_MAX]; - char fullpath_base_max[PATH_MAX]; - char buf_base[BUFSIZ]; - char *s_base; - char *s_base_max; - uint32_t base_ratio = 0; - uint32_t base_max_ratio = 0; - uint64_t max_non_turbo = 0; - int ret_val = 0; - - snprintf(fullpath_base_max, - sizeof(fullpath_base_max), - POWER_SYSFILE_BASE_MAX_FREQ, - pi->lcore_id); - f_base_max = fopen(fullpath_base_max, "r"); - FOPEN_OR_ERR_RET(f_base_max, -1); - if (f_base_max != NULL) { - s_base_max = fgets(buf_base, sizeof(buf_base), f_base_max); - - /* close the file unconditionally */ - fclose(f_base_max); - f_base_max = NULL; - - FOPS_OR_NULL_GOTO(s_base_max, out); - - buf_base[BUFSIZ-1] = '\0'; - if (strlen(buf_base)) - /* Strip off terminating '\n' */ - strtok(buf_base, "\n"); - - base_max_ratio = - strtoul(buf_base, NULL, POWER_CONVERT_TO_DECIMAL) -
[dpdk-dev] [PATCH v1 1/1] power: do not skip saving original acpi governor
Currently, when we set the acpi governor to "userspace", we check if it is already set to this value, and if it is, we skip setting it. However, we never save this value anywhere, so that next time we come back and request the governor to be set to its original value, the original value is empty. Fix it by saving the original pstate governor first. While we're at it, replace `strlcpy` with `rte_strscpy`. Fixes: 445c6528b55f ("power: common interface for guest and host") Cc: david.h...@intel.com Signed-off-by: Anatoly Burakov --- lib/power/power_acpi_cpufreq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/power/power_acpi_cpufreq.c b/lib/power/power_acpi_cpufreq.c index 84a9d75207..d028a9947f 100644 --- a/lib/power/power_acpi_cpufreq.c +++ b/lib/power/power_acpi_cpufreq.c @@ -152,6 +152,9 @@ power_set_governor_userspace(struct rte_power_info *pi) /* Strip off terminating '\n' */ strtok(buf, "\n"); + /* Save the original governor */ + rte_strscpy(pi->governor_ori, buf, sizeof(pi->governor_ori)); + /* Check if current governor is userspace */ if (strncmp(buf, POWER_GOVERNOR_USERSPACE, sizeof(POWER_GOVERNOR_USERSPACE)) == 0) { @@ -160,8 +163,6 @@ power_set_governor_userspace(struct rte_power_info *pi) "already userspace\n", pi->lcore_id); goto out; } - /* Save the original governor */ - strlcpy(pi->governor_ori, buf, sizeof(pi->governor_ori)); /* Write 'userspace' to the governor */ val = fseek(f, 0, SEEK_SET); -- 2.25.1
[dpdk-dev] [21.08 PATCH v4 1/2] power: don't use rte prefix in internal code
Currently, ACPI code uses rte_power_info as the struct name, which gives the appearance that this is an externally visible API. Fix to use internal namespace. Signed-off-by: Anatoly Burakov --- lib/power/power_acpi_cpufreq.c | 34 +- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/lib/power/power_acpi_cpufreq.c b/lib/power/power_acpi_cpufreq.c index d028a9947f..1b8c69cc8b 100644 --- a/lib/power/power_acpi_cpufreq.c +++ b/lib/power/power_acpi_cpufreq.c @@ -78,7 +78,7 @@ enum power_state { /** * Power info per lcore. */ -struct rte_power_info { +struct acpi_power_info { unsigned int lcore_id; /**< Logical core id */ uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */ uint32_t nb_freqs; /**< number of available freqs */ @@ -90,14 +90,14 @@ struct rte_power_info { uint16_t turbo_enable; /**< Turbo Boost enable/disable */ } __rte_cache_aligned; -static struct rte_power_info lcore_power_info[RTE_MAX_LCORE]; +static struct acpi_power_info lcore_power_info[RTE_MAX_LCORE]; /** * It is to set specific freq for specific logical core, according to the index * of supported frequencies. */ static int -set_freq_internal(struct rte_power_info *pi, uint32_t idx) +set_freq_internal(struct acpi_power_info *pi, uint32_t idx) { if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) { RTE_LOG(ERR, POWER, "Invalid frequency index %u, which " @@ -133,7 +133,7 @@ set_freq_internal(struct rte_power_info *pi, uint32_t idx) * governor will be saved for rolling back. */ static int -power_set_governor_userspace(struct rte_power_info *pi) +power_set_governor_userspace(struct acpi_power_info *pi) { FILE *f; int ret = -1; @@ -189,7 +189,7 @@ power_set_governor_userspace(struct rte_power_info *pi) * sys file. 
*/ static int -power_get_available_freqs(struct rte_power_info *pi) +power_get_available_freqs(struct acpi_power_info *pi) { FILE *f; int ret = -1, i, count; @@ -259,7 +259,7 @@ power_get_available_freqs(struct rte_power_info *pi) * It is to fopen the sys file for the future setting the lcore frequency. */ static int -power_init_for_setting_freq(struct rte_power_info *pi) +power_init_for_setting_freq(struct acpi_power_info *pi) { FILE *f; char fullpath[PATH_MAX]; @@ -299,7 +299,7 @@ power_acpi_cpufreq_check_supported(void) int power_acpi_cpufreq_init(unsigned int lcore_id) { - struct rte_power_info *pi; + struct acpi_power_info *pi; uint32_t exp_state; if (lcore_id >= RTE_MAX_LCORE) { @@ -374,7 +374,7 @@ power_acpi_cpufreq_init(unsigned int lcore_id) * needed by writing the sys file. */ static int -power_set_governor_original(struct rte_power_info *pi) +power_set_governor_original(struct acpi_power_info *pi) { FILE *f; int ret = -1; @@ -420,7 +420,7 @@ power_set_governor_original(struct rte_power_info *pi) int power_acpi_cpufreq_exit(unsigned int lcore_id) { - struct rte_power_info *pi; + struct acpi_power_info *pi; uint32_t exp_state; if (lcore_id >= RTE_MAX_LCORE) { @@ -475,7 +475,7 @@ power_acpi_cpufreq_exit(unsigned int lcore_id) uint32_t power_acpi_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num) { - struct rte_power_info *pi; + struct acpi_power_info *pi; if (lcore_id >= RTE_MAX_LCORE) { RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); @@ -522,7 +522,7 @@ power_acpi_cpufreq_set_freq(unsigned int lcore_id, uint32_t index) int power_acpi_cpufreq_freq_down(unsigned int lcore_id) { - struct rte_power_info *pi; + struct acpi_power_info *pi; if (lcore_id >= RTE_MAX_LCORE) { RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); @@ -540,7 +540,7 @@ power_acpi_cpufreq_freq_down(unsigned int lcore_id) int power_acpi_cpufreq_freq_up(unsigned int lcore_id) { - struct rte_power_info *pi; + struct acpi_power_info *pi; if (lcore_id >= RTE_MAX_LCORE) { RTE_LOG(ERR, POWER, 
"Invalid lcore ID\n"); @@ -581,7 +581,7 @@ power_acpi_cpufreq_freq_max(unsigned int lcore_id) int power_acpi_cpufreq_freq_min(unsigned int lcore_id) { - struct rte_power_info *pi; + struct acpi_power_info *pi; if (lcore_id >= RTE_MAX_LCORE) { RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); @@ -598,7 +598,7 @@ power_acpi_cpufreq_freq_min(unsigned int lcore_id) int power_acpi_turbo_status(unsigned int lcore_id) { - struct rte_power_info *pi; + struct acpi_power_info *pi; if (lcore_id >= RTE_MAX_LCORE) { RTE_LOG(ERR, POWER, "Invalid lcore ID\n"); @@ -614,7 +614,7 @@ power_acpi_turbo_status(unsigned int lcore_id) int power_acpi_enable
[dpdk-dev] [21.08 PATCH v4 2/2] power: refactor pstate and acpi code
Currently, ACPI and PSTATE modes have lots of code duplication, confusing logic, and a bunch of other issues that can, and have, led to various bugs and resource leaks. This commit factors out the common parts of sysfs reading/writing for ACPI and PSTATE drivers. Signed-off-by: Anatoly Burakov --- lib/power/meson.build| 7 + lib/power/power_acpi_cpufreq.c | 178 +++-- lib/power/power_common.c | 133 + lib/power/power_common.h | 46 + lib/power/power_pstate_cpufreq.c | 332 --- 5 files changed, 293 insertions(+), 403 deletions(-) diff --git a/lib/power/meson.build b/lib/power/meson.build index a2cc9fe2ef..85324d48d2 100644 --- a/lib/power/meson.build +++ b/lib/power/meson.build @@ -5,6 +5,13 @@ if not is_linux build = false reason = 'only supported on Linux' endif + +# we do some snprintf magic so silence format-nonliteral +flag_nonliteral = '-Wno-format-nonliteral' +if cc.has_argument(flag_nonliteral) + cflags += flag_nonliteral +endif + sources = files( 'guest_channel.c', 'power_acpi_cpufreq.c', diff --git a/lib/power/power_acpi_cpufreq.c b/lib/power/power_acpi_cpufreq.c index 1b8c69cc8b..97f1d302c9 100644 --- a/lib/power/power_acpi_cpufreq.c +++ b/lib/power/power_acpi_cpufreq.c @@ -19,41 +19,10 @@ #include "power_acpi_cpufreq.h" #include "power_common.h" -#ifdef RTE_LIBRTE_POWER_DEBUG -#define POWER_DEBUG_TRACE(fmt, args...) do { \ - RTE_LOG(ERR, POWER, "%s: " fmt, __func__, ## args); \ -} while (0) -#else -#define POWER_DEBUG_TRACE(fmt, args...) 
-#endif - -#define FOPEN_OR_ERR_RET(f, retval) do { \ - if ((f) == NULL) { \ - RTE_LOG(ERR, POWER, "File not opened\n"); \ - return retval; \ - } \ -} while (0) - -#define FOPS_OR_NULL_GOTO(ret, label) do { \ - if ((ret) == NULL) { \ - RTE_LOG(ERR, POWER, "fgets returns nothing\n"); \ - goto label; \ - } \ -} while (0) - -#define FOPS_OR_ERR_GOTO(ret, label) do { \ - if ((ret) < 0) { \ - RTE_LOG(ERR, POWER, "File operations failed\n"); \ - goto label; \ - } \ -} while (0) - #define STR_SIZE 1024 #define POWER_CONVERT_TO_DECIMAL 10 #define POWER_GOVERNOR_USERSPACE "userspace" -#define POWER_SYSFILE_GOVERNOR \ - "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor" #define POWER_SYSFILE_AVAIL_FREQ \ "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_available_frequencies" #define POWER_SYSFILE_SETSPEED \ @@ -135,53 +104,18 @@ set_freq_internal(struct acpi_power_info *pi, uint32_t idx) static int power_set_governor_userspace(struct acpi_power_info *pi) { - FILE *f; - int ret = -1; - char buf[BUFSIZ]; - char fullpath[PATH_MAX]; - char *s; - int val; - - snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR, - pi->lcore_id); - f = fopen(fullpath, "rw+"); - FOPEN_OR_ERR_RET(f, ret); - - s = fgets(buf, sizeof(buf), f); - FOPS_OR_NULL_GOTO(s, out); - /* Strip off terminating '\n' */ - strtok(buf, "\n"); - - /* Save the original governor */ - rte_strscpy(pi->governor_ori, buf, sizeof(pi->governor_ori)); - - /* Check if current governor is userspace */ - if (strncmp(buf, POWER_GOVERNOR_USERSPACE, - sizeof(POWER_GOVERNOR_USERSPACE)) == 0) { - ret = 0; - POWER_DEBUG_TRACE("Power management governor of lcore %u is " - "already userspace\n", pi->lcore_id); - goto out; - } - - /* Write 'userspace' to the governor */ - val = fseek(f, 0, SEEK_SET); - FOPS_OR_ERR_GOTO(val, out); - - val = fputs(POWER_GOVERNOR_USERSPACE, f); - FOPS_OR_ERR_GOTO(val, out); - - /* We need to flush to see if the fputs succeeds */ - val = fflush(f); - FOPS_OR_ERR_GOTO(val, out); - - ret = 0; - 
RTE_LOG(INFO, POWER, "Power management governor of lcore %u has been " - "set to user space successfully\n", pi->lcore_id); -out: - fclose(f); - - return ret; + return power_set_governor(pi->lcore_id, POWER_GOVERNOR_USERSPACE, + pi->governor_ori, sizeof(pi->governor_ori)); +} + +/** + * It is to check the governor and then set the original governor back if + * needed by writing the sys file. + */ +static int +power_set_governor_original(struct acpi_power_info *pi) +{ + return power_set_governor(pi->lcore_id, p
[dpdk-dev] [PATCH v1 2/2] net/i40e: allow get_monitor_addr for VF driver
When .get_monitor_addr API was introduced, it was implemented in the i40e driver, but only for the physical function; the virtual function portion of the driver does not support that API. Add the missing function pointer to VF device structure. The i40e driver is not meant to use the VF portion any more, as currently i40e VF devices are supposed to be managed by iavf driver, but add this just in case it needs backporting later. Fixes: a683abf90a22 ("net/i40e: implement power management API") Signed-off-by: Anatoly Burakov --- drivers/net/i40e/i40e_ethdev_vf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/i40e/i40e_ethdev_vf.c b/drivers/net/i40e/i40e_ethdev_vf.c index 3c258ba7cf..156ad9ab96 100644 --- a/drivers/net/i40e/i40e_ethdev_vf.c +++ b/drivers/net/i40e/i40e_ethdev_vf.c @@ -216,6 +216,7 @@ static const struct eth_dev_ops i40evf_eth_dev_ops = { .mtu_set = i40evf_dev_mtu_set, .mac_addr_set = i40evf_set_default_mac_addr, .tx_done_cleanup = i40e_tx_done_cleanup, + .get_monitor_addr = i40e_get_monitor_addr }; /* -- 2.25.1
[dpdk-dev] [PATCH v1 1/2] net/ixgbe: allow get_monitor_addr for VF driver
When .get_monitor_addr API was introduced, it was implemented in the ixgbe driver, but only for the physical function; the virtual function portion of the driver does not support that API. Add the missing function pointer to VF device structure. Fixes: 3982b7967bb7 ("net/ixgbe: implement power management API") Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/ixgbe_ethdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c index ff65145f55..6cca039a11 100644 --- a/drivers/net/ixgbe/ixgbe_ethdev.c +++ b/drivers/net/ixgbe/ixgbe_ethdev.c @@ -605,6 +605,7 @@ static const struct eth_dev_ops ixgbevf_eth_dev_ops = { .rss_hash_update = ixgbe_dev_rss_hash_update, .rss_hash_conf_get= ixgbe_dev_rss_hash_conf_get, .tx_done_cleanup = ixgbe_dev_tx_done_cleanup, + .get_monitor_addr = ixgbe_get_monitor_addr, }; /* store statistics names and its offset in stats structure */ -- 2.25.1
[dpdk-dev] [PATCH v2 0/7] Enhancements for PMD power management
This patchset introduces several changes related to PMD power management: - Changed monitoring intrinsics to use callbacks as a comparison function, based on previous patchset [1] but incorporating feedback [2] - this hopefully will make it possible to add support for .get_monitor_addr in virtio - Add a new intrinsic to monitor multiple addresses, based on RTM instruction set and the TPAUSE instruction - Add support for PMD power management on multiple queues, as well as all accompanying infrastructure and example apps changes v2: - Changed check inversion to callbacks - Addressed feedback from Konstantin - Added doc updates where necessary [1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=* [2] http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274 Anatoly Burakov (7): power_intrinsics: use callbacks for comparison net/af_xdp: add power monitor support eal: add power monitor for multiple events power: remove thread safety from PMD power API's power: support callbacks for multiple Rx queues power: support monitoring multiple Rx queues l3fwd-power: support multiqueue in PMD pmgmt modes doc/guides/prog_guide/power_man.rst | 83 ++- doc/guides/rel_notes/release_21_08.rst| 11 + drivers/event/dlb2/dlb2.c | 16 +- drivers/net/af_xdp/rte_eth_af_xdp.c | 33 + drivers/net/i40e/i40e_rxtx.c | 19 +- drivers/net/iavf/iavf_rxtx.c | 19 +- drivers/net/ice/ice_rxtx.c| 19 +- drivers/net/ixgbe/ixgbe_rxtx.c| 19 +- drivers/net/mlx5/mlx5_rx.c| 16 +- examples/l3fwd-power/main.c | 39 +- lib/eal/arm/rte_power_intrinsics.c| 11 + lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 64 +- lib/eal/ppc/rte_power_intrinsics.c| 11 + lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 78 ++- lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c| 574 +- lib/power/rte_power_pmd_mgmt.h| 40 ++ lib/power/version.map | 3 + 21 files 
changed, 841 insertions(+), 224 deletions(-) -- 2.25.1
[dpdk-dev] [PATCH v2 1/7] power_intrinsics: use callbacks for comparison
Previously, the semantics of power monitor were such that we were checking current value against the expected value, and if they matched, then the sleep was aborted. This is somewhat inflexible, because it only allowed us to check for a specific value. This commit replaces the comparison with a user callback mechanism, so that any PMD (or other code) using `rte_power_monitor()` can define their own comparison semantics and decision making on how to detect the need to abort the entering of power optimized state. Existing implementations are adjusted to follow the new semantics. Suggested-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov --- Notes: v2: - Use callback mechanism for more flexibility - Address feedback from Konstantin doc/guides/rel_notes/release_21_08.rst| 1 + drivers/event/dlb2/dlb2.c | 16 -- drivers/net/i40e/i40e_rxtx.c | 19 drivers/net/iavf/iavf_rxtx.c | 19 drivers/net/ice/ice_rxtx.c| 19 drivers/net/ixgbe/ixgbe_rxtx.c| 19 drivers/net/mlx5/mlx5_rx.c| 16 -- .../include/generic/rte_power_intrinsics.h| 29 ++- lib/eal/x86/rte_power_intrinsics.c| 9 ++ 9 files changed, 106 insertions(+), 41 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index a6ecfdf3ce..c84ac280f5 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -84,6 +84,7 @@ API Changes Also, make sure to start the actual text at the margin. === +* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. 
ABI Changes --- diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c index eca183753f..14dfac257c 100644 --- a/drivers/event/dlb2/dlb2.c +++ b/drivers/event/dlb2/dlb2.c @@ -3154,6 +3154,15 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num) } } +#define CLB_MASK_IDX 0 +#define CLB_VAL_IDX 1 +static int +dlb2_monitor_callback(const uint64_t val, const uint64_t opaque[4]) +{ + /* abort if the value matches */ + return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 : 0; +} + static inline int dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, struct dlb2_eventdev_port *ev_port, @@ -3194,8 +3203,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, expected_value = 0; pmc.addr = monitor_addr; - pmc.val = expected_value; - pmc.mask = qe_mask.raw_qe[1]; + /* store expected value and comparison mask in opaque data */ + pmc.opaque[CLB_VAL_IDX] = expected_value; + pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1]; + /* set up callback */ + pmc.fn = dlb2_monitor_callback; pmc.size = sizeof(uint64_t); rte_power_monitor(&pmc, timeout + start_ticks); diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c index 6c58decece..45f3fbf4ec 100644 --- a/drivers/net/i40e/i40e_rxtx.c +++ b/drivers/net/i40e/i40e_rxtx.c @@ -81,6 +81,17 @@ #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \ (PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK) +static int +i40e_monitor_callback(const uint64_t value, const uint64_t arg[4] __rte_unused) +{ + const uint64_t m = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* +* we expect the DD bit to be set to 1 if this descriptor was already +* written to. +*/ + return (value & m) == m ? 
-1 : 0; +} + int i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) { @@ -93,12 +104,8 @@ i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) /* watch for changes in status bit */ pmc->addr = &rxdp->wb.qword1.status_error_len; - /* -* we expect the DD bit to be set to 1 if this descriptor was already -* written to. -*/ - pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); - pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* comparison callback */ + pmc->fn = i40e_monitor_callback; /* registers are 64-bit */ pmc->size = sizeof(uint64_t); diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index 0361af0d85..6e12ecce07 100644 --- a/drivers/net/iavf/iavf_rxtx.c +++ b/drivers/net/iavf/iavf_rxtx.c @@ -57,6 +57,17 @@ iavf_proto_xtr_type_to_rxdid(uint8_t flex_type) rxdid_map[flex_type] : IAVF_RXDID_COMMS_OVS_1; } +static int +iavf_monitor_callback(const uint64_t value, const uint64_t arg[4] __rte_unus
[dpdk-dev] [PATCH v2 2/7] net/af_xdp: add power monitor support
Implement support for .get_monitor_addr in AF_XDP driver. Signed-off-by: Anatoly Burakov --- Notes: v2: - Rewrite using the callback mechanism drivers/net/af_xdp/rte_eth_af_xdp.c | 33 + 1 file changed, 33 insertions(+) diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c index eb5660a3dc..8b9c89c3e8 100644 --- a/drivers/net/af_xdp/rte_eth_af_xdp.c +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "compat.h" @@ -788,6 +789,37 @@ eth_dev_configure(struct rte_eth_dev *dev) return 0; } +#define CLB_VAL_IDX 0 +static int +eth_monitor_callback(const uint64_t value, const uint64_t opaque[4]) +{ + const uint64_t v = opaque[CLB_VAL_IDX]; + const uint64_t m = (uint32_t)~0; + + /* if the value has changed, abort entering power optimized state */ + return (value & m) == v ? 0 : -1; +} + +static int +eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) +{ + struct pkt_rx_queue *rxq = rx_queue; + unsigned int *prod = rxq->rx.producer; + const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */ + + /* watch for changes in producer ring */ + pmc->addr = (void*)prod; + + /* store current value */ + pmc->opaque[CLB_VAL_IDX] = cur_val; + pmc->fn = eth_monitor_callback; + + /* AF_XDP producer ring index is 32-bit */ + pmc->size = sizeof(uint32_t); + + return 0; +} + static int eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) { @@ -1448,6 +1480,7 @@ static const struct eth_dev_ops ops = { .link_update = eth_link_update, .stats_get = eth_stats_get, .stats_reset = eth_stats_reset, + .get_monitor_addr = eth_get_monitor_addr }; /** parse busy_budget argument */ -- 2.25.1
[dpdk-dev] [PATCH v2 3/7] eal: add power monitor for multiple events
Use RTM and WAITPKG instructions to perform a wait-for-writes similar to what UMWAIT does, but without the limitation of having to listen for just one event. This works because the optimized power state used by the TPAUSE instruction will cause a wake up on RTM transaction abort, so if we add the addresses we're interested in to the read-set, any write to those addresses will wake us up. Signed-off-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov --- Notes: v2: - Adapt to callback mechanism doc/guides/rel_notes/release_21_08.rst| 2 + lib/eal/arm/rte_power_intrinsics.c| 11 +++ lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 35 ++ lib/eal/ppc/rte_power_intrinsics.c| 11 +++ lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 69 +++ 8 files changed, 135 insertions(+) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index c84ac280f5..9d1cfac395 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -55,6 +55,8 @@ New Features Also, make sure to start the actual text at the margin. === +* eal: added ``rte_power_monitor_multi`` to support waiting for multiple events. 
+ Removed Items - diff --git a/lib/eal/arm/rte_power_intrinsics.c b/lib/eal/arm/rte_power_intrinsics.c index e83f04072a..78f55b7203 100644 --- a/lib/eal/arm/rte_power_intrinsics.c +++ b/lib/eal/arm/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp) +{ + RTE_SET_USED(pmc); + RTE_SET_USED(num); + RTE_SET_USED(tsc_timestamp); + + return -ENOTSUP; +} diff --git a/lib/eal/include/generic/rte_cpuflags.h b/lib/eal/include/generic/rte_cpuflags.h index 28a5aecde8..d35551e931 100644 --- a/lib/eal/include/generic/rte_cpuflags.h +++ b/lib/eal/include/generic/rte_cpuflags.h @@ -24,6 +24,8 @@ struct rte_cpu_intrinsics { /**< indicates support for rte_power_monitor function */ uint32_t power_pause : 1; /**< indicates support for rte_power_pause function */ + uint32_t power_monitor_multi : 1; + /**< indicates support for rte_power_monitor_multi function */ }; /** diff --git a/lib/eal/include/generic/rte_power_intrinsics.h b/lib/eal/include/generic/rte_power_intrinsics.h index 046667ade6..877fb282cb 100644 --- a/lib/eal/include/generic/rte_power_intrinsics.h +++ b/lib/eal/include/generic/rte_power_intrinsics.h @@ -124,4 +124,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id); __rte_experimental int rte_power_pause(const uint64_t tsc_timestamp); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Monitor a set of addresses for changes. This will cause the CPU to enter an + * architecture-defined optimized power state until either one of the specified + * memory addresses is written to, a certain TSC timestamp is reached, or other + * reasons cause the CPU to wake up. + * + * Additionally, `expected` 64-bit values and 64-bit masks are provided. 
If + * mask is non-zero, the current value pointed to by the `p` pointer will be + * checked against the expected value, and if they do not match, the entering of + * optimized power state may be aborted. + * + * @warning It is responsibility of the user to check if this function is + * supported at runtime using `rte_cpu_get_intrinsics_support()` API call. + * Failing to do so may result in an illegal CPU instruction error. + * + * @param pmc + * An array of monitoring condition structures. + * @param num + * Length of the `pmc` array. + * @param tsc_timestamp + * Maximum TSC timestamp to wait for. Note that the wait behavior is + * architecture-dependent. + * + * @return + * 0 on success + * -EINVAL on invalid parameters + * -ENOTSUP if unsupported + */ +__rte_experimental +int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp); + #endif /* _RTE_POWER_INTRINSIC_H_ */ diff --git a/lib/eal/ppc/rte_power_intrinsics.c b/lib/eal/ppc/rte_power_intrinsics.c index 7fc9586da7..f00b58ade5 100644 --- a/lib/eal/ppc/rte_power_intrinsics.c +++ b/lib/eal/ppc/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp) +{ + RTE_SET_USED(pmc); + RTE_SET_USED(num); +
[dpdk-dev] [PATCH v2 4/7] power: remove thread safety from PMD power API's
Currently, we expect that only one callback can be active at any given moment, for a particular queue configuration, which is relatively easy to implement in a thread-safe way. However, we're about to add support for multiple queues per lcore, which will greatly increase the possibility of various race conditions. We could have used something like an RCU for this use case, but absent a pressing need for thread safety we'll go the easy way and just mandate that the APIs are to be called when all affected ports are stopped, and document this limitation. This greatly simplifies the `rte_power_monitor`-related code. Signed-off-by: Anatoly Burakov --- Notes: v2: - Add check for stopped queue - Clarified doc message - Added release notes doc/guides/rel_notes/release_21_08.rst | 5 + lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c | 133 ++--- lib/power/rte_power_pmd_mgmt.h | 6 ++ 4 files changed, 67 insertions(+), 80 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index 9d1cfac395..f015c509fc 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -88,6 +88,11 @@ API Changes * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. +* rte_power: The experimental PMD power management API is no longer considered + to be thread safe; all Rx queues affected by the API will now need to be + stopped before making any changes to the power management scheme. 
+ + ABI Changes --- diff --git a/lib/power/meson.build b/lib/power/meson.build index c1097d32f1..4f6a242364 100644 --- a/lib/power/meson.build +++ b/lib/power/meson.build @@ -21,4 +21,7 @@ headers = files( 'rte_power_pmd_mgmt.h', 'rte_power_guest_channel.h', ) +if cc.has_argument('-Wno-cast-qual') +cflags += '-Wno-cast-qual' +endif deps += ['timer', 'ethdev'] diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index db03cbf420..9b95cf1794 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -40,8 +40,6 @@ struct pmd_queue_cfg { /**< Callback mode for this queue */ const struct rte_eth_rxtx_callback *cur_cb; /**< Callback instance */ - volatile bool umwait_in_progress; - /**< are we currently sleeping? */ uint64_t empty_poll_stats; /**< Number of empty polls */ } __rte_cache_aligned; @@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, struct rte_power_monitor_cond pmc; uint16_t ret; - /* -* we might get a cancellation request while being -* inside the callback, in which case the wakeup -* wouldn't work because it would've arrived too early. -* -* to get around this, we notify the other thread that -* we're sleeping, so that it can spin until we're done. -* unsolicited wakeups are perfectly safe. 
-*/ - q_conf->umwait_in_progress = true; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); - - /* check if we need to cancel sleep */ - if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) { - /* use monitoring condition to sleep */ - ret = rte_eth_get_monitor_addr(port_id, qidx, - &pmc); - if (ret == 0) - rte_power_monitor(&pmc, UINT64_MAX); - } - q_conf->umwait_in_progress = false; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); + /* use monitoring condition to sleep */ + ret = rte_eth_get_monitor_addr(port_id, qidx, + &pmc); + if (ret == 0) + rte_power_monitor(&pmc, UINT64_MAX); } } else q_conf->empty_poll_stats = 0; @@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx, return nb_rx; } +static int +queue_stopped(const uint16_t port_id, const uint16_t queue_id) +{ + struct rte_eth_rxq_info qinfo; + + if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) < 0) + return -1; + + return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED; +} + int rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint
[dpdk-dev] [PATCH v2 6/7] power: support monitoring multiple Rx queues
Use the new multi-monitor intrinsic to allow monitoring multiple ethdev Rx queues while entering the energy efficient power state. The multi version will be used unconditionally if supported, and the UMWAIT one will only be used when multi-monitor is not supported by the hardware. Signed-off-by: Anatoly Burakov --- doc/guides/prog_guide/power_man.rst | 9 ++-- lib/power/rte_power_pmd_mgmt.c | 76 - 2 files changed, 80 insertions(+), 5 deletions(-) diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index 38f876466a..defb61bdc4 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number. The "monitor" mode is only supported in the following configurations and scenarios: * If ``rte_cpu_get_intrinsics_support()`` function indicates that + ``rte_power_monitor_multi()`` function is supported by the platform, then + monitoring multiple Ethernet Rx queues for traffic will be supported. + +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only ``rte_power_monitor()`` is supported by the platform, then monitoring will be limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be monitored from a different lcore). -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the - ``rte_power_monitor()`` function is not supported, then monitor mode will not - be supported. +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the + two monitoring functions are supported, then monitor mode will not be supported. * Not all Ethernet devices support monitoring, even if the underlying platform may support the necessary CPU instructions. 
Support for monitoring is diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index 7762cd39b8..aab2d4f1ee 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -155,6 +155,24 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union queue *q) return 0; } +static inline int +get_monitor_addresses(struct pmd_core_cfg *cfg, + struct rte_power_monitor_cond *pmc) +{ + const struct queue_list_entry *qle; + size_t i = 0; + int ret; + + TAILQ_FOREACH(qle, &cfg->head, next) { + struct rte_power_monitor_cond *cur = &pmc[i]; + const union queue *q = &qle->queue; + ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur); + if (ret < 0) + return ret; + } + return 0; +} + static void calc_tsc(void) { @@ -183,6 +201,48 @@ calc_tsc(void) } } +static uint16_t +clb_multiwait(uint16_t port_id, uint16_t qidx, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *addr __rte_unused) +{ + const unsigned int lcore = rte_lcore_id(); + const union queue q = {.portid = port_id, .qid = qidx}; + const bool empty = nb_rx == 0; + struct pmd_core_cfg *q_conf; + + q_conf = &lcore_cfg[lcore]; + + /* early exit */ + if (likely(!empty)) { + q_conf->empty_poll_stats = 0; + } else { + /* do we care about this particular queue? */ + if (!queue_is_power_save(q_conf, &q)) + return nb_rx; + + /* +* we can increment unconditionally here because if there were +* non-empty polls in other queues assigned to this core, we +* dropped the counter to zero anyway. 
+*/ + q_conf->empty_poll_stats++; + if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { + struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS]; + uint16_t ret; + + /* gather all monitoring conditions */ + ret = get_monitor_addresses(q_conf, pmc); + + if (ret == 0) + rte_power_monitor_multi(pmc, + q_conf->n_queues, UINT64_MAX); + } + } + + return nb_rx; +} + static uint16_t clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, uint16_t max_pkts __rte_unused, @@ -348,14 +408,19 @@ static int check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata) { struct rte_power_monitor_cond dummy; + bool multimonitor_supported; /* check if rte_power_monitor is supported */ if (!global_data.intrinsics_support.power_monitor) { RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n"); ret
[dpdk-dev] [PATCH v2 5/7] power: support callbacks for multiple Rx queues
Currently, there is a hard limitation on the PMD power management support that only allows it to support a single queue per lcore. This is not ideal as most DPDK use cases will poll multiple queues per core. The PMD power management mechanism relies on ethdev Rx callbacks, so it is very difficult to implement such support because callbacks are effectively stateless and have no visibility into what the other ethdev devices are doing. This places limitations on what we can do within the framework of Rx callbacks, but the basics of this implementation are as follows: - Replace per-queue structures with per-lcore ones, so that any device polled from the same lcore can share data - Any queue that is going to be polled from a specific lcore has to be added to the list of queues to poll, so that the callback is aware of other queues being polled by the same lcore - Both the empty poll counter and the actual power saving mechanism are shared between all queues polled on a particular lcore, and power saving is only activated when a specially designated "power saving" queue is polled. To put it another way, we have no idea which queue the user will poll in what order, so we rely on them telling us that queue X is the last one in the polling loop, so any power management should happen there. - A new API is added to mark a specific Rx queue as "power saving". Failing to call this API will result in no power management; however, when having only one queue per core it is obvious which queue is the "power saving" one, so things will still work without this new API for use cases that were previously working without it. - The limitation on UMWAIT-based polling is not removed because UMWAIT is incapable of monitoring more than one address. 
Signed-off-by: Anatoly Burakov --- Notes: v2: - Use a TAILQ for queues instead of a static array - Address feedback from Konstantin - Add additional checks for stopped queues doc/guides/prog_guide/power_man.rst| 80 -- doc/guides/rel_notes/release_21_08.rst | 3 + lib/power/rte_power_pmd_mgmt.c | 381 - lib/power/rte_power_pmd_mgmt.h | 34 +++ lib/power/version.map | 3 + 5 files changed, 407 insertions(+), 94 deletions(-) diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index c70ae128ac..38f876466a 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -198,34 +198,48 @@ Ethernet PMD Power Management API Abstract -Existing power management mechanisms require developers -to change application design or change code to make use of it. -The PMD power management API provides a convenient alternative -by utilizing Ethernet PMD RX callbacks, -and triggering power saving whenever empty poll count reaches a certain number. - -Monitor - This power saving scheme will put the CPU into optimized power state - and use the ``rte_power_monitor()`` function - to monitor the Ethernet PMD RX descriptor address, - and wake the CPU up whenever there's new traffic. - -Pause - This power saving scheme will avoid busy polling - by either entering power-optimized sleep state - with ``rte_power_pause()`` function, - or, if it's not available, use ``rte_pause()``. - -Frequency scaling - This power saving scheme will use ``librte_power`` library - functionality to scale the core frequency up/down - depending on traffic volume. - -.. note:: - - Currently, this power management API is limited to mandatory mapping - of 1 queue to 1 core (multiple queues are supported, - but they must be polled from different cores). +Existing power management mechanisms require developers to change application +design or change code to make use of it. 
The PMD power management API provides a +convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering +power saving whenever empty poll count reaches a certain number. + +* Monitor + This power saving scheme will put the CPU into optimized power state and + monitor the Ethernet PMD RX descriptor address, waking the CPU up whenever + there's new traffic. Support for this scheme may not be available on all + platforms, and further limitations may apply (see below). + +* Pause + This power saving scheme will avoid busy polling by either entering + power-optimized sleep state with ``rte_power_pause()`` function, or, if it's + not supported by the underlying platform, use ``rte_pause()``. + +* Frequency scaling + This power saving scheme will use ``librte_power`` library functionality to + scale the core frequency up/down depending on traffic volume. + +The "monitor" mode is only supported in the following configurations and scenarios: + +* If ``rte_cpu_get_intrinsics_support()`` function indicates that + ``rte_power_monitor()`` is supported by the platform, then monitoring w
[dpdk-dev] [PATCH v2 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes
Currently, l3fwd-power enforces the limitation of having one queue per lcore. This is no longer necessary, so remove the limitation, and always mark the last queue in qconf as the power save queue. Signed-off-by: Anatoly Burakov --- examples/l3fwd-power/main.c | 39 +++-- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c index f8dfed1634..3057c06936 100644 --- a/examples/l3fwd-power/main.c +++ b/examples/l3fwd-power/main.c @@ -2498,6 +2498,27 @@ mode_to_str(enum appmode mode) } } +static void +pmd_pmgmt_set_up(unsigned int lcore, uint16_t portid, uint16_t qid, bool last) +{ + int ret; + + ret = rte_power_ethdev_pmgmt_queue_enable(lcore, portid, + qid, pmgmt_type); + if (ret < 0) + rte_exit(EXIT_FAILURE, + "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", + ret, portid); + + if (!last) + return; + ret = rte_power_ethdev_pmgmt_queue_set_power_save(lcore, portid, qid); + if (ret < 0) + rte_exit(EXIT_FAILURE, + "rte_power_ethdev_pmgmt_queue_set_power_save: err=%d, port=%d\n", + ret, portid); +} + int main(int argc, char **argv) { @@ -2723,12 +2744,6 @@ main(int argc, char **argv) printf("\nInitializing rx queues on lcore %u ... 
", lcore_id ); fflush(stdout); - /* PMD power management mode can only do 1 queue per core */ - if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) { - rte_exit(EXIT_FAILURE, - "In PMD power management mode, only one queue per lcore is allowed\n"); - } - /* init RX queues */ for(queue = 0; queue < qconf->n_rx_queue; ++queue) { struct rte_eth_rxconf rxq_conf; @@ -2767,15 +2782,9 @@ main(int argc, char **argv) "Fail to add ptype cb\n"); } - if (app_mode == APP_MODE_PMD_MGMT) { - ret = rte_power_ethdev_pmgmt_queue_enable( - lcore_id, portid, queueid, - pmgmt_type); - if (ret < 0) - rte_exit(EXIT_FAILURE, - "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", - ret, portid); - } + if (app_mode == APP_MODE_PMD_MGMT) + pmd_pmgmt_set_up(lcore_id, portid, queueid, + queue == (qconf->n_rx_queue - 1)); } } -- 2.25.1
[dpdk-dev] [PATCH v3 0/7] Enhancements for PMD power management
This patchset introduces several changes related to PMD power management: - Changed monitoring intrinsics to use callbacks as a comparison function, based on previous patchset [1] but incorporating feedback [2] - this hopefully will make it possible to add support for .get_monitor_addr in virtio - Add a new intrinsic to monitor multiple addresses, based on RTM instruction set and the TPAUSE instruction - Add support for PMD power management on multiple queues, as well as all accompanying infrastructure and example apps changes v3: - Moved some doc updates to NIC features list v2: - Changed check inversion to callbacks - Addressed feedback from Konstantin - Added doc updates where necessary [1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=* [2] http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274 Anatoly Burakov (7): power_intrinsics: use callbacks for comparison net/af_xdp: add power monitor support eal: add power monitor for multiple events power: remove thread safety from PMD power API's power: support callbacks for multiple Rx queues power: support monitoring multiple Rx queues l3fwd-power: support multiqueue in PMD pmgmt modes doc/guides/nics/features.rst | 10 + doc/guides/prog_guide/power_man.rst | 78 ++- doc/guides/rel_notes/release_21_08.rst| 11 + drivers/event/dlb2/dlb2.c | 16 +- drivers/net/af_xdp/rte_eth_af_xdp.c | 33 + drivers/net/i40e/i40e_rxtx.c | 19 +- drivers/net/iavf/iavf_rxtx.c | 19 +- drivers/net/ice/ice_rxtx.c| 19 +- drivers/net/ixgbe/ixgbe_rxtx.c| 19 +- drivers/net/mlx5/mlx5_rx.c| 16 +- examples/l3fwd-power/main.c | 39 +- lib/eal/arm/rte_power_intrinsics.c| 11 + lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 64 +- lib/eal/ppc/rte_power_intrinsics.c| 11 + lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 78 ++- lib/power/meson.build | 3 + 
lib/power/rte_power_pmd_mgmt.c| 574 +- lib/power/rte_power_pmd_mgmt.h| 40 ++ lib/power/version.map | 3 + 22 files changed, 846 insertions(+), 224 deletions(-) -- 2.25.1
[dpdk-dev] [PATCH v3 1/7] power_intrinsics: use callbacks for comparison
Previously, the semantics of power monitor were such that we were checking current value against the expected value, and if they matched, then the sleep was aborted. This is somewhat inflexible, because it only allowed us to check for a specific value. This commit replaces the comparison with a user callback mechanism, so that any PMD (or other code) using `rte_power_monitor()` can define their own comparison semantics and decision making on how to detect the need to abort the entering of power optimized state. Existing implementations are adjusted to follow the new semantics. Suggested-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov --- Notes: v2: - Use callback mechanism for more flexibility - Address feedback from Konstantin doc/guides/rel_notes/release_21_08.rst| 1 + drivers/event/dlb2/dlb2.c | 16 -- drivers/net/i40e/i40e_rxtx.c | 19 drivers/net/iavf/iavf_rxtx.c | 19 drivers/net/ice/ice_rxtx.c| 19 drivers/net/ixgbe/ixgbe_rxtx.c| 19 drivers/net/mlx5/mlx5_rx.c| 16 -- .../include/generic/rte_power_intrinsics.h| 29 ++- lib/eal/x86/rte_power_intrinsics.c| 9 ++ 9 files changed, 106 insertions(+), 41 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index a6ecfdf3ce..c84ac280f5 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -84,6 +84,7 @@ API Changes Also, make sure to start the actual text at the margin. === +* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. 
ABI Changes --- diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c index eca183753f..14dfac257c 100644 --- a/drivers/event/dlb2/dlb2.c +++ b/drivers/event/dlb2/dlb2.c @@ -3154,6 +3154,15 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num) } } +#define CLB_MASK_IDX 0 +#define CLB_VAL_IDX 1 +static int +dlb2_monitor_callback(const uint64_t val, const uint64_t opaque[4]) +{ + /* abort if the value matches */ + return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 : 0; +} + static inline int dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, struct dlb2_eventdev_port *ev_port, @@ -3194,8 +3203,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, expected_value = 0; pmc.addr = monitor_addr; - pmc.val = expected_value; - pmc.mask = qe_mask.raw_qe[1]; + /* store expected value and comparison mask in opaque data */ + pmc.opaque[CLB_VAL_IDX] = expected_value; + pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1]; + /* set up callback */ + pmc.fn = dlb2_monitor_callback; pmc.size = sizeof(uint64_t); rte_power_monitor(&pmc, timeout + start_ticks); diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c index 6c58decece..45f3fbf4ec 100644 --- a/drivers/net/i40e/i40e_rxtx.c +++ b/drivers/net/i40e/i40e_rxtx.c @@ -81,6 +81,17 @@ #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \ (PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK) +static int +i40e_monitor_callback(const uint64_t value, const uint64_t arg[4] __rte_unused) +{ + const uint64_t m = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* +* we expect the DD bit to be set to 1 if this descriptor was already +* written to. +*/ + return (value & m) == m ? 
-1 : 0; +} + int i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) { @@ -93,12 +104,8 @@ i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) /* watch for changes in status bit */ pmc->addr = &rxdp->wb.qword1.status_error_len; - /* -* we expect the DD bit to be set to 1 if this descriptor was already -* written to. -*/ - pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); - pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* comparison callback */ + pmc->fn = i40e_monitor_callback; /* registers are 64-bit */ pmc->size = sizeof(uint64_t); diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index 0361af0d85..6e12ecce07 100644 --- a/drivers/net/iavf/iavf_rxtx.c +++ b/drivers/net/iavf/iavf_rxtx.c @@ -57,6 +57,17 @@ iavf_proto_xtr_type_to_rxdid(uint8_t flex_type) rxdid_map[flex_type] : IAVF_RXDID_COMMS_OVS_1; } +static int +iavf_monitor_callback(const uint64_t value, const uint64_t arg[4] __rte_unus
[dpdk-dev] [PATCH v3 2/7] net/af_xdp: add power monitor support
Implement support for .get_monitor_addr in AF_XDP driver. Signed-off-by: Anatoly Burakov --- Notes: v2: - Rewrite using the callback mechanism drivers/net/af_xdp/rte_eth_af_xdp.c | 33 + 1 file changed, 33 insertions(+) diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c index eb5660a3dc..8b9c89c3e8 100644 --- a/drivers/net/af_xdp/rte_eth_af_xdp.c +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "compat.h" @@ -788,6 +789,37 @@ eth_dev_configure(struct rte_eth_dev *dev) return 0; } +#define CLB_VAL_IDX 0 +static int +eth_monitor_callback(const uint64_t value, const uint64_t opaque[4]) +{ + const uint64_t v = opaque[CLB_VAL_IDX]; + const uint64_t m = (uint32_t)~0; + + /* if the value has changed, abort entering power optimized state */ + return (value & m) == v ? 0 : -1; +} + +static int +eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) +{ + struct pkt_rx_queue *rxq = rx_queue; + unsigned int *prod = rxq->rx.producer; + const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */ + + /* watch for changes in producer ring */ + pmc->addr = (void*)prod; + + /* store current value */ + pmc->opaque[CLB_VAL_IDX] = cur_val; + pmc->fn = eth_monitor_callback; + + /* AF_XDP producer ring index is 32-bit */ + pmc->size = sizeof(uint32_t); + + return 0; +} + static int eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) { @@ -1448,6 +1480,7 @@ static const struct eth_dev_ops ops = { .link_update = eth_link_update, .stats_get = eth_stats_get, .stats_reset = eth_stats_reset, + .get_monitor_addr = eth_get_monitor_addr }; /** parse busy_budget argument */ -- 2.25.1
[dpdk-dev] [PATCH v3 3/7] eal: add power monitor for multiple events
Use RTM and WAITPKG instructions to perform a wait-for-writes similar to what UMWAIT does, but without the limitation of having to listen for just one event. This works because the optimized power state used by the TPAUSE instruction will cause a wake up on RTM transaction abort, so if we add the addresses we're interested in to the read-set, any write to those addresses will wake us up. Signed-off-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov --- Notes: v2: - Adapt to callback mechanism doc/guides/rel_notes/release_21_08.rst| 2 + lib/eal/arm/rte_power_intrinsics.c| 11 +++ lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 35 ++ lib/eal/ppc/rte_power_intrinsics.c| 11 +++ lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 69 +++ 8 files changed, 135 insertions(+) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index c84ac280f5..9d1cfac395 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -55,6 +55,8 @@ New Features Also, make sure to start the actual text at the margin. === +* eal: added ``rte_power_monitor_multi`` to support waiting for multiple events. 
+ Removed Items - diff --git a/lib/eal/arm/rte_power_intrinsics.c b/lib/eal/arm/rte_power_intrinsics.c index e83f04072a..78f55b7203 100644 --- a/lib/eal/arm/rte_power_intrinsics.c +++ b/lib/eal/arm/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp) +{ + RTE_SET_USED(pmc); + RTE_SET_USED(num); + RTE_SET_USED(tsc_timestamp); + + return -ENOTSUP; +} diff --git a/lib/eal/include/generic/rte_cpuflags.h b/lib/eal/include/generic/rte_cpuflags.h index 28a5aecde8..d35551e931 100644 --- a/lib/eal/include/generic/rte_cpuflags.h +++ b/lib/eal/include/generic/rte_cpuflags.h @@ -24,6 +24,8 @@ struct rte_cpu_intrinsics { /**< indicates support for rte_power_monitor function */ uint32_t power_pause : 1; /**< indicates support for rte_power_pause function */ + uint32_t power_monitor_multi : 1; + /**< indicates support for rte_power_monitor_multi function */ }; /** diff --git a/lib/eal/include/generic/rte_power_intrinsics.h b/lib/eal/include/generic/rte_power_intrinsics.h index 046667ade6..877fb282cb 100644 --- a/lib/eal/include/generic/rte_power_intrinsics.h +++ b/lib/eal/include/generic/rte_power_intrinsics.h @@ -124,4 +124,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id); __rte_experimental int rte_power_pause(const uint64_t tsc_timestamp); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Monitor a set of addresses for changes. This will cause the CPU to enter an + * architecture-defined optimized power state until either one of the specified + * memory addresses is written to, a certain TSC timestamp is reached, or other + * reasons cause the CPU to wake up. + * + * Additionally, `expected` 64-bit values and 64-bit masks are provided. 
If + * mask is non-zero, the current value pointed to by the `p` pointer will be + * checked against the expected value, and if they do not match, the entering of + * optimized power state may be aborted. + * + * @warning It is responsibility of the user to check if this function is + * supported at runtime using `rte_cpu_get_intrinsics_support()` API call. + * Failing to do so may result in an illegal CPU instruction error. + * + * @param pmc + * An array of monitoring condition structures. + * @param num + * Length of the `pmc` array. + * @param tsc_timestamp + * Maximum TSC timestamp to wait for. Note that the wait behavior is + * architecture-dependent. + * + * @return + * 0 on success + * -EINVAL on invalid parameters + * -ENOTSUP if unsupported + */ +__rte_experimental +int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp); + #endif /* _RTE_POWER_INTRINSIC_H_ */ diff --git a/lib/eal/ppc/rte_power_intrinsics.c b/lib/eal/ppc/rte_power_intrinsics.c index 7fc9586da7..f00b58ade5 100644 --- a/lib/eal/ppc/rte_power_intrinsics.c +++ b/lib/eal/ppc/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp) +{ + RTE_SET_USED(pmc); + RTE_SET_USED(num); +
[dpdk-dev] [PATCH v3 4/7] power: remove thread safety from PMD power API's
Currently, we expect that only one callback can be active at any given moment, for a particular queue configuration, which is relatively easy to implement in a thread-safe way. However, we're about to add support for multiple queues per lcore, which will greatly increase the possibility of various race conditions. We could have used something like an RCU for this use case, but absent a pressing need for thread safety we'll go the easy way and just mandate that the APIs are to be called when all affected ports are stopped, and document this limitation. This greatly simplifies the `rte_power_monitor`-related code. Signed-off-by: Anatoly Burakov --- Notes: v2: - Add check for stopped queue - Clarified doc message - Added release notes doc/guides/rel_notes/release_21_08.rst | 5 + lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c | 133 ++--- lib/power/rte_power_pmd_mgmt.h | 6 ++ 4 files changed, 67 insertions(+), 80 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index 9d1cfac395..f015c509fc 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -88,6 +88,11 @@ API Changes * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. +* rte_power: The experimental PMD power management API is no longer considered + to be thread safe; all Rx queues affected by the API will now need to be + stopped before making any changes to the power management scheme. 
+ + ABI Changes --- diff --git a/lib/power/meson.build b/lib/power/meson.build index c1097d32f1..4f6a242364 100644 --- a/lib/power/meson.build +++ b/lib/power/meson.build @@ -21,4 +21,7 @@ headers = files( 'rte_power_pmd_mgmt.h', 'rte_power_guest_channel.h', ) +if cc.has_argument('-Wno-cast-qual') +cflags += '-Wno-cast-qual' +endif deps += ['timer', 'ethdev'] diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index db03cbf420..9b95cf1794 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -40,8 +40,6 @@ struct pmd_queue_cfg { /**< Callback mode for this queue */ const struct rte_eth_rxtx_callback *cur_cb; /**< Callback instance */ - volatile bool umwait_in_progress; - /**< are we currently sleeping? */ uint64_t empty_poll_stats; /**< Number of empty polls */ } __rte_cache_aligned; @@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, struct rte_power_monitor_cond pmc; uint16_t ret; - /* -* we might get a cancellation request while being -* inside the callback, in which case the wakeup -* wouldn't work because it would've arrived too early. -* -* to get around this, we notify the other thread that -* we're sleeping, so that it can spin until we're done. -* unsolicited wakeups are perfectly safe. 
-*/ - q_conf->umwait_in_progress = true; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); - - /* check if we need to cancel sleep */ - if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) { - /* use monitoring condition to sleep */ - ret = rte_eth_get_monitor_addr(port_id, qidx, - &pmc); - if (ret == 0) - rte_power_monitor(&pmc, UINT64_MAX); - } - q_conf->umwait_in_progress = false; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); + /* use monitoring condition to sleep */ + ret = rte_eth_get_monitor_addr(port_id, qidx, + &pmc); + if (ret == 0) + rte_power_monitor(&pmc, UINT64_MAX); } } else q_conf->empty_poll_stats = 0; @@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx, return nb_rx; } +static int +queue_stopped(const uint16_t port_id, const uint16_t queue_id) +{ + struct rte_eth_rxq_info qinfo; + + if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) < 0) + return -1; + + return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED; +} + int rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint
[dpdk-dev] [PATCH v3 5/7] power: support callbacks for multiple Rx queues
Currently, there is a hard limitation on the PMD power management support that only allows it to support a single queue per lcore. This is not ideal as most DPDK use cases will poll multiple queues per core. The PMD power management mechanism relies on ethdev Rx callbacks, so it is very difficult to implement such support because callbacks are effectively stateless and have no visibility into what the other ethdev devices are doing. This places limitations on what we can do within the framework of Rx callbacks, but the basics of this implementation are as follows: - Replace per-queue structures with per-lcore ones, so that any device polled from the same lcore can share data - Any queue that is going to be polled from a specific lcore has to be added to the list of queues to poll, so that the callback is aware of other queues being polled by the same lcore - Both the empty poll counter and the actual power saving mechanism are shared between all queues polled on a particular lcore, and power saving is only activated when a specially designated "power saving" queue is polled. To put it another way, we have no idea which queue the user will poll in what order, so we rely on them telling us that queue X is the last one in the polling loop, so any power management should happen there. - A new API is added to mark a specific Rx queue as "power saving". Failing to call this API will result in no power management; however, when having only one queue per core it is obvious which queue is the "power saving" one, so things will still work without this new API for use cases that were previously working without it. - The limitation on UMWAIT-based polling is not removed because UMWAIT is incapable of monitoring more than one address. Also, while we're at it, update and improve the docs. 
Signed-off-by: Anatoly Burakov --- Notes: v3: - Move the list of supported NICs to NIC feature table v2: - Use a TAILQ for queues instead of a static array - Address feedback from Konstantin - Add additional checks for stopped queues doc/guides/nics/features.rst | 10 + doc/guides/prog_guide/power_man.rst| 75 +++-- doc/guides/rel_notes/release_21_08.rst | 3 + lib/power/rte_power_pmd_mgmt.c | 381 - lib/power/rte_power_pmd_mgmt.h | 34 +++ lib/power/version.map | 3 + 6 files changed, 412 insertions(+), 94 deletions(-) diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst index 403c2b03a3..a96e12d155 100644 --- a/doc/guides/nics/features.rst +++ b/doc/guides/nics/features.rst @@ -912,6 +912,16 @@ Supports to get Rx/Tx packet burst mode information. * **[implements] eth_dev_ops**: ``rx_burst_mode_get``, ``tx_burst_mode_get``. * **[related] API**: ``rte_eth_rx_burst_mode_get()``, ``rte_eth_tx_burst_mode_get()``. +.. _nic_features_get_monitor_addr: + +PMD power management using monitor addresses + + +Supports getting a monitoring condition to use together with Ethernet PMD power +management (see :doc:`../prog_guide/power_man` for more details). + +* **[implements] eth_dev_ops**: ``get_monitor_addr`` + .. _nic_features_other: Other dev ops not represented by a Feature diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index c70ae128ac..fac2c19516 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -198,34 +198,41 @@ Ethernet PMD Power Management API Abstract -Existing power management mechanisms require developers -to change application design or change code to make use of it. -The PMD power management API provides a convenient alternative -by utilizing Ethernet PMD RX callbacks, -and triggering power saving whenever empty poll count reaches a certain number. 
- -Monitor - This power saving scheme will put the CPU into optimized power state - and use the ``rte_power_monitor()`` function - to monitor the Ethernet PMD RX descriptor address, - and wake the CPU up whenever there's new traffic. - -Pause - This power saving scheme will avoid busy polling - by either entering power-optimized sleep state - with ``rte_power_pause()`` function, - or, if it's not available, use ``rte_pause()``. - -Frequency scaling - This power saving scheme will use ``librte_power`` library - functionality to scale the core frequency up/down - depending on traffic volume. - -.. note:: - - Currently, this power management API is limited to mandatory mapping - of 1 queue to 1 core (multiple queues are supported, - but they must be polled from different cores). +Existing power management mechanisms require developers to change application +design or change code to make use of it. The PMD power management API provides a +convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering +power savin
[dpdk-dev] [PATCH v3 6/7] power: support monitoring multiple Rx queues
Use the new multi-monitor intrinsic to allow monitoring multiple ethdev Rx queues while entering the energy efficient power state. The multi version will be used unconditionally if supported, and the UMWAIT one will only be used when multi-monitor is not supported by the hardware. Signed-off-by: Anatoly Burakov --- doc/guides/prog_guide/power_man.rst | 9 ++-- lib/power/rte_power_pmd_mgmt.c | 76 - 2 files changed, 80 insertions(+), 5 deletions(-) diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index fac2c19516..3245a5ebed 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number. The "monitor" mode is only supported in the following configurations and scenarios: * If ``rte_cpu_get_intrinsics_support()`` function indicates that + ``rte_power_monitor_multi()`` function is supported by the platform, then + monitoring multiple Ethernet Rx queues for traffic will be supported. + +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only ``rte_power_monitor()`` is supported by the platform, then monitoring will be limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be monitored from a different lcore). -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the - ``rte_power_monitor()`` function is not supported, then monitor mode will not - be supported. +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the + two monitoring functions are supported, then monitor mode will not be supported. * Not all Ethernet devices support monitoring, even if the underlying platform may support the necessary CPU instructions. 
Please refer to diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index 7762cd39b8..aab2d4f1ee 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -155,6 +155,24 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union queue *q) return 0; } +static inline int +get_monitor_addresses(struct pmd_core_cfg *cfg, + struct rte_power_monitor_cond *pmc) +{ + const struct queue_list_entry *qle; + size_t i = 0; + int ret; + + TAILQ_FOREACH(qle, &cfg->head, next) { + struct rte_power_monitor_cond *cur = &pmc[i]; + const union queue *q = &qle->queue; + ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur); + if (ret < 0) + return ret; + } + return 0; +} + static void calc_tsc(void) { @@ -183,6 +201,48 @@ calc_tsc(void) } } +static uint16_t +clb_multiwait(uint16_t port_id, uint16_t qidx, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *addr __rte_unused) +{ + const unsigned int lcore = rte_lcore_id(); + const union queue q = {.portid = port_id, .qid = qidx}; + const bool empty = nb_rx == 0; + struct pmd_core_cfg *q_conf; + + q_conf = &lcore_cfg[lcore]; + + /* early exit */ + if (likely(!empty)) { + q_conf->empty_poll_stats = 0; + } else { + /* do we care about this particular queue? */ + if (!queue_is_power_save(q_conf, &q)) + return nb_rx; + + /* +* we can increment unconditionally here because if there were +* non-empty polls in other queues assigned to this core, we +* dropped the counter to zero anyway. 
+*/ + q_conf->empty_poll_stats++; + if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { + struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS]; + uint16_t ret; + + /* gather all monitoring conditions */ + ret = get_monitor_addresses(q_conf, pmc); + + if (ret == 0) + rte_power_monitor_multi(pmc, + q_conf->n_queues, UINT64_MAX); + } + } + + return nb_rx; +} + static uint16_t clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, uint16_t max_pkts __rte_unused, @@ -348,14 +408,19 @@ static int check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata) { struct rte_power_monitor_cond dummy; + bool multimonitor_supported; /* check if rte_power_monitor is supported */ if (!global_data.intrinsics_support.power_monitor) { RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n"); ret
[dpdk-dev] [PATCH v3 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes
Currently, l3fwd-power enforces the limitation of having one queue per lcore. This is no longer necessary, so remove the limitation, and always mark the last queue in qconf as the power save queue. Signed-off-by: Anatoly Burakov --- examples/l3fwd-power/main.c | 39 +++-- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c index f8dfed1634..3057c06936 100644 --- a/examples/l3fwd-power/main.c +++ b/examples/l3fwd-power/main.c @@ -2498,6 +2498,27 @@ mode_to_str(enum appmode mode) } } +static void +pmd_pmgmt_set_up(unsigned int lcore, uint16_t portid, uint16_t qid, bool last) +{ + int ret; + + ret = rte_power_ethdev_pmgmt_queue_enable(lcore, portid, + qid, pmgmt_type); + if (ret < 0) + rte_exit(EXIT_FAILURE, + "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", + ret, portid); + + if (!last) + return; + ret = rte_power_ethdev_pmgmt_queue_set_power_save(lcore, portid, qid); + if (ret < 0) + rte_exit(EXIT_FAILURE, + "rte_power_ethdev_pmgmt_queue_set_power_save: err=%d, port=%d\n", + ret, portid); +} + int main(int argc, char **argv) { @@ -2723,12 +2744,6 @@ main(int argc, char **argv) printf("\nInitializing rx queues on lcore %u ... 
", lcore_id ); fflush(stdout); - /* PMD power management mode can only do 1 queue per core */ - if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) { - rte_exit(EXIT_FAILURE, - "In PMD power management mode, only one queue per lcore is allowed\n"); - } - /* init RX queues */ for(queue = 0; queue < qconf->n_rx_queue; ++queue) { struct rte_eth_rxconf rxq_conf; @@ -2767,15 +2782,9 @@ main(int argc, char **argv) "Fail to add ptype cb\n"); } - if (app_mode == APP_MODE_PMD_MGMT) { - ret = rte_power_ethdev_pmgmt_queue_enable( - lcore_id, portid, queueid, - pmgmt_type); - if (ret < 0) - rte_exit(EXIT_FAILURE, - "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", - ret, portid); - } + if (app_mode == APP_MODE_PMD_MGMT) + pmd_pmgmt_set_up(lcore_id, portid, queueid, + queue == (qconf->n_rx_queue - 1)); } } -- 2.25.1
[dpdk-dev] [PATCH v4 0/7] Enhancements for PMD power management
This patchset introduces several changes related to PMD power management: - Changed monitoring intrinsics to use callbacks as a comparison function, based on previous patchset [1] but incorporating feedback [2] - this hopefully will make it possible to add support for .get_monitor_addr in virtio - Add a new intrinsic to monitor multiple addresses, based on RTM instruction set and the TPAUSE instruction - Add support for PMD power management on multiple queues, as well as all accompanying infrastructure and example apps changes v4: - Replaced raw number with a macro - Fixed all the bugs found by Konstantin - Some other minor corrections v3: - Moved some doc updates to NIC features list v2: - Changed check inversion to callbacks - Addressed feedback from Konstantin - Added doc updates where necessary [1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=* [2] http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274 Anatoly Burakov (7): power_intrinsics: use callbacks for comparison net/af_xdp: add power monitor support eal: add power monitor for multiple events power: remove thread safety from PMD power API's power: support callbacks for multiple Rx queues power: support monitoring multiple Rx queues l3fwd-power: support multiqueue in PMD pmgmt modes doc/guides/nics/features.rst | 10 + doc/guides/prog_guide/power_man.rst | 78 ++- doc/guides/rel_notes/release_21_08.rst| 11 + drivers/event/dlb2/dlb2.c | 17 +- drivers/net/af_xdp/rte_eth_af_xdp.c | 34 + drivers/net/i40e/i40e_rxtx.c | 20 +- drivers/net/iavf/iavf_rxtx.c | 20 +- drivers/net/ice/ice_rxtx.c| 20 +- drivers/net/ixgbe/ixgbe_rxtx.c| 20 +- drivers/net/mlx5/mlx5_rx.c| 17 +- examples/l3fwd-power/main.c | 39 +- lib/eal/arm/rte_power_intrinsics.c| 11 + lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 68 +- lib/eal/ppc/rte_power_intrinsics.c| 11 + lib/eal/version.map | 3 + 
lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 90 ++- lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c| 582 +- lib/power/rte_power_pmd_mgmt.h| 40 ++ lib/power/version.map | 3 + 22 files changed, 874 insertions(+), 227 deletions(-) -- 2.25.1
[dpdk-dev] [PATCH v4 1/7] power_intrinsics: use callbacks for comparison
Previously, the semantics of power monitor were such that we were checking current value against the expected value, and if they matched, then the sleep was aborted. This is somewhat inflexible, because it only allowed us to check for a specific value in a specific way. This commit replaces the comparison with a user callback mechanism, so that any PMD (or other code) using `rte_power_monitor()` can define their own comparison semantics and decision making on how to detect the need to abort the entering of power optimized state. Existing implementations are adjusted to follow the new semantics. Suggested-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov Acked-by: Konstantin Ananyev --- Notes: v4: - Return error if callback is set to NULL - Replace raw number with a macro in monitor condition opaque data v2: - Use callback mechanism for more flexibility - Address feedback from Konstantin doc/guides/rel_notes/release_21_08.rst| 1 + drivers/event/dlb2/dlb2.c | 17 -- drivers/net/i40e/i40e_rxtx.c | 20 +++ drivers/net/iavf/iavf_rxtx.c | 20 +++ drivers/net/ice/ice_rxtx.c| 20 +++ drivers/net/ixgbe/ixgbe_rxtx.c| 20 +++ drivers/net/mlx5/mlx5_rx.c| 17 -- .../include/generic/rte_power_intrinsics.h| 33 +++ lib/eal/x86/rte_power_intrinsics.c| 17 +- 9 files changed, 121 insertions(+), 44 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index a6ecfdf3ce..c84ac280f5 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -84,6 +84,7 @@ API Changes Also, make sure to start the actual text at the margin. === +* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. 
ABI Changes --- diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c index eca183753f..252bbd8d5e 100644 --- a/drivers/event/dlb2/dlb2.c +++ b/drivers/event/dlb2/dlb2.c @@ -3154,6 +3154,16 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num) } } +#define CLB_MASK_IDX 0 +#define CLB_VAL_IDX 1 +static int +dlb2_monitor_callback(const uint64_t val, + const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]) +{ + /* abort if the value matches */ + return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 : 0; +} + static inline int dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, struct dlb2_eventdev_port *ev_port, @@ -3194,8 +3204,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, expected_value = 0; pmc.addr = monitor_addr; - pmc.val = expected_value; - pmc.mask = qe_mask.raw_qe[1]; + /* store expected value and comparison mask in opaque data */ + pmc.opaque[CLB_VAL_IDX] = expected_value; + pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1]; + /* set up callback */ + pmc.fn = dlb2_monitor_callback; pmc.size = sizeof(uint64_t); rte_power_monitor(&pmc, timeout + start_ticks); diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c index 6c58decece..081682f88b 100644 --- a/drivers/net/i40e/i40e_rxtx.c +++ b/drivers/net/i40e/i40e_rxtx.c @@ -81,6 +81,18 @@ #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \ (PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK) +static int +i40e_monitor_callback(const uint64_t value, + const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ] __rte_unused) +{ + const uint64_t m = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* +* we expect the DD bit to be set to 1 if this descriptor was already +* written to. +*/ + return (value & m) == m ? 
-1 : 0; +} + int i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) { @@ -93,12 +105,8 @@ i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) /* watch for changes in status bit */ pmc->addr = &rxdp->wb.qword1.status_error_len; - /* -* we expect the DD bit to be set to 1 if this descriptor was already -* written to. -*/ - pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); - pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* comparison callback */ + pmc->fn = i40e_monitor_callback; /* registers are 64-bit */ pmc->size = sizeof(uint64_t); diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index 0361af0d85..7ed196ec22 100644 --- a/drivers/net/iavf/iavf_rxtx.c +++ b/drivers/net/iavf/iavf_rx
[dpdk-dev] [PATCH v4 2/7] net/af_xdp: add power monitor support
Implement support for .get_monitor_addr in AF_XDP driver. Signed-off-by: Anatoly Burakov --- Notes: v2: - Rewrite using the callback mechanism drivers/net/af_xdp/rte_eth_af_xdp.c | 34 + 1 file changed, 34 insertions(+) diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c index eb5660a3dc..7830d0c23a 100644 --- a/drivers/net/af_xdp/rte_eth_af_xdp.c +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "compat.h" @@ -788,6 +789,38 @@ eth_dev_configure(struct rte_eth_dev *dev) return 0; } +#define CLB_VAL_IDX 0 +static int +eth_monitor_callback(const uint64_t value, + const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]) +{ + const uint64_t v = opaque[CLB_VAL_IDX]; + const uint64_t m = (uint32_t)~0; + + /* if the value has changed, abort entering power optimized state */ + return (value & m) == v ? 0 : -1; +} + +static int +eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) +{ + struct pkt_rx_queue *rxq = rx_queue; + unsigned int *prod = rxq->rx.producer; + const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */ + + /* watch for changes in producer ring */ + pmc->addr = (void*)prod; + + /* store current value */ + pmc->opaque[CLB_VAL_IDX] = cur_val; + pmc->fn = eth_monitor_callback; + + /* AF_XDP producer ring index is 32-bit */ + pmc->size = sizeof(uint32_t); + + return 0; +} + static int eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) { @@ -1448,6 +1481,7 @@ static const struct eth_dev_ops ops = { .link_update = eth_link_update, .stats_get = eth_stats_get, .stats_reset = eth_stats_reset, + .get_monitor_addr = eth_get_monitor_addr }; /** parse busy_budget argument */ -- 2.25.1
[dpdk-dev] [PATCH v4 3/7] eal: add power monitor for multiple events
Use RTM and WAITPKG instructions to perform a wait-for-writes similar to what UMWAIT does, but without the limitation of having to listen for just one event. This works because the optimized power state used by the TPAUSE instruction will cause a wake up on RTM transaction abort, so if we add the addresses we're interested in to the read-set, any write to those addresses will wake us up. Signed-off-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov --- Notes: v4: - Fixed bugs in accessing the monitor condition - Abort on any monitor condition not having a defined callback v2: - Adapt to callback mechanism doc/guides/rel_notes/release_21_08.rst| 2 + lib/eal/arm/rte_power_intrinsics.c| 11 +++ lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 35 + lib/eal/ppc/rte_power_intrinsics.c| 11 +++ lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 73 +++ 8 files changed, 139 insertions(+) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index c84ac280f5..9d1cfac395 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -55,6 +55,8 @@ New Features Also, make sure to start the actual text at the margin. === +* eal: added ``rte_power_monitor_multi`` to support waiting for multiple events. 
+ Removed Items - diff --git a/lib/eal/arm/rte_power_intrinsics.c b/lib/eal/arm/rte_power_intrinsics.c index e83f04072a..78f55b7203 100644 --- a/lib/eal/arm/rte_power_intrinsics.c +++ b/lib/eal/arm/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp) +{ + RTE_SET_USED(pmc); + RTE_SET_USED(num); + RTE_SET_USED(tsc_timestamp); + + return -ENOTSUP; +} diff --git a/lib/eal/include/generic/rte_cpuflags.h b/lib/eal/include/generic/rte_cpuflags.h index 28a5aecde8..d35551e931 100644 --- a/lib/eal/include/generic/rte_cpuflags.h +++ b/lib/eal/include/generic/rte_cpuflags.h @@ -24,6 +24,8 @@ struct rte_cpu_intrinsics { /**< indicates support for rte_power_monitor function */ uint32_t power_pause : 1; /**< indicates support for rte_power_pause function */ + uint32_t power_monitor_multi : 1; + /**< indicates support for rte_power_monitor_multi function */ }; /** diff --git a/lib/eal/include/generic/rte_power_intrinsics.h b/lib/eal/include/generic/rte_power_intrinsics.h index c9aa52a86d..04e8c2ab37 100644 --- a/lib/eal/include/generic/rte_power_intrinsics.h +++ b/lib/eal/include/generic/rte_power_intrinsics.h @@ -128,4 +128,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id); __rte_experimental int rte_power_pause(const uint64_t tsc_timestamp); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Monitor a set of addresses for changes. This will cause the CPU to enter an + * architecture-defined optimized power state until either one of the specified + * memory addresses is written to, a certain TSC timestamp is reached, or other + * reasons cause the CPU to wake up. + * + * Additionally, `expected` 64-bit values and 64-bit masks are provided. 
If + * mask is non-zero, the current value pointed to by the `p` pointer will be + * checked against the expected value, and if they do not match, the entering of + * optimized power state may be aborted. + * + * @warning It is responsibility of the user to check if this function is + * supported at runtime using `rte_cpu_get_intrinsics_support()` API call. + * Failing to do so may result in an illegal CPU instruction error. + * + * @param pmc + * An array of monitoring condition structures. + * @param num + * Length of the `pmc` array. + * @param tsc_timestamp + * Maximum TSC timestamp to wait for. Note that the wait behavior is + * architecture-dependent. + * + * @return + * 0 on success + * -EINVAL on invalid parameters + * -ENOTSUP if unsupported + */ +__rte_experimental +int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp); + #endif /* _RTE_POWER_INTRINSIC_H_ */ diff --git a/lib/eal/ppc/rte_power_intrinsics.c b/lib/eal/ppc/rte_power_intrinsics.c index 7fc9586da7..f00b58ade5 100644 --- a/lib/eal/ppc/rte_power_intrinsics.c +++ b/lib/eal/ppc/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], +
[dpdk-dev] [PATCH v4 4/7] power: remove thread safety from PMD power API's
Currently, we expect that only one callback can be active at any given moment, for a particular queue configuration, which is relatively easy to implement in a thread-safe way. However, we're about to add support for multiple queues per lcore, which will greatly increase the possibility of various race conditions. We could have used something like an RCU for this use case, but absent of a pressing need for thread safety we'll go the easy way and just mandate that the API's are to be called when all affected ports are stopped, and document this limitation. This greatly simplifies the `rte_power_monitor`-related code. Signed-off-by: Anatoly Burakov --- Notes: v2: - Add check for stopped queue - Clarified doc message - Added release notes doc/guides/rel_notes/release_21_08.rst | 5 + lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c | 133 ++--- lib/power/rte_power_pmd_mgmt.h | 6 ++ 4 files changed, 67 insertions(+), 80 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index 9d1cfac395..f015c509fc 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -88,6 +88,11 @@ API Changes * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. +* rte_power: The experimental PMD power management API is no longer considered + to be thread safe; all Rx queues affected by the API will now need to be + stopped before making any changes to the power management scheme. 
+ + ABI Changes --- diff --git a/lib/power/meson.build b/lib/power/meson.build index c1097d32f1..4f6a242364 100644 --- a/lib/power/meson.build +++ b/lib/power/meson.build @@ -21,4 +21,7 @@ headers = files( 'rte_power_pmd_mgmt.h', 'rte_power_guest_channel.h', ) +if cc.has_argument('-Wno-cast-qual') +cflags += '-Wno-cast-qual' +endif deps += ['timer', 'ethdev'] diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index db03cbf420..9b95cf1794 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -40,8 +40,6 @@ struct pmd_queue_cfg { /**< Callback mode for this queue */ const struct rte_eth_rxtx_callback *cur_cb; /**< Callback instance */ - volatile bool umwait_in_progress; - /**< are we currently sleeping? */ uint64_t empty_poll_stats; /**< Number of empty polls */ } __rte_cache_aligned; @@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, struct rte_power_monitor_cond pmc; uint16_t ret; - /* -* we might get a cancellation request while being -* inside the callback, in which case the wakeup -* wouldn't work because it would've arrived too early. -* -* to get around this, we notify the other thread that -* we're sleeping, so that it can spin until we're done. -* unsolicited wakeups are perfectly safe. 
-*/ - q_conf->umwait_in_progress = true; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); - - /* check if we need to cancel sleep */ - if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) { - /* use monitoring condition to sleep */ - ret = rte_eth_get_monitor_addr(port_id, qidx, - &pmc); - if (ret == 0) - rte_power_monitor(&pmc, UINT64_MAX); - } - q_conf->umwait_in_progress = false; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); + /* use monitoring condition to sleep */ + ret = rte_eth_get_monitor_addr(port_id, qidx, + &pmc); + if (ret == 0) + rte_power_monitor(&pmc, UINT64_MAX); } } else q_conf->empty_poll_stats = 0; @@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx, return nb_rx; } +static int +queue_stopped(const uint16_t port_id, const uint16_t queue_id) +{ + struct rte_eth_rxq_info qinfo; + + if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) < 0) + return -1; + + return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED; +} + int rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint
[dpdk-dev] [PATCH v4 5/7] power: support callbacks for multiple Rx queues
Currently, there is a hard limitation on the PMD power management support that only allows it to support a single queue per lcore. This is not ideal as most DPDK use cases will poll multiple queues per core. The PMD power management mechanism relies on ethdev Rx callbacks, so it is very difficult to implement such support because callbacks are effectively stateless and have no visibility into what the other ethdev devices are doing. This places limitations on what we can do within the framework of Rx callbacks, but the basics of this implementation are as follows: - Replace per-queue structures with per-lcore ones, so that any device polled from the same lcore can share data - Any queue that is going to be polled from a specific lcore has to be added to the list of queues to poll, so that the callback is aware of other queues being polled by the same lcore - Both the empty poll counter and the actual power saving mechanism are shared between all queues polled on a particular lcore, and power saving is only activated when a specially designated "power saving" queue is polled. To put it another way, we have no idea which queue the user will poll in what order, so we rely on them telling us that queue X is the last one in the polling loop, so any power management should happen there. - A new API is added to mark a specific Rx queue as "power saving". Failing to call this API will result in no power management; however, when there is only one queue per core, it is obvious which queue is the "power saving" one, so things will still work without this new API for use cases that were previously working without it. - The limitation on UMWAIT-based polling is not removed because UMWAIT is incapable of monitoring more than one address. Also, while we're at it, update and improve the docs. 
Signed-off-by: Anatoly Burakov --- Notes: v3: - Move the list of supported NICs to NIC feature table v2: - Use a TAILQ for queues instead of a static array - Address feedback from Konstantin - Add additional checks for stopped queues doc/guides/nics/features.rst | 10 + doc/guides/prog_guide/power_man.rst| 75 +++-- doc/guides/rel_notes/release_21_08.rst | 3 + lib/power/rte_power_pmd_mgmt.c | 381 - lib/power/rte_power_pmd_mgmt.h | 34 +++ lib/power/version.map | 3 + 6 files changed, 412 insertions(+), 94 deletions(-) diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst index 403c2b03a3..a96e12d155 100644 --- a/doc/guides/nics/features.rst +++ b/doc/guides/nics/features.rst @@ -912,6 +912,16 @@ Supports to get Rx/Tx packet burst mode information. * **[implements] eth_dev_ops**: ``rx_burst_mode_get``, ``tx_burst_mode_get``. * **[related] API**: ``rte_eth_rx_burst_mode_get()``, ``rte_eth_tx_burst_mode_get()``. +.. _nic_features_get_monitor_addr: + +PMD power management using monitor addresses + + +Supports getting a monitoring condition to use together with Ethernet PMD power +management (see :doc:`../prog_guide/power_man` for more details). + +* **[implements] eth_dev_ops**: ``get_monitor_addr`` + .. _nic_features_other: Other dev ops not represented by a Feature diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index c70ae128ac..fac2c19516 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -198,34 +198,41 @@ Ethernet PMD Power Management API Abstract -Existing power management mechanisms require developers -to change application design or change code to make use of it. -The PMD power management API provides a convenient alternative -by utilizing Ethernet PMD RX callbacks, -and triggering power saving whenever empty poll count reaches a certain number. 
- -Monitor - This power saving scheme will put the CPU into optimized power state - and use the ``rte_power_monitor()`` function - to monitor the Ethernet PMD RX descriptor address, - and wake the CPU up whenever there's new traffic. - -Pause - This power saving scheme will avoid busy polling - by either entering power-optimized sleep state - with ``rte_power_pause()`` function, - or, if it's not available, use ``rte_pause()``. - -Frequency scaling - This power saving scheme will use ``librte_power`` library - functionality to scale the core frequency up/down - depending on traffic volume. - -.. note:: - - Currently, this power management API is limited to mandatory mapping - of 1 queue to 1 core (multiple queues are supported, - but they must be polled from different cores). +Existing power management mechanisms require developers to change application +design or change code to make use of it. The PMD power management API provides a +convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering +power savin
[dpdk-dev] [PATCH v4 6/7] power: support monitoring multiple Rx queues
Use the new multi-monitor intrinsic to allow monitoring multiple ethdev Rx queues while entering the energy efficient power state. The multi version will be used unconditionally if supported, and the UMWAIT one will only be used when multi-monitor is not supported by the hardware. Signed-off-by: Anatoly Burakov --- Notes: v4: - Fix possible out of bounds access - Added missing index increment doc/guides/prog_guide/power_man.rst | 9 ++-- lib/power/rte_power_pmd_mgmt.c | 84 - 2 files changed, 88 insertions(+), 5 deletions(-) diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index fac2c19516..3245a5ebed 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number. The "monitor" mode is only supported in the following configurations and scenarios: * If ``rte_cpu_get_intrinsics_support()`` function indicates that + ``rte_power_monitor_multi()`` function is supported by the platform, then + monitoring multiple Ethernet Rx queues for traffic will be supported. + +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only ``rte_power_monitor()`` is supported by the platform, then monitoring will be limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be monitored from a different lcore). -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the - ``rte_power_monitor()`` function is not supported, then monitor mode will not - be supported. +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the + two monitoring functions are supported, then monitor mode will not be supported. * Not all Ethernet devices support monitoring, even if the underlying platform may support the necessary CPU instructions. 
Please refer to diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index 7762cd39b8..97c9f1ea36 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -155,6 +155,32 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union queue *q) return 0; } +static inline int +get_monitor_addresses(struct pmd_core_cfg *cfg, + struct rte_power_monitor_cond *pmc, size_t len) +{ + const struct queue_list_entry *qle; + size_t i = 0; + int ret; + + TAILQ_FOREACH(qle, &cfg->head, next) { + const union queue *q = &qle->queue; + struct rte_power_monitor_cond *cur; + + /* attempted out of bounds access */ + if (i >= len) { + RTE_LOG(ERR, POWER, "Too many queues being monitored\n"); + return -1; + } + + cur = &pmc[i++]; + ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur); + if (ret < 0) + return ret; + } + return 0; +} + static void calc_tsc(void) { @@ -183,6 +209,48 @@ calc_tsc(void) } } +static uint16_t +clb_multiwait(uint16_t port_id, uint16_t qidx, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *addr __rte_unused) +{ + const unsigned int lcore = rte_lcore_id(); + const union queue q = {.portid = port_id, .qid = qidx}; + const bool empty = nb_rx == 0; + struct pmd_core_cfg *q_conf; + + q_conf = &lcore_cfg[lcore]; + + /* early exit */ + if (likely(!empty)) { + q_conf->empty_poll_stats = 0; + } else { + /* do we care about this particular queue? */ + if (!queue_is_power_save(q_conf, &q)) + return nb_rx; + + /* +* we can increment unconditionally here because if there were +* non-empty polls in other queues assigned to this core, we +* dropped the counter to zero anyway. 
+*/ + q_conf->empty_poll_stats++; + if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) { + struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS]; + uint16_t ret; + + /* gather all monitoring conditions */ + ret = get_monitor_addresses(q_conf, pmc, RTE_DIM(pmc)); + + if (ret == 0) + rte_power_monitor_multi(pmc, + q_conf->n_queues, UINT64_MAX); + } + } + + return nb_rx; +} + static uint16_t clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, uint16_t max_pkts __rte_unused, @@ -348,14 +416,19 @@ static int check_monitor(struct pmd_core_cfg
[dpdk-dev] [PATCH v4 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes
Currently, l3fwd-power enforces the limitation of having one queue per lcore. This is no longer necessary, so remove the limitation, and always mark the last queue in qconf as the power save queue. Signed-off-by: Anatoly Burakov --- examples/l3fwd-power/main.c | 39 +++-- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c index f8dfed1634..3057c06936 100644 --- a/examples/l3fwd-power/main.c +++ b/examples/l3fwd-power/main.c @@ -2498,6 +2498,27 @@ mode_to_str(enum appmode mode) } } +static void +pmd_pmgmt_set_up(unsigned int lcore, uint16_t portid, uint16_t qid, bool last) +{ + int ret; + + ret = rte_power_ethdev_pmgmt_queue_enable(lcore, portid, + qid, pmgmt_type); + if (ret < 0) + rte_exit(EXIT_FAILURE, + "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", + ret, portid); + + if (!last) + return; + ret = rte_power_ethdev_pmgmt_queue_set_power_save(lcore, portid, qid); + if (ret < 0) + rte_exit(EXIT_FAILURE, + "rte_power_ethdev_pmgmt_queue_set_power_save: err=%d, port=%d\n", + ret, portid); +} + int main(int argc, char **argv) { @@ -2723,12 +2744,6 @@ main(int argc, char **argv) printf("\nInitializing rx queues on lcore %u ... 
", lcore_id ); fflush(stdout); - /* PMD power management mode can only do 1 queue per core */ - if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) { - rte_exit(EXIT_FAILURE, - "In PMD power management mode, only one queue per lcore is allowed\n"); - } - /* init RX queues */ for(queue = 0; queue < qconf->n_rx_queue; ++queue) { struct rte_eth_rxconf rxq_conf; @@ -2767,15 +2782,9 @@ main(int argc, char **argv) "Fail to add ptype cb\n"); } - if (app_mode == APP_MODE_PMD_MGMT) { - ret = rte_power_ethdev_pmgmt_queue_enable( - lcore_id, portid, queueid, - pmgmt_type); - if (ret < 0) - rte_exit(EXIT_FAILURE, - "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n", - ret, portid); - } + if (app_mode == APP_MODE_PMD_MGMT) + pmd_pmgmt_set_up(lcore_id, portid, queueid, + queue == (qconf->n_rx_queue - 1)); } } -- 2.25.1
[dpdk-dev] [PATCH v5 2/7] net/af_xdp: add power monitor support
Implement support for .get_monitor_addr in AF_XDP driver. Signed-off-by: Anatoly Burakov --- Notes: v2: - Rewrite using the callback mechanism drivers/net/af_xdp/rte_eth_af_xdp.c | 34 + 1 file changed, 34 insertions(+) diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c index eb5660a3dc..7830d0c23a 100644 --- a/drivers/net/af_xdp/rte_eth_af_xdp.c +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "compat.h" @@ -788,6 +789,38 @@ eth_dev_configure(struct rte_eth_dev *dev) return 0; } +#define CLB_VAL_IDX 0 +static int +eth_monitor_callback(const uint64_t value, + const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]) +{ + const uint64_t v = opaque[CLB_VAL_IDX]; + const uint64_t m = (uint32_t)~0; + + /* if the value has changed, abort entering power optimized state */ + return (value & m) == v ? 0 : -1; +} + +static int +eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) +{ + struct pkt_rx_queue *rxq = rx_queue; + unsigned int *prod = rxq->rx.producer; + const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */ + + /* watch for changes in producer ring */ + pmc->addr = (void*)prod; + + /* store current value */ + pmc->opaque[CLB_VAL_IDX] = cur_val; + pmc->fn = eth_monitor_callback; + + /* AF_XDP producer ring index is 32-bit */ + pmc->size = sizeof(uint32_t); + + return 0; +} + static int eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) { @@ -1448,6 +1481,7 @@ static const struct eth_dev_ops ops = { .link_update = eth_link_update, .stats_get = eth_stats_get, .stats_reset = eth_stats_reset, + .get_monitor_addr = eth_get_monitor_addr }; /** parse busy_budget argument */ -- 2.25.1
[dpdk-dev] [PATCH v5 0/7] Enhancements for PMD power management
This patchset introduces several changes related to PMD power management: - Changed monitoring intrinsics to use callbacks as a comparison function, based on previous patchset [1] but incorporating feedback [2] - this hopefully will make it possible to add support for .get_monitor_addr in virtio - Add a new intrinsic to monitor multiple addresses, based on RTM instruction set and the TPAUSE instruction - Add support for PMD power management on multiple queues, as well as all accompanying infrastructure and example apps changes v5: - Removed "power save queue" API and replaced with mechanism suggested by Konstantin - Addressed other feedback v4: - Replaced raw number with a macro - Fixed all the bugs found by Konstantin - Some other minor corrections v3: - Moved some doc updates to NIC features list v2: - Changed check inversion to callbacks - Addressed feedback from Konstantin - Added doc updates where necessary [1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=* [2] http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274 Anatoly Burakov (7): power_intrinsics: use callbacks for comparison net/af_xdp: add power monitor support eal: add power monitor for multiple events power: remove thread safety from PMD power API's power: support callbacks for multiple Rx queues power: support monitoring multiple Rx queues l3fwd-power: support multiqueue in PMD pmgmt modes doc/guides/nics/features.rst | 10 + doc/guides/prog_guide/power_man.rst | 68 +- doc/guides/rel_notes/release_21_08.rst| 11 + drivers/event/dlb2/dlb2.c | 17 +- drivers/net/af_xdp/rte_eth_af_xdp.c | 34 + drivers/net/i40e/i40e_rxtx.c | 20 +- drivers/net/iavf/iavf_rxtx.c | 20 +- drivers/net/ice/ice_rxtx.c| 20 +- drivers/net/ixgbe/ixgbe_rxtx.c| 20 +- drivers/net/mlx5/mlx5_rx.c| 17 +- examples/l3fwd-power/main.c | 6 - lib/eal/arm/rte_power_intrinsics.c| 11 + lib/eal/include/generic/rte_cpuflags.h| 2 + 
.../include/generic/rte_power_intrinsics.h| 68 +- lib/eal/ppc/rte_power_intrinsics.c| 11 + lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 90 ++- lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c| 633 +- lib/power/rte_power_pmd_mgmt.h| 6 + 21 files changed, 810 insertions(+), 262 deletions(-) -- 2.25.1
[dpdk-dev] [PATCH v5 3/7] eal: add power monitor for multiple events
Use RTM and WAITPKG instructions to perform a wait-for-writes similar to what UMWAIT does, but without the limitation of having to listen for just one event. This works because the optimized power state used by the TPAUSE instruction will cause a wake up on RTM transaction abort, so if we add the addresses we're interested in to the read-set, any write to those addresses will wake us up. Signed-off-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov --- Notes: v4: - Fixed bugs in accessing the monitor condition - Abort on any monitor condition not having a defined callback v2: - Adapt to callback mechanism doc/guides/rel_notes/release_21_08.rst| 2 + lib/eal/arm/rte_power_intrinsics.c| 11 +++ lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 35 + lib/eal/ppc/rte_power_intrinsics.c| 11 +++ lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 73 +++ 8 files changed, 139 insertions(+) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index c84ac280f5..9d1cfac395 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -55,6 +55,8 @@ New Features Also, make sure to start the actual text at the margin. === +* eal: added ``rte_power_monitor_multi`` to support waiting for multiple events. 
+ Removed Items - diff --git a/lib/eal/arm/rte_power_intrinsics.c b/lib/eal/arm/rte_power_intrinsics.c index e83f04072a..78f55b7203 100644 --- a/lib/eal/arm/rte_power_intrinsics.c +++ b/lib/eal/arm/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp) +{ + RTE_SET_USED(pmc); + RTE_SET_USED(num); + RTE_SET_USED(tsc_timestamp); + + return -ENOTSUP; +} diff --git a/lib/eal/include/generic/rte_cpuflags.h b/lib/eal/include/generic/rte_cpuflags.h index 28a5aecde8..d35551e931 100644 --- a/lib/eal/include/generic/rte_cpuflags.h +++ b/lib/eal/include/generic/rte_cpuflags.h @@ -24,6 +24,8 @@ struct rte_cpu_intrinsics { /**< indicates support for rte_power_monitor function */ uint32_t power_pause : 1; /**< indicates support for rte_power_pause function */ + uint32_t power_monitor_multi : 1; + /**< indicates support for rte_power_monitor_multi function */ }; /** diff --git a/lib/eal/include/generic/rte_power_intrinsics.h b/lib/eal/include/generic/rte_power_intrinsics.h index c9aa52a86d..04e8c2ab37 100644 --- a/lib/eal/include/generic/rte_power_intrinsics.h +++ b/lib/eal/include/generic/rte_power_intrinsics.h @@ -128,4 +128,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id); __rte_experimental int rte_power_pause(const uint64_t tsc_timestamp); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Monitor a set of addresses for changes. This will cause the CPU to enter an + * architecture-defined optimized power state until either one of the specified + * memory addresses is written to, a certain TSC timestamp is reached, or other + * reasons cause the CPU to wake up. + * + * Additionally, `expected` 64-bit values and 64-bit masks are provided. 
If + * mask is non-zero, the current value pointed to by the `p` pointer will be + * checked against the expected value, and if they do not match, the entering of + * optimized power state may be aborted. + * + * @warning It is responsibility of the user to check if this function is + * supported at runtime using `rte_cpu_get_intrinsics_support()` API call. + * Failing to do so may result in an illegal CPU instruction error. + * + * @param pmc + * An array of monitoring condition structures. + * @param num + * Length of the `pmc` array. + * @param tsc_timestamp + * Maximum TSC timestamp to wait for. Note that the wait behavior is + * architecture-dependent. + * + * @return + * 0 on success + * -EINVAL on invalid parameters + * -ENOTSUP if unsupported + */ +__rte_experimental +int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp); + #endif /* _RTE_POWER_INTRINSIC_H_ */ diff --git a/lib/eal/ppc/rte_power_intrinsics.c b/lib/eal/ppc/rte_power_intrinsics.c index 7fc9586da7..f00b58ade5 100644 --- a/lib/eal/ppc/rte_power_intrinsics.c +++ b/lib/eal/ppc/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], +
[dpdk-dev] [PATCH v5 1/7] power_intrinsics: use callbacks for comparison
Previously, the semantics of power monitor were such that we were checking current value against the expected value, and if they matched, then the sleep was aborted. This is somewhat inflexible, because it only allowed us to check for a specific value in a specific way. This commit replaces the comparison with a user callback mechanism, so that any PMD (or other code) using `rte_power_monitor()` can define their own comparison semantics and decision making on how to detect the need to abort the entering of power optimized state. Existing implementations are adjusted to follow the new semantics. Suggested-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov Acked-by: Konstantin Ananyev --- Notes: v4: - Return error if callback is set to NULL - Replace raw number with a macro in monitor condition opaque data v2: - Use callback mechanism for more flexibility - Address feedback from Konstantin doc/guides/rel_notes/release_21_08.rst| 1 + drivers/event/dlb2/dlb2.c | 17 -- drivers/net/i40e/i40e_rxtx.c | 20 +++ drivers/net/iavf/iavf_rxtx.c | 20 +++ drivers/net/ice/ice_rxtx.c| 20 +++ drivers/net/ixgbe/ixgbe_rxtx.c| 20 +++ drivers/net/mlx5/mlx5_rx.c| 17 -- .../include/generic/rte_power_intrinsics.h| 33 +++ lib/eal/x86/rte_power_intrinsics.c| 17 +- 9 files changed, 121 insertions(+), 44 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index a6ecfdf3ce..c84ac280f5 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -84,6 +84,7 @@ API Changes Also, make sure to start the actual text at the margin. === +* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. 
ABI Changes --- diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c index eca183753f..252bbd8d5e 100644 --- a/drivers/event/dlb2/dlb2.c +++ b/drivers/event/dlb2/dlb2.c @@ -3154,6 +3154,16 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num) } } +#define CLB_MASK_IDX 0 +#define CLB_VAL_IDX 1 +static int +dlb2_monitor_callback(const uint64_t val, + const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]) +{ + /* abort if the value matches */ + return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 : 0; +} + static inline int dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, struct dlb2_eventdev_port *ev_port, @@ -3194,8 +3204,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, expected_value = 0; pmc.addr = monitor_addr; - pmc.val = expected_value; - pmc.mask = qe_mask.raw_qe[1]; + /* store expected value and comparison mask in opaque data */ + pmc.opaque[CLB_VAL_IDX] = expected_value; + pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1]; + /* set up callback */ + pmc.fn = dlb2_monitor_callback; pmc.size = sizeof(uint64_t); rte_power_monitor(&pmc, timeout + start_ticks); diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c index 6c58decece..081682f88b 100644 --- a/drivers/net/i40e/i40e_rxtx.c +++ b/drivers/net/i40e/i40e_rxtx.c @@ -81,6 +81,18 @@ #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \ (PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK) +static int +i40e_monitor_callback(const uint64_t value, + const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ] __rte_unused) +{ + const uint64_t m = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* +* we expect the DD bit to be set to 1 if this descriptor was already +* written to. +*/ + return (value & m) == m ? 
-1 : 0; +} + int i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) { @@ -93,12 +105,8 @@ i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) /* watch for changes in status bit */ pmc->addr = &rxdp->wb.qword1.status_error_len; - /* -* we expect the DD bit to be set to 1 if this descriptor was already -* written to. -*/ - pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); - pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* comparison callback */ + pmc->fn = i40e_monitor_callback; /* registers are 64-bit */ pmc->size = sizeof(uint64_t); diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index 0361af0d85..7ed196ec22 100644 --- a/drivers/net/iavf/iavf_rxtx.c +++ b/drivers/net/iavf/iavf_rx
[dpdk-dev] [PATCH v5 4/7] power: remove thread safety from PMD power API's
Currently, we expect that only one callback can be active at any given moment, for a particular queue configuration, which is relatively easy to implement in a thread-safe way. However, we're about to add support for multiple queues per lcore, which will greatly increase the possibility of various race conditions. We could have used something like an RCU for this use case, but absent of a pressing need for thread safety we'll go the easy way and just mandate that the API's are to be called when all affected ports are stopped, and document this limitation. This greatly simplifies the `rte_power_monitor`-related code. Signed-off-by: Anatoly Burakov --- Notes: v2: - Add check for stopped queue - Clarified doc message - Added release notes doc/guides/rel_notes/release_21_08.rst | 5 + lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c | 133 ++--- lib/power/rte_power_pmd_mgmt.h | 6 ++ 4 files changed, 67 insertions(+), 80 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index 9d1cfac395..f015c509fc 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -88,6 +88,11 @@ API Changes * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. +* rte_power: The experimental PMD power management API is no longer considered + to be thread safe; all Rx queues affected by the API will now need to be + stopped before making any changes to the power management scheme. 
+ + ABI Changes --- diff --git a/lib/power/meson.build b/lib/power/meson.build index c1097d32f1..4f6a242364 100644 --- a/lib/power/meson.build +++ b/lib/power/meson.build @@ -21,4 +21,7 @@ headers = files( 'rte_power_pmd_mgmt.h', 'rte_power_guest_channel.h', ) +if cc.has_argument('-Wno-cast-qual') +cflags += '-Wno-cast-qual' +endif deps += ['timer', 'ethdev'] diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index db03cbf420..9b95cf1794 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -40,8 +40,6 @@ struct pmd_queue_cfg { /**< Callback mode for this queue */ const struct rte_eth_rxtx_callback *cur_cb; /**< Callback instance */ - volatile bool umwait_in_progress; - /**< are we currently sleeping? */ uint64_t empty_poll_stats; /**< Number of empty polls */ } __rte_cache_aligned; @@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, struct rte_power_monitor_cond pmc; uint16_t ret; - /* -* we might get a cancellation request while being -* inside the callback, in which case the wakeup -* wouldn't work because it would've arrived too early. -* -* to get around this, we notify the other thread that -* we're sleeping, so that it can spin until we're done. -* unsolicited wakeups are perfectly safe. 
-*/ - q_conf->umwait_in_progress = true; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); - - /* check if we need to cancel sleep */ - if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) { - /* use monitoring condition to sleep */ - ret = rte_eth_get_monitor_addr(port_id, qidx, - &pmc); - if (ret == 0) - rte_power_monitor(&pmc, UINT64_MAX); - } - q_conf->umwait_in_progress = false; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); + /* use monitoring condition to sleep */ + ret = rte_eth_get_monitor_addr(port_id, qidx, + &pmc); + if (ret == 0) + rte_power_monitor(&pmc, UINT64_MAX); } } else q_conf->empty_poll_stats = 0; @@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx, return nb_rx; } +static int +queue_stopped(const uint16_t port_id, const uint16_t queue_id) +{ + struct rte_eth_rxq_info qinfo; + + if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) < 0) + return -1; + + return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED; +} + int rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint
[dpdk-dev] [PATCH v5 6/7] power: support monitoring multiple Rx queues
Use the new multi-monitor intrinsic to allow monitoring multiple ethdev Rx queues while entering the energy efficient power state. The multi version will be used unconditionally if supported, and the UMWAIT one will only be used when multi-monitor is not supported by the hardware. Signed-off-by: Anatoly Burakov --- Notes: v4: - Fix possible out of bounds access - Added missing index increment doc/guides/prog_guide/power_man.rst | 9 ++-- lib/power/rte_power_pmd_mgmt.c | 81 - 2 files changed, 85 insertions(+), 5 deletions(-) diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index ec04a72108..94353ca012 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number. The "monitor" mode is only supported in the following configurations and scenarios: * If ``rte_cpu_get_intrinsics_support()`` function indicates that + ``rte_power_monitor_multi()`` function is supported by the platform, then + monitoring multiple Ethernet Rx queues for traffic will be supported. + +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only ``rte_power_monitor()`` is supported by the platform, then monitoring will be limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be monitored from a different lcore). -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the - ``rte_power_monitor()`` function is not supported, then monitor mode will not - be supported. +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the + two monitoring functions are supported, then monitor mode will not be supported. * Not all Ethernet drivers support monitoring, even if the underlying platform may support the necessary CPU instructions. 
Please refer to diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index fccfd236c2..2056996b9c 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -124,6 +124,32 @@ queue_list_take(struct pmd_core_cfg *cfg, const union queue *q) return found; } +static inline int +get_monitor_addresses(struct pmd_core_cfg *cfg, + struct rte_power_monitor_cond *pmc, size_t len) +{ + const struct queue_list_entry *qle; + size_t i = 0; + int ret; + + TAILQ_FOREACH(qle, &cfg->head, next) { + const union queue *q = &qle->queue; + struct rte_power_monitor_cond *cur; + + /* attempted out of bounds access */ + if (i >= len) { + RTE_LOG(ERR, POWER, "Too many queues being monitored\n"); + return -1; + } + + cur = &pmc[i++]; + ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur); + if (ret < 0) + return ret; + } + return 0; +} + static void calc_tsc(void) { @@ -190,6 +216,45 @@ lcore_can_sleep(struct pmd_core_cfg *cfg) return true; } +static uint16_t +clb_multiwait(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *arg) +{ + const unsigned int lcore = rte_lcore_id(); + struct queue_list_entry *queue_conf = arg; + struct pmd_core_cfg *lcore_conf; + const bool empty = nb_rx == 0; + + lcore_conf = &lcore_cfgs[lcore]; + + /* early exit */ + if (likely(!empty)) + /* early exit */ + queue_reset(lcore_conf, queue_conf); + else { + struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS]; + int ret; + + /* can this queue sleep? */ + if (!queue_can_sleep(lcore_conf, queue_conf)) + return nb_rx; + + /* can this lcore sleep? 
*/ + if (!lcore_can_sleep(lcore_conf)) + return nb_rx; + + /* gather all monitoring conditions */ + ret = get_monitor_addresses(lcore_conf, pmc, RTE_DIM(pmc)); + if (ret < 0) + return nb_rx; + + rte_power_monitor_multi(pmc, lcore_conf->n_queues, UINT64_MAX); + } + + return nb_rx; +} + static uint16_t clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, uint16_t max_pkts __rte_unused, void *arg) @@ -341,14 +406,19 @@ static int check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata) { struct rte_power_monitor_cond dummy; + bool multimonitor_supported; /* check if rte_power_monitor is supported */ if (!global_data.intrinsics_support.power_monitor) {
[dpdk-dev] [PATCH v5 5/7] power: support callbacks for multiple Rx queues
Currently, there is a hard limitation on the PMD power management support that only allows it to support a single queue per lcore. This is not ideal as most DPDK use cases will poll multiple queues per core. The PMD power management mechanism relies on ethdev Rx callbacks, so it is very difficult to implement such support because callbacks are effectively stateless and have no visibility into what the other ethdev devices are doing. This places limitations on what we can do within the framework of Rx callbacks, but the basics of this implementation are as follows: - Replace per-queue structures with per-lcore ones, so that any device polled from the same lcore can share data - Any queue that is going to be polled from a specific lcore has to be added to the list of queues to poll, so that the callback is aware of other queues being polled by the same lcore - Both the empty poll counter and the actual power saving mechanism are shared between all queues polled on a particular lcore, and power saving is only activated when all queues in the list were polled and were determined to have no traffic. - The limitation on UMWAIT-based polling is not removed because UMWAIT is incapable of monitoring more than one address. Also, while we're at it, update and improve the docs. 
Signed-off-by: Anatoly Burakov --- Notes: v5: - Remove the "power save queue" API and replace it with mechanism suggested by Konstantin v3: - Move the list of supported NICs to NIC feature table v2: - Use a TAILQ for queues instead of a static array - Address feedback from Konstantin - Add additional checks for stopped queues doc/guides/nics/features.rst | 10 + doc/guides/prog_guide/power_man.rst| 65 ++-- doc/guides/rel_notes/release_21_08.rst | 3 + lib/power/rte_power_pmd_mgmt.c | 431 ++--- 4 files changed, 373 insertions(+), 136 deletions(-) diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst index 403c2b03a3..a96e12d155 100644 --- a/doc/guides/nics/features.rst +++ b/doc/guides/nics/features.rst @@ -912,6 +912,16 @@ Supports to get Rx/Tx packet burst mode information. * **[implements] eth_dev_ops**: ``rx_burst_mode_get``, ``tx_burst_mode_get``. * **[related] API**: ``rte_eth_rx_burst_mode_get()``, ``rte_eth_tx_burst_mode_get()``. +.. _nic_features_get_monitor_addr: + +PMD power management using monitor addresses + + +Supports getting a monitoring condition to use together with Ethernet PMD power +management (see :doc:`../prog_guide/power_man` for more details). + +* **[implements] eth_dev_ops**: ``get_monitor_addr`` + .. _nic_features_other: Other dev ops not represented by a Feature diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index c70ae128ac..ec04a72108 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -198,34 +198,41 @@ Ethernet PMD Power Management API Abstract -Existing power management mechanisms require developers -to change application design or change code to make use of it. -The PMD power management API provides a convenient alternative -by utilizing Ethernet PMD RX callbacks, -and triggering power saving whenever empty poll count reaches a certain number. 
- -Monitor - This power saving scheme will put the CPU into optimized power state - and use the ``rte_power_monitor()`` function - to monitor the Ethernet PMD RX descriptor address, - and wake the CPU up whenever there's new traffic. - -Pause - This power saving scheme will avoid busy polling - by either entering power-optimized sleep state - with ``rte_power_pause()`` function, - or, if it's not available, use ``rte_pause()``. - -Frequency scaling - This power saving scheme will use ``librte_power`` library - functionality to scale the core frequency up/down - depending on traffic volume. - -.. note:: - - Currently, this power management API is limited to mandatory mapping - of 1 queue to 1 core (multiple queues are supported, - but they must be polled from different cores). +Existing power management mechanisms require developers to change application +design or change code to make use of it. The PMD power management API provides a +convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering +power saving whenever empty poll count reaches a certain number. + +* Monitor + This power saving scheme will put the CPU into optimized power state and + monitor the Ethernet PMD RX descriptor address, waking the CPU up whenever + there's new traffic. Support for this scheme may not be available on all + platforms, and further limitations may apply (see below). + +* Pause + This power saving scheme will avoid busy polling by either entering + power-optimized sleep state with ``rte_power_pause()`` function, or, if
[dpdk-dev] [PATCH v5 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes
Currently, l3fwd-power enforces the limitation of having one queue per lcore. This is no longer necessary, so remove the limitation. Signed-off-by: Anatoly Burakov --- examples/l3fwd-power/main.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c index f8dfed1634..52f56dc405 100644 --- a/examples/l3fwd-power/main.c +++ b/examples/l3fwd-power/main.c @@ -2723,12 +2723,6 @@ main(int argc, char **argv) printf("\nInitializing rx queues on lcore %u ... ", lcore_id ); fflush(stdout); - /* PMD power management mode can only do 1 queue per core */ - if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) { - rte_exit(EXIT_FAILURE, - "In PMD power management mode, only one queue per lcore is allowed\n"); - } - /* init RX queues */ for(queue = 0; queue < qconf->n_rx_queue; ++queue) { struct rte_eth_rxconf rxq_conf; -- 2.25.1
[dpdk-dev] [PATCH v6 0/7] Enhancements for PMD power management
This patchset introduces several changes related to PMD power management: - Changed monitoring intrinsics to use callbacks as a comparison function, based on previous patchset [1] but incorporating feedback [2] - this hopefully will make it possible to add support for .get_monitor_addr in virtio - Add a new intrinsic to monitor multiple addresses, based on RTM instruction set and the TPAUSE instruction - Add support for PMD power management on multiple queues, as well as all accompanying infrastructure and example apps changes v6: - Improved the algorithm for multi-queue sleep - Fixed segfault and addressed other feedback v5: - Removed "power save queue" API and replaced with mechanism suggested by Konstantin - Addressed other feedback v4: - Replaced raw number with a macro - Fixed all the bugs found by Konstantin - Some other minor corrections v3: - Moved some doc updates to NIC features list v2: - Changed check inversion to callbacks - Addressed feedback from Konstantin - Added doc updates where necessary [1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=* [2] http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274 Anatoly Burakov (7): power_intrinsics: use callbacks for comparison net/af_xdp: add power monitor support eal: add power monitor for multiple events power: remove thread safety from PMD power API's power: support callbacks for multiple Rx queues power: support monitoring multiple Rx queues l3fwd-power: support multiqueue in PMD pmgmt modes doc/guides/nics/features.rst | 10 + doc/guides/prog_guide/power_man.rst | 68 +- doc/guides/rel_notes/release_21_08.rst| 11 + drivers/event/dlb2/dlb2.c | 17 +- drivers/net/af_xdp/rte_eth_af_xdp.c | 34 + drivers/net/i40e/i40e_rxtx.c | 20 +- drivers/net/iavf/iavf_rxtx.c | 20 +- drivers/net/ice/ice_rxtx.c| 20 +- drivers/net/ixgbe/ixgbe_rxtx.c| 20 +- drivers/net/mlx5/mlx5_rx.c| 17 +- examples/l3fwd-power/main.c | 6 - 
lib/eal/arm/rte_power_intrinsics.c| 11 + lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 68 +- lib/eal/ppc/rte_power_intrinsics.c| 11 + lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 90 ++- lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c| 655 +- lib/power/rte_power_pmd_mgmt.h| 6 + 21 files changed, 832 insertions(+), 262 deletions(-) -- 2.25.1
[dpdk-dev] [PATCH v6 1/7] power_intrinsics: use callbacks for comparison
Previously, the semantics of power monitor were such that we were checking current value against the expected value, and if they matched, then the sleep was aborted. This is somewhat inflexible, because it only allowed us to check for a specific value in a specific way. This commit replaces the comparison with a user callback mechanism, so that any PMD (or other code) using `rte_power_monitor()` can define their own comparison semantics and decision making on how to detect the need to abort the entering of power optimized state. Existing implementations are adjusted to follow the new semantics. Suggested-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov Acked-by: Konstantin Ananyev --- Notes: v4: - Return error if callback is set to NULL - Replace raw number with a macro in monitor condition opaque data v2: - Use callback mechanism for more flexibility - Address feedback from Konstantin doc/guides/rel_notes/release_21_08.rst| 1 + drivers/event/dlb2/dlb2.c | 17 -- drivers/net/i40e/i40e_rxtx.c | 20 +++ drivers/net/iavf/iavf_rxtx.c | 20 +++ drivers/net/ice/ice_rxtx.c| 20 +++ drivers/net/ixgbe/ixgbe_rxtx.c| 20 +++ drivers/net/mlx5/mlx5_rx.c| 17 -- .../include/generic/rte_power_intrinsics.h| 33 +++ lib/eal/x86/rte_power_intrinsics.c| 17 +- 9 files changed, 121 insertions(+), 44 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index a6ecfdf3ce..c84ac280f5 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -84,6 +84,7 @@ API Changes Also, make sure to start the actual text at the margin. === +* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. 
ABI Changes --- diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c index eca183753f..252bbd8d5e 100644 --- a/drivers/event/dlb2/dlb2.c +++ b/drivers/event/dlb2/dlb2.c @@ -3154,6 +3154,16 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num) } } +#define CLB_MASK_IDX 0 +#define CLB_VAL_IDX 1 +static int +dlb2_monitor_callback(const uint64_t val, + const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]) +{ + /* abort if the value matches */ + return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 : 0; +} + static inline int dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, struct dlb2_eventdev_port *ev_port, @@ -3194,8 +3204,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, expected_value = 0; pmc.addr = monitor_addr; - pmc.val = expected_value; - pmc.mask = qe_mask.raw_qe[1]; + /* store expected value and comparison mask in opaque data */ + pmc.opaque[CLB_VAL_IDX] = expected_value; + pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1]; + /* set up callback */ + pmc.fn = dlb2_monitor_callback; pmc.size = sizeof(uint64_t); rte_power_monitor(&pmc, timeout + start_ticks); diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c index 6c58decece..081682f88b 100644 --- a/drivers/net/i40e/i40e_rxtx.c +++ b/drivers/net/i40e/i40e_rxtx.c @@ -81,6 +81,18 @@ #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \ (PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK) +static int +i40e_monitor_callback(const uint64_t value, + const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ] __rte_unused) +{ + const uint64_t m = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* +* we expect the DD bit to be set to 1 if this descriptor was already +* written to. +*/ + return (value & m) == m ? 
-1 : 0; +} + int i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) { @@ -93,12 +105,8 @@ i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) /* watch for changes in status bit */ pmc->addr = &rxdp->wb.qword1.status_error_len; - /* -* we expect the DD bit to be set to 1 if this descriptor was already -* written to. -*/ - pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); - pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* comparison callback */ + pmc->fn = i40e_monitor_callback; /* registers are 64-bit */ pmc->size = sizeof(uint64_t); diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index 0361af0d85..7ed196ec22 100644 --- a/drivers/net/iavf/iavf_rxtx.c +++ b/drivers/net/iavf/iavf_rx
[dpdk-dev] [PATCH v6 2/7] net/af_xdp: add power monitor support
Implement support for .get_monitor_addr in AF_XDP driver. Signed-off-by: Anatoly Burakov --- Notes: v2: - Rewrite using the callback mechanism drivers/net/af_xdp/rte_eth_af_xdp.c | 34 + 1 file changed, 34 insertions(+) diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c index eb5660a3dc..7830d0c23a 100644 --- a/drivers/net/af_xdp/rte_eth_af_xdp.c +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "compat.h" @@ -788,6 +789,38 @@ eth_dev_configure(struct rte_eth_dev *dev) return 0; } +#define CLB_VAL_IDX 0 +static int +eth_monitor_callback(const uint64_t value, + const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]) +{ + const uint64_t v = opaque[CLB_VAL_IDX]; + const uint64_t m = (uint32_t)~0; + + /* if the value has changed, abort entering power optimized state */ + return (value & m) == v ? 0 : -1; +} + +static int +eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) +{ + struct pkt_rx_queue *rxq = rx_queue; + unsigned int *prod = rxq->rx.producer; + const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */ + + /* watch for changes in producer ring */ + pmc->addr = (void*)prod; + + /* store current value */ + pmc->opaque[CLB_VAL_IDX] = cur_val; + pmc->fn = eth_monitor_callback; + + /* AF_XDP producer ring index is 32-bit */ + pmc->size = sizeof(uint32_t); + + return 0; +} + static int eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) { @@ -1448,6 +1481,7 @@ static const struct eth_dev_ops ops = { .link_update = eth_link_update, .stats_get = eth_stats_get, .stats_reset = eth_stats_reset, + .get_monitor_addr = eth_get_monitor_addr }; /** parse busy_budget argument */ -- 2.25.1
[dpdk-dev] [PATCH v6 3/7] eal: add power monitor for multiple events
Use RTM and WAITPKG instructions to perform a wait-for-writes similar to what UMWAIT does, but without the limitation of having to listen for just one event. This works because the optimized power state used by the TPAUSE instruction will cause a wake up on RTM transaction abort, so if we add the addresses we're interested in to the read-set, any write to those addresses will wake us up. Signed-off-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov --- Notes: v4: - Fixed bugs in accessing the monitor condition - Abort on any monitor condition not having a defined callback v2: - Adapt to callback mechanism doc/guides/rel_notes/release_21_08.rst| 2 + lib/eal/arm/rte_power_intrinsics.c| 11 +++ lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 35 + lib/eal/ppc/rte_power_intrinsics.c| 11 +++ lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 73 +++ 8 files changed, 139 insertions(+) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index c84ac280f5..9d1cfac395 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -55,6 +55,8 @@ New Features Also, make sure to start the actual text at the margin. === +* eal: added ``rte_power_monitor_multi`` to support waiting for multiple events. 
+ Removed Items - diff --git a/lib/eal/arm/rte_power_intrinsics.c b/lib/eal/arm/rte_power_intrinsics.c index e83f04072a..78f55b7203 100644 --- a/lib/eal/arm/rte_power_intrinsics.c +++ b/lib/eal/arm/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp) +{ + RTE_SET_USED(pmc); + RTE_SET_USED(num); + RTE_SET_USED(tsc_timestamp); + + return -ENOTSUP; +} diff --git a/lib/eal/include/generic/rte_cpuflags.h b/lib/eal/include/generic/rte_cpuflags.h index 28a5aecde8..d35551e931 100644 --- a/lib/eal/include/generic/rte_cpuflags.h +++ b/lib/eal/include/generic/rte_cpuflags.h @@ -24,6 +24,8 @@ struct rte_cpu_intrinsics { /**< indicates support for rte_power_monitor function */ uint32_t power_pause : 1; /**< indicates support for rte_power_pause function */ + uint32_t power_monitor_multi : 1; + /**< indicates support for rte_power_monitor_multi function */ }; /** diff --git a/lib/eal/include/generic/rte_power_intrinsics.h b/lib/eal/include/generic/rte_power_intrinsics.h index c9aa52a86d..04e8c2ab37 100644 --- a/lib/eal/include/generic/rte_power_intrinsics.h +++ b/lib/eal/include/generic/rte_power_intrinsics.h @@ -128,4 +128,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id); __rte_experimental int rte_power_pause(const uint64_t tsc_timestamp); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Monitor a set of addresses for changes. This will cause the CPU to enter an + * architecture-defined optimized power state until either one of the specified + * memory addresses is written to, a certain TSC timestamp is reached, or other + * reasons cause the CPU to wake up. + * + * Additionally, `expected` 64-bit values and 64-bit masks are provided. 
If + * mask is non-zero, the current value pointed to by the `p` pointer will be + * checked against the expected value, and if they do not match, the entering of + * optimized power state may be aborted. + * + * @warning It is responsibility of the user to check if this function is + * supported at runtime using `rte_cpu_get_intrinsics_support()` API call. + * Failing to do so may result in an illegal CPU instruction error. + * + * @param pmc + * An array of monitoring condition structures. + * @param num + * Length of the `pmc` array. + * @param tsc_timestamp + * Maximum TSC timestamp to wait for. Note that the wait behavior is + * architecture-dependent. + * + * @return + * 0 on success + * -EINVAL on invalid parameters + * -ENOTSUP if unsupported + */ +__rte_experimental +int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp); + #endif /* _RTE_POWER_INTRINSIC_H_ */ diff --git a/lib/eal/ppc/rte_power_intrinsics.c b/lib/eal/ppc/rte_power_intrinsics.c index 7fc9586da7..f00b58ade5 100644 --- a/lib/eal/ppc/rte_power_intrinsics.c +++ b/lib/eal/ppc/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], +
[dpdk-dev] [PATCH v6 4/7] power: remove thread safety from PMD power API's
Currently, we expect that only one callback can be active at any given moment, for a particular queue configuration, which is relatively easy to implement in a thread-safe way. However, we're about to add support for multiple queues per lcore, which will greatly increase the possibility of various race conditions. We could have used something like an RCU for this use case, but absent a pressing need for thread safety we'll go the easy way and just mandate that the APIs are to be called when all affected ports are stopped, and document this limitation. This greatly simplifies the `rte_power_monitor`-related code. Signed-off-by: Anatoly Burakov --- Notes: v2: - Add check for stopped queue - Clarified doc message - Added release notes doc/guides/rel_notes/release_21_08.rst | 5 + lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c | 133 ++--- lib/power/rte_power_pmd_mgmt.h | 6 ++ 4 files changed, 67 insertions(+), 80 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index 9d1cfac395..f015c509fc 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -88,6 +88,11 @@ API Changes * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. +* rte_power: The experimental PMD power management API is no longer considered + to be thread safe; all Rx queues affected by the API will now need to be + stopped before making any changes to the power management scheme. 
+ + ABI Changes --- diff --git a/lib/power/meson.build b/lib/power/meson.build index c1097d32f1..4f6a242364 100644 --- a/lib/power/meson.build +++ b/lib/power/meson.build @@ -21,4 +21,7 @@ headers = files( 'rte_power_pmd_mgmt.h', 'rte_power_guest_channel.h', ) +if cc.has_argument('-Wno-cast-qual') +cflags += '-Wno-cast-qual' +endif deps += ['timer', 'ethdev'] diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index db03cbf420..9b95cf1794 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -40,8 +40,6 @@ struct pmd_queue_cfg { /**< Callback mode for this queue */ const struct rte_eth_rxtx_callback *cur_cb; /**< Callback instance */ - volatile bool umwait_in_progress; - /**< are we currently sleeping? */ uint64_t empty_poll_stats; /**< Number of empty polls */ } __rte_cache_aligned; @@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, struct rte_power_monitor_cond pmc; uint16_t ret; - /* -* we might get a cancellation request while being -* inside the callback, in which case the wakeup -* wouldn't work because it would've arrived too early. -* -* to get around this, we notify the other thread that -* we're sleeping, so that it can spin until we're done. -* unsolicited wakeups are perfectly safe. 
-*/ - q_conf->umwait_in_progress = true; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); - - /* check if we need to cancel sleep */ - if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) { - /* use monitoring condition to sleep */ - ret = rte_eth_get_monitor_addr(port_id, qidx, - &pmc); - if (ret == 0) - rte_power_monitor(&pmc, UINT64_MAX); - } - q_conf->umwait_in_progress = false; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); + /* use monitoring condition to sleep */ + ret = rte_eth_get_monitor_addr(port_id, qidx, + &pmc); + if (ret == 0) + rte_power_monitor(&pmc, UINT64_MAX); } } else q_conf->empty_poll_stats = 0; @@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx, return nb_rx; } +static int +queue_stopped(const uint16_t port_id, const uint16_t queue_id) +{ + struct rte_eth_rxq_info qinfo; + + if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) < 0) + return -1; + + return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED; +} + int rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint
[dpdk-dev] [PATCH v6 5/7] power: support callbacks for multiple Rx queues
Currently, there is a hard limitation on the PMD power management support that only allows it to support a single queue per lcore. This is not ideal as most DPDK use cases will poll multiple queues per core. The PMD power management mechanism relies on ethdev Rx callbacks, so it is very difficult to implement such support because callbacks are effectively stateless and have no visibility into what the other ethdev devices are doing. This places limitations on what we can do within the framework of Rx callbacks, but the basics of this implementation are as follows: - Replace per-queue structures with per-lcore ones, so that any device polled from the same lcore can share data - Any queue that is going to be polled from a specific lcore has to be added to the list of queues to poll, so that the callback is aware of other queues being polled by the same lcore - Both the empty poll counter and the actual power saving mechanism are shared between all queues polled on a particular lcore, and are only activated when all queues in the list were polled and were determined to have no traffic. - The limitation on UMWAIT-based polling is not removed because UMWAIT is incapable of monitoring more than one address. Also, while we're at it, update and improve the docs. 
Signed-off-by: Anatoly Burakov --- Notes: v6: - Track each individual queue sleep status (Konstantin) - Fix segfault (Dave) v5: - Remove the "power save queue" API and replace it with mechanism suggested by Konstantin v3: - Move the list of supported NICs to NIC feature table v2: - Use a TAILQ for queues instead of a static array - Address feedback from Konstantin - Add additional checks for stopped queues doc/guides/nics/features.rst | 10 + doc/guides/prog_guide/power_man.rst| 65 ++-- doc/guides/rel_notes/release_21_08.rst | 3 + lib/power/rte_power_pmd_mgmt.c | 452 +++-- 4 files changed, 394 insertions(+), 136 deletions(-) diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst index 403c2b03a3..a96e12d155 100644 --- a/doc/guides/nics/features.rst +++ b/doc/guides/nics/features.rst @@ -912,6 +912,16 @@ Supports to get Rx/Tx packet burst mode information. * **[implements] eth_dev_ops**: ``rx_burst_mode_get``, ``tx_burst_mode_get``. * **[related] API**: ``rte_eth_rx_burst_mode_get()``, ``rte_eth_tx_burst_mode_get()``. +.. _nic_features_get_monitor_addr: + +PMD power management using monitor addresses + + +Supports getting a monitoring condition to use together with Ethernet PMD power +management (see :doc:`../prog_guide/power_man` for more details). + +* **[implements] eth_dev_ops**: ``get_monitor_addr`` + .. _nic_features_other: Other dev ops not represented by a Feature diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index c70ae128ac..ec04a72108 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -198,34 +198,41 @@ Ethernet PMD Power Management API Abstract -Existing power management mechanisms require developers -to change application design or change code to make use of it. -The PMD power management API provides a convenient alternative -by utilizing Ethernet PMD RX callbacks, -and triggering power saving whenever empty poll count reaches a certain number. 
- -Monitor - This power saving scheme will put the CPU into optimized power state - and use the ``rte_power_monitor()`` function - to monitor the Ethernet PMD RX descriptor address, - and wake the CPU up whenever there's new traffic. - -Pause - This power saving scheme will avoid busy polling - by either entering power-optimized sleep state - with ``rte_power_pause()`` function, - or, if it's not available, use ``rte_pause()``. - -Frequency scaling - This power saving scheme will use ``librte_power`` library - functionality to scale the core frequency up/down - depending on traffic volume. - -.. note:: - - Currently, this power management API is limited to mandatory mapping - of 1 queue to 1 core (multiple queues are supported, - but they must be polled from different cores). +Existing power management mechanisms require developers to change application +design or change code to make use of it. The PMD power management API provides a +convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering +power saving whenever empty poll count reaches a certain number. + +* Monitor + This power saving scheme will put the CPU into optimized power state and + monitor the Ethernet PMD RX descriptor address, waking the CPU up whenever + there's new traffic. Support for this scheme may not be available on all + platforms, and further limitations may apply (see below). + +* Pause + This power saving scheme will avoid busy pollin
[dpdk-dev] [PATCH v6 6/7] power: support monitoring multiple Rx queues
Use the new multi-monitor intrinsic to allow monitoring multiple ethdev Rx queues while entering the energy efficient power state. The multi version will be used unconditionally if supported, and the UMWAIT one will only be used when multi-monitor is not supported by the hardware. Signed-off-by: Anatoly Burakov --- Notes: v6: - Fix the missed feedback from v5 v4: - Fix possible out of bounds access - Added missing index increment doc/guides/prog_guide/power_man.rst | 9 ++-- lib/power/rte_power_pmd_mgmt.c | 82 - 2 files changed, 86 insertions(+), 5 deletions(-) diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index ec04a72108..94353ca012 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain number. The "monitor" mode is only supported in the following configurations and scenarios: * If ``rte_cpu_get_intrinsics_support()`` function indicates that + ``rte_power_monitor_multi()`` function is supported by the platform, then + monitoring multiple Ethernet Rx queues for traffic will be supported. + +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only ``rte_power_monitor()`` is supported by the platform, then monitoring will be limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be monitored from a different lcore). -* If ``rte_cpu_get_intrinsics_support()`` function indicates that the - ``rte_power_monitor()`` function is not supported, then monitor mode will not - be supported. +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the + two monitoring functions are supported, then monitor mode will not be supported. * Not all Ethernet drivers support monitoring, even if the underlying platform may support the necessary CPU instructions. 
Please refer to diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index 9ffeda05ed..0c45469619 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -126,6 +126,32 @@ queue_list_take(struct pmd_core_cfg *cfg, const union queue *q) return found; } +static inline int +get_monitor_addresses(struct pmd_core_cfg *cfg, + struct rte_power_monitor_cond *pmc, size_t len) +{ + const struct queue_list_entry *qle; + size_t i = 0; + int ret; + + TAILQ_FOREACH(qle, &cfg->head, next) { + const union queue *q = &qle->queue; + struct rte_power_monitor_cond *cur; + + /* attempted out of bounds access */ + if (i >= len) { + RTE_LOG(ERR, POWER, "Too many queues being monitored\n"); + return -1; + } + + cur = &pmc[i++]; + ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur); + if (ret < 0) + return ret; + } + return 0; +} + static void calc_tsc(void) { @@ -211,6 +237,46 @@ lcore_can_sleep(struct pmd_core_cfg *cfg) return true; } +static uint16_t +clb_multiwait(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *arg) +{ + const unsigned int lcore = rte_lcore_id(); + struct queue_list_entry *queue_conf = arg; + struct pmd_core_cfg *lcore_conf; + const bool empty = nb_rx == 0; + + lcore_conf = &lcore_cfgs[lcore]; + + /* early exit */ + if (likely(!empty)) + /* early exit */ + queue_reset(lcore_conf, queue_conf); + else { + struct rte_power_monitor_cond pmc[lcore_conf->n_queues]; + int ret; + + /* can this queue sleep? */ + if (!queue_can_sleep(lcore_conf, queue_conf)) + return nb_rx; + + /* can this lcore sleep? 
*/ + if (!lcore_can_sleep(lcore_conf)) + return nb_rx; + + /* gather all monitoring conditions */ + ret = get_monitor_addresses(lcore_conf, pmc, + lcore_conf->n_queues); + if (ret < 0) + return nb_rx; + + rte_power_monitor_multi(pmc, lcore_conf->n_queues, UINT64_MAX); + } + + return nb_rx; +} + static uint16_t clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, uint16_t max_pkts __rte_unused, void *arg) @@ -362,14 +428,19 @@ static int check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata) { struct rte_power_monitor_cond dummy; + bool multimonitor_supported;
[dpdk-dev] [PATCH v6 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes
Currently, l3fwd-power enforces the limitation of having one queue per lcore. This is no longer necessary, so remove the limitation. Signed-off-by: Anatoly Burakov --- examples/l3fwd-power/main.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c index f8dfed1634..52f56dc405 100644 --- a/examples/l3fwd-power/main.c +++ b/examples/l3fwd-power/main.c @@ -2723,12 +2723,6 @@ main(int argc, char **argv) printf("\nInitializing rx queues on lcore %u ... ", lcore_id ); fflush(stdout); - /* PMD power management mode can only do 1 queue per core */ - if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) { - rte_exit(EXIT_FAILURE, - "In PMD power management mode, only one queue per lcore is allowed\n"); - } - /* init RX queues */ for(queue = 0; queue < qconf->n_rx_queue; ++queue) { struct rte_eth_rxconf rxq_conf; -- 2.25.1
[dpdk-dev] [PATCH v7 0/7] Enhancements for PMD power management
This patchset introduces several changes related to PMD power management: - Changed monitoring intrinsics to use callbacks as a comparison function, based on previous patchset [1] but incorporating feedback [2] - this hopefully will make it possible to add support for .get_monitor_addr in virtio - Add a new intrinsic to monitor multiple addresses, based on RTM instruction set and the TPAUSE instruction - Add support for PMD power management on multiple queues, as well as all accompanying infrastructure and example apps changes v7: - Fixed various bugs v6: - Improved the algorithm for multi-queue sleep - Fixed segfault and addressed other feedback v5: - Removed "power save queue" API and replaced with mechanism suggested by Konstantin - Addressed other feedback v4: - Replaced raw number with a macro - Fixed all the bugs found by Konstantin - Some other minor corrections v3: - Moved some doc updates to NIC features list v2: - Changed check inversion to callbacks - Addressed feedback from Konstantin - Added doc updates where necessary [1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=* [2] http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274 Anatoly Burakov (7): power_intrinsics: use callbacks for comparison net/af_xdp: add power monitor support eal: add power monitor for multiple events power: remove thread safety from PMD power API's power: support callbacks for multiple Rx queues power: support monitoring multiple Rx queues l3fwd-power: support multiqueue in PMD pmgmt modes doc/guides/nics/features.rst | 10 + doc/guides/prog_guide/power_man.rst | 74 +- doc/guides/rel_notes/release_21_08.rst| 9 + drivers/event/dlb2/dlb2.c | 17 +- drivers/net/af_xdp/rte_eth_af_xdp.c | 34 + drivers/net/i40e/i40e_rxtx.c | 20 +- drivers/net/iavf/iavf_rxtx.c | 20 +- drivers/net/ice/ice_rxtx.c| 20 +- drivers/net/ixgbe/ixgbe_rxtx.c| 20 +- drivers/net/mlx5/mlx5_rx.c| 17 +- 
examples/l3fwd-power/main.c | 6 - lib/eal/arm/rte_power_intrinsics.c| 11 + lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 68 +- lib/eal/ppc/rte_power_intrinsics.c| 11 + lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 90 ++- lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c| 659 +- lib/power/rte_power_pmd_mgmt.h| 6 + 21 files changed, 840 insertions(+), 262 deletions(-) -- 2.25.1
[dpdk-dev] [PATCH v7 1/7] power_intrinsics: use callbacks for comparison
Previously, the semantics of power monitor were such that we were checking current value against the expected value, and if they matched, then the sleep was aborted. This is somewhat inflexible, because it only allowed us to check for a specific value in a specific way. This commit replaces the comparison with a user callback mechanism, so that any PMD (or other code) using `rte_power_monitor()` can define their own comparison semantics and decision making on how to detect the need to abort the entering of power optimized state. Existing implementations are adjusted to follow the new semantics. Suggested-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov Acked-by: Konstantin Ananyev --- Notes: v4: - Return error if callback is set to NULL - Replace raw number with a macro in monitor condition opaque data v2: - Use callback mechanism for more flexibility - Address feedback from Konstantin doc/guides/rel_notes/release_21_08.rst| 2 ++ drivers/event/dlb2/dlb2.c | 17 -- drivers/net/i40e/i40e_rxtx.c | 20 +++ drivers/net/iavf/iavf_rxtx.c | 20 +++ drivers/net/ice/ice_rxtx.c| 20 +++ drivers/net/ixgbe/ixgbe_rxtx.c| 20 +++ drivers/net/mlx5/mlx5_rx.c| 17 -- .../include/generic/rte_power_intrinsics.h| 33 +++ lib/eal/x86/rte_power_intrinsics.c| 17 +- 9 files changed, 122 insertions(+), 44 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index cd02820e68..c1d063bb11 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -117,6 +117,8 @@ API Changes * eal: ``rte_strscpy`` sets ``rte_errno`` to ``E2BIG`` in case of string truncation. +* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. 
+ ABI Changes --- diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c index eca183753f..252bbd8d5e 100644 --- a/drivers/event/dlb2/dlb2.c +++ b/drivers/event/dlb2/dlb2.c @@ -3154,6 +3154,16 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num) } } +#define CLB_MASK_IDX 0 +#define CLB_VAL_IDX 1 +static int +dlb2_monitor_callback(const uint64_t val, + const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]) +{ + /* abort if the value matches */ + return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 : 0; +} + static inline int dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, struct dlb2_eventdev_port *ev_port, @@ -3194,8 +3204,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2, expected_value = 0; pmc.addr = monitor_addr; - pmc.val = expected_value; - pmc.mask = qe_mask.raw_qe[1]; + /* store expected value and comparison mask in opaque data */ + pmc.opaque[CLB_VAL_IDX] = expected_value; + pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1]; + /* set up callback */ + pmc.fn = dlb2_monitor_callback; pmc.size = sizeof(uint64_t); rte_power_monitor(&pmc, timeout + start_ticks); diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c index 8d65f287f4..65f325ede1 100644 --- a/drivers/net/i40e/i40e_rxtx.c +++ b/drivers/net/i40e/i40e_rxtx.c @@ -81,6 +81,18 @@ #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \ (PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK) +static int +i40e_monitor_callback(const uint64_t value, + const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ] __rte_unused) +{ + const uint64_t m = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* +* we expect the DD bit to be set to 1 if this descriptor was already +* written to. +*/ + return (value & m) == m ? 
-1 : 0; +} + int i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) { @@ -93,12 +105,8 @@ i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) /* watch for changes in status bit */ pmc->addr = &rxdp->wb.qword1.status_error_len; - /* -* we expect the DD bit to be set to 1 if this descriptor was already -* written to. -*/ - pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); - pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT); + /* comparison callback */ + pmc->fn = i40e_monitor_callback; /* registers are 64-bit */ pmc->size = sizeof(uint64_t); diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index f817fbc49b..d61b32fcee 100644 --- a/drivers/net/iavf/iavf_rxtx.c +++ b/drivers/net/iavf/iavf_rxtx.c @@ -57,6 +57,
[dpdk-dev] [PATCH v7 2/7] net/af_xdp: add power monitor support
Implement support for .get_monitor_addr in AF_XDP driver. Signed-off-by: Anatoly Burakov --- Notes: v2: - Rewrite using the callback mechanism drivers/net/af_xdp/rte_eth_af_xdp.c | 34 + 1 file changed, 34 insertions(+) diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c index eb5660a3dc..7830d0c23a 100644 --- a/drivers/net/af_xdp/rte_eth_af_xdp.c +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "compat.h" @@ -788,6 +789,38 @@ eth_dev_configure(struct rte_eth_dev *dev) return 0; } +#define CLB_VAL_IDX 0 +static int +eth_monitor_callback(const uint64_t value, + const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ]) +{ + const uint64_t v = opaque[CLB_VAL_IDX]; + const uint64_t m = (uint32_t)~0; + + /* if the value has changed, abort entering power optimized state */ + return (value & m) == v ? 0 : -1; +} + +static int +eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc) +{ + struct pkt_rx_queue *rxq = rx_queue; + unsigned int *prod = rxq->rx.producer; + const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */ + + /* watch for changes in producer ring */ + pmc->addr = (void*)prod; + + /* store current value */ + pmc->opaque[CLB_VAL_IDX] = cur_val; + pmc->fn = eth_monitor_callback; + + /* AF_XDP producer ring index is 32-bit */ + pmc->size = sizeof(uint32_t); + + return 0; +} + static int eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) { @@ -1448,6 +1481,7 @@ static const struct eth_dev_ops ops = { .link_update = eth_link_update, .stats_get = eth_stats_get, .stats_reset = eth_stats_reset, + .get_monitor_addr = eth_get_monitor_addr }; /** parse busy_budget argument */ -- 2.25.1
[dpdk-dev] [PATCH v7 3/7] eal: add power monitor for multiple events
Use RTM and WAITPKG instructions to perform a wait-for-writes similar to what UMWAIT does, but without the limitation of having to listen for just one event. This works because the optimized power state used by the TPAUSE instruction will cause a wake up on RTM transaction abort, so if we add the addresses we're interested in to the read-set, any write to those addresses will wake us up. Signed-off-by: Konstantin Ananyev Signed-off-by: Anatoly Burakov --- Notes: v4: - Fixed bugs in accessing the monitor condition - Abort on any monitor condition not having a defined callback v2: - Adapt to callback mechanism lib/eal/arm/rte_power_intrinsics.c| 11 +++ lib/eal/include/generic/rte_cpuflags.h| 2 + .../include/generic/rte_power_intrinsics.h| 35 + lib/eal/ppc/rte_power_intrinsics.c| 11 +++ lib/eal/version.map | 3 + lib/eal/x86/rte_cpuflags.c| 2 + lib/eal/x86/rte_power_intrinsics.c| 73 +++ 7 files changed, 137 insertions(+) diff --git a/lib/eal/arm/rte_power_intrinsics.c b/lib/eal/arm/rte_power_intrinsics.c index e83f04072a..78f55b7203 100644 --- a/lib/eal/arm/rte_power_intrinsics.c +++ b/lib/eal/arm/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp) +{ + RTE_SET_USED(pmc); + RTE_SET_USED(num); + RTE_SET_USED(tsc_timestamp); + + return -ENOTSUP; +} diff --git a/lib/eal/include/generic/rte_cpuflags.h b/lib/eal/include/generic/rte_cpuflags.h index 28a5aecde8..d35551e931 100644 --- a/lib/eal/include/generic/rte_cpuflags.h +++ b/lib/eal/include/generic/rte_cpuflags.h @@ -24,6 +24,8 @@ struct rte_cpu_intrinsics { /**< indicates support for rte_power_monitor function */ uint32_t power_pause : 1; /**< indicates support for rte_power_pause function */ + uint32_t power_monitor_multi : 1; + /**< indicates support for rte_power_monitor_multi function */ }; /** diff --git 
a/lib/eal/include/generic/rte_power_intrinsics.h b/lib/eal/include/generic/rte_power_intrinsics.h index c9aa52a86d..04e8c2ab37 100644 --- a/lib/eal/include/generic/rte_power_intrinsics.h +++ b/lib/eal/include/generic/rte_power_intrinsics.h @@ -128,4 +128,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id); __rte_experimental int rte_power_pause(const uint64_t tsc_timestamp); +/** + * @warning + * @b EXPERIMENTAL: this API may change without prior notice + * + * Monitor a set of addresses for changes. This will cause the CPU to enter an + * architecture-defined optimized power state until either one of the specified + * memory addresses is written to, a certain TSC timestamp is reached, or other + * reasons cause the CPU to wake up. + * + * Additionally, `expected` 64-bit values and 64-bit masks are provided. If + * mask is non-zero, the current value pointed to by the `p` pointer will be + * checked against the expected value, and if they do not match, the entering of + * optimized power state may be aborted. + * + * @warning It is responsibility of the user to check if this function is + * supported at runtime using `rte_cpu_get_intrinsics_support()` API call. + * Failing to do so may result in an illegal CPU instruction error. + * + * @param pmc + * An array of monitoring condition structures. + * @param num + * Length of the `pmc` array. + * @param tsc_timestamp + * Maximum TSC timestamp to wait for. Note that the wait behavior is + * architecture-dependent. 
+ * + * @return + * 0 on success + * -EINVAL on invalid parameters + * -ENOTSUP if unsupported + */ +__rte_experimental +int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp); + #endif /* _RTE_POWER_INTRINSIC_H_ */ diff --git a/lib/eal/ppc/rte_power_intrinsics.c b/lib/eal/ppc/rte_power_intrinsics.c index 7fc9586da7..f00b58ade5 100644 --- a/lib/eal/ppc/rte_power_intrinsics.c +++ b/lib/eal/ppc/rte_power_intrinsics.c @@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) return -ENOTSUP; } + +int +rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[], + const uint32_t num, const uint64_t tsc_timestamp) +{ + RTE_SET_USED(pmc); + RTE_SET_USED(num); + RTE_SET_USED(tsc_timestamp); + + return -ENOTSUP; +} diff --git a/lib/eal/version.map b/lib/eal/version.map index fe5c3dac98..4ccd5475d6 100644 --- a/lib/eal/version.map +++ b/lib/eal/version.map @@ -423,6 +423,9 @@ EXPERIMENTAL { rte_version_release; # WINDOWS_NO_EXPORT rte_version_suffix; # WINDOWS_NO_EXPORT rte_version_year; # WINDOWS_NO_EXPORT + + # added in 21.08 + rte_power
[dpdk-dev] [PATCH v7 4/7] power: remove thread safety from PMD power API's
Currently, we expect that only one callback can be active at any given moment, for a particular queue configuration, which is relatively easy to implement in a thread-safe way. However, we're about to add support for multiple queues per lcore, which will greatly increase the possibility of various race conditions. We could have used something like an RCU for this use case, but absent of a pressing need for thread safety we'll go the easy way and just mandate that the API's are to be called when all affected ports are stopped, and document this limitation. This greatly simplifies the `rte_power_monitor`-related code. Signed-off-by: Anatoly Burakov --- Notes: v2: - Add check for stopped queue - Clarified doc message - Added release notes doc/guides/rel_notes/release_21_08.rst | 4 + lib/power/meson.build | 3 + lib/power/rte_power_pmd_mgmt.c | 133 ++--- lib/power/rte_power_pmd_mgmt.h | 6 ++ 4 files changed, 66 insertions(+), 80 deletions(-) diff --git a/doc/guides/rel_notes/release_21_08.rst b/doc/guides/rel_notes/release_21_08.rst index c1d063bb11..4b84c89c0b 100644 --- a/doc/guides/rel_notes/release_21_08.rst +++ b/doc/guides/rel_notes/release_21_08.rst @@ -119,6 +119,10 @@ API Changes * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism. +* rte_power: The experimental PMD power management API is no longer considered + to be thread safe; all Rx queues affected by the API will now need to be + stopped before making any changes to the power management scheme. 
+ ABI Changes --- diff --git a/lib/power/meson.build b/lib/power/meson.build index c1097d32f1..4f6a242364 100644 --- a/lib/power/meson.build +++ b/lib/power/meson.build @@ -21,4 +21,7 @@ headers = files( 'rte_power_pmd_mgmt.h', 'rte_power_guest_channel.h', ) +if cc.has_argument('-Wno-cast-qual') +cflags += '-Wno-cast-qual' +endif deps += ['timer', 'ethdev'] diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index db03cbf420..9b95cf1794 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -40,8 +40,6 @@ struct pmd_queue_cfg { /**< Callback mode for this queue */ const struct rte_eth_rxtx_callback *cur_cb; /**< Callback instance */ - volatile bool umwait_in_progress; - /**< are we currently sleeping? */ uint64_t empty_poll_stats; /**< Number of empty polls */ } __rte_cache_aligned; @@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts __rte_unused, struct rte_power_monitor_cond pmc; uint16_t ret; - /* -* we might get a cancellation request while being -* inside the callback, in which case the wakeup -* wouldn't work because it would've arrived too early. -* -* to get around this, we notify the other thread that -* we're sleeping, so that it can spin until we're done. -* unsolicited wakeups are perfectly safe. 
-*/ - q_conf->umwait_in_progress = true; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); - - /* check if we need to cancel sleep */ - if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) { - /* use monitoring condition to sleep */ - ret = rte_eth_get_monitor_addr(port_id, qidx, - &pmc); - if (ret == 0) - rte_power_monitor(&pmc, UINT64_MAX); - } - q_conf->umwait_in_progress = false; - - rte_atomic_thread_fence(__ATOMIC_SEQ_CST); + /* use monitoring condition to sleep */ + ret = rte_eth_get_monitor_addr(port_id, qidx, + &pmc); + if (ret == 0) + rte_power_monitor(&pmc, UINT64_MAX); } } else q_conf->empty_poll_stats = 0; @@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx, return nb_rx; } +static int +queue_stopped(const uint16_t port_id, const uint16_t queue_id) +{ + struct rte_eth_rxq_info qinfo; + + if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) < 0) + return -1; + + return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED; +} + int rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint
[dpdk-dev] [PATCH v7 5/7] power: support callbacks for multiple Rx queues
Currently, there is a hard limitation on the PMD power management support that only allows it to support a single queue per lcore. This is not ideal as most DPDK use cases will poll multiple queues per core. The PMD power management mechanism relies on ethdev Rx callbacks, so it is very difficult to implement such support because callbacks are effectively stateless and have no visibility into what the other ethdev devices are doing. This places limitations on what we can do within the framework of Rx callbacks, but the basics of this implementation are as follows: - Replace per-queue structures with per-lcore ones, so that any device polled from the same lcore can share data - Any queue that is going to be polled from a specific lcore has to be added to the list of queues to poll, so that the callback is aware of other queues being polled by the same lcore - Both the empty poll counter and the actual power saving mechanism are shared between all queues polled on a particular lcore, and are only activated when all queues in the list were polled and were determined to have no traffic. - The limitation on UMWAIT-based polling is not removed because UMWAIT is incapable of monitoring more than one address. Also, while we're at it, update and improve the docs. 
Signed-off-by: Anatoly Burakov --- Notes: v7: - Fix bug where initial sleep target was always set to zero - Fix logic in handling of n_queues_ready_to_sleep counter - Update documentation on hardware requirements v6: - Track each individual queue sleep status (Konstantin) - Fix segfault (Dave) v5: - Remove the "power save queue" API and replace it with mechanism suggested by Konstantin v3: - Move the list of supported NICs to NIC feature table v2: - Use a TAILQ for queues instead of a static array - Address feedback from Konstantin - Add additional checks for stopped queues doc/guides/nics/features.rst | 10 + doc/guides/prog_guide/power_man.rst| 69 ++-- doc/guides/rel_notes/release_21_08.rst | 3 + lib/power/rte_power_pmd_mgmt.c | 456 +++-- 4 files changed, 402 insertions(+), 136 deletions(-) diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst index 403c2b03a3..a96e12d155 100644 --- a/doc/guides/nics/features.rst +++ b/doc/guides/nics/features.rst @@ -912,6 +912,16 @@ Supports to get Rx/Tx packet burst mode information. * **[implements] eth_dev_ops**: ``rx_burst_mode_get``, ``tx_burst_mode_get``. * **[related] API**: ``rte_eth_rx_burst_mode_get()``, ``rte_eth_tx_burst_mode_get()``. +.. _nic_features_get_monitor_addr: + +PMD power management using monitor addresses + + +Supports getting a monitoring condition to use together with Ethernet PMD power +management (see :doc:`../prog_guide/power_man` for more details). + +* **[implements] eth_dev_ops**: ``get_monitor_addr`` + .. _nic_features_other: Other dev ops not represented by a Feature diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index c70ae128ac..0e66878892 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -198,34 +198,45 @@ Ethernet PMD Power Management API Abstract -Existing power management mechanisms require developers -to change application design or change code to make use of it. 
-The PMD power management API provides a convenient alternative -by utilizing Ethernet PMD RX callbacks, -and triggering power saving whenever empty poll count reaches a certain number. - -Monitor - This power saving scheme will put the CPU into optimized power state - and use the ``rte_power_monitor()`` function - to monitor the Ethernet PMD RX descriptor address, - and wake the CPU up whenever there's new traffic. - -Pause - This power saving scheme will avoid busy polling - by either entering power-optimized sleep state - with ``rte_power_pause()`` function, - or, if it's not available, use ``rte_pause()``. - -Frequency scaling - This power saving scheme will use ``librte_power`` library - functionality to scale the core frequency up/down - depending on traffic volume. - -.. note:: - - Currently, this power management API is limited to mandatory mapping - of 1 queue to 1 core (multiple queues are supported, - but they must be polled from different cores). +Existing power management mechanisms require developers to change application +design or change code to make use of it. The PMD power management API provides a +convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering +power saving whenever empty poll count reaches a certain number. + +* Monitor + This power saving scheme will put the CPU into optimized power state and + monitor the Ethernet PMD RX descriptor address, waking the CPU up whenever + there's
[dpdk-dev] [PATCH v7 6/7] power: support monitoring multiple Rx queues
Use the new multi-monitor intrinsic to allow monitoring multiple ethdev Rx queues while entering the energy efficient power state. The multi version will be used unconditionally if supported, and the UMWAIT one will only be used when multi-monitor is not supported by the hardware. Signed-off-by: Anatoly Burakov --- Notes: v6: - Fix the missed feedback from v5 v4: - Fix possible out of bounds access - Added missing index increment doc/guides/prog_guide/power_man.rst | 15 -- lib/power/rte_power_pmd_mgmt.c | 82 - 2 files changed, 90 insertions(+), 7 deletions(-) diff --git a/doc/guides/prog_guide/power_man.rst b/doc/guides/prog_guide/power_man.rst index 0e66878892..e387d7811e 100644 --- a/doc/guides/prog_guide/power_man.rst +++ b/doc/guides/prog_guide/power_man.rst @@ -221,17 +221,22 @@ power saving whenever empty poll count reaches a certain number. The "monitor" mode is only supported in the following configurations and scenarios: * On Linux* x86_64, `rte_power_monitor()` requires WAITPKG instruction set being - supported by the CPU. Please refer to your platform documentation for further - information. + supported by the CPU, while `rte_power_monitor_multi()` requires WAITPKG and + RTM instruction sets being supported by the CPU. RTM instruction set may also + require booting the Linux with `tsx=on` command line parameter. Please refer + to your platform documentation for further information. * If ``rte_cpu_get_intrinsics_support()`` function indicates that + ``rte_power_monitor_multi()`` function is supported by the platform, then + monitoring multiple Ethernet Rx queues for traffic will be supported. + +* If ``rte_cpu_get_intrinsics_support()`` function indicates that only ``rte_power_monitor()`` is supported by the platform, then monitoring will be limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be monitored from a different lcore). 
-* If ``rte_cpu_get_intrinsics_support()`` function indicates that the - ``rte_power_monitor()`` function is not supported, then monitor mode will not - be supported. +* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the + two monitoring functions are supported, then monitor mode will not be supported. * Not all Ethernet drivers support monitoring, even if the underlying platform may support the necessary CPU instructions. Please refer to diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c index ceaf386d2b..ba5971f827 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -126,6 +126,32 @@ queue_list_take(struct pmd_core_cfg *cfg, const union queue *q) return found; } +static inline int +get_monitor_addresses(struct pmd_core_cfg *cfg, + struct rte_power_monitor_cond *pmc, size_t len) +{ + const struct queue_list_entry *qle; + size_t i = 0; + int ret; + + TAILQ_FOREACH(qle, &cfg->head, next) { + const union queue *q = &qle->queue; + struct rte_power_monitor_cond *cur; + + /* attempted out of bounds access */ + if (i >= len) { + RTE_LOG(ERR, POWER, "Too many queues being monitored\n"); + return -1; + } + + cur = &pmc[i++]; + ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur); + if (ret < 0) + return ret; + } + return 0; +} + static void calc_tsc(void) { @@ -211,6 +237,46 @@ lcore_can_sleep(struct pmd_core_cfg *cfg) return true; } +static uint16_t +clb_multiwait(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused, + struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx, + uint16_t max_pkts __rte_unused, void *arg) +{ + const unsigned int lcore = rte_lcore_id(); + struct queue_list_entry *queue_conf = arg; + struct pmd_core_cfg *lcore_conf; + const bool empty = nb_rx == 0; + + lcore_conf = &lcore_cfgs[lcore]; + + /* early exit */ + if (likely(!empty)) + /* early exit */ + queue_reset(lcore_conf, queue_conf); + else { + struct rte_power_monitor_cond pmc[lcore_conf->n_queues]; + int 
ret; + + /* can this queue sleep? */ + if (!queue_can_sleep(lcore_conf, queue_conf)) + return nb_rx; + + /* can this lcore sleep? */ + if (!lcore_can_sleep(lcore_conf)) + return nb_rx; + + /* gather all monitoring conditions */ + ret = get_monitor_addresses(lcore_conf, pmc, + lcore_conf->n_queues); + if (ret < 0) + return nb_rx; + +
[dpdk-dev] [PATCH v7 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes
Currently, l3fwd-power enforces the limitation of having one queue per lcore. This is no longer necessary, so remove the limitation. Signed-off-by: Anatoly Burakov --- examples/l3fwd-power/main.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c index f8dfed1634..52f56dc405 100644 --- a/examples/l3fwd-power/main.c +++ b/examples/l3fwd-power/main.c @@ -2723,12 +2723,6 @@ main(int argc, char **argv) printf("\nInitializing rx queues on lcore %u ... ", lcore_id ); fflush(stdout); - /* PMD power management mode can only do 1 queue per core */ - if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) { - rte_exit(EXIT_FAILURE, - "In PMD power management mode, only one queue per lcore is allowed\n"); - } - /* init RX queues */ for(queue = 0; queue < qconf->n_rx_queue; ++queue) { struct rte_eth_rxconf rxq_conf; -- 2.25.1
[PATCH v1 1/1] malloc/mp: fix wait condition handling
From Coverity's point of view, it is theoretically possible to have an infinite wait on a wait condition because while we do check for timeout, we do not check for whether the event we are waiting for has already occurred by the time we get to the first cond_wait call (in this case, the memory request list entry's state being set to COMPLETE). This can't really happen as the only time a wait condition is triggered is when we are receiving a memory event (so the entry we are waiting on cannot change before wait condition is triggered because it's protected by a mutex), so either we receive an event and modify entry state, or we exit wait on a timeout and do not care about request state. However, it's better to keep coverity happy. Coverity issue: 425709 Fixes: 07dcbfe0101f ("malloc: support multiprocess memory hotplug") Cc: sta...@dpdk.org Signed-off-by: Anatoly Burakov --- lib/eal/common/malloc_mp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/eal/common/malloc_mp.c b/lib/eal/common/malloc_mp.c index 2d39b0716f..9765277f5d 100644 --- a/lib/eal/common/malloc_mp.c +++ b/lib/eal/common/malloc_mp.c @@ -756,7 +756,8 @@ request_to_primary(struct malloc_mp_req *user_req) do { ret = pthread_cond_timedwait(&entry->cond, &mp_request_list.lock, &ts); - } while (ret != 0 && ret != ETIMEDOUT); + } while ((ret != 0 && ret != ETIMEDOUT) && + entry->state == REQ_STATE_ACTIVE); if (entry->state != REQ_STATE_COMPLETE) { EAL_LOG(ERR, "Request timed out"); -- 2.43.0
[PATCH v1 1/1] net/ice: fix E830 PTP phy model
Currently, we manually set PHY model in `ice_dev_init`, however we missed adding case for E830, so for E830 the initialization ends up calling E822 code instead. This results in incorrect phy model being set and having several downstream consequences for E830 as a result, ranging from a stray error message from attempting to start PHY timer, and up to inability to enable timesync on E830 devices. We could've fixed it by adding a case for E830, however there are several other missing bits of initialization (such as `phy_ports` field). All of this can be fixed by replacing manual setting of `phy_model` with a call to `ice_ptp_init_phy_model()`, which calls into base code and initializes the fields appropriately for all device types, including another option that is missing from current implementation - ETH56G. Fixes: c3bedb7114f2 ("net/ice/base: add E830 PTP initialization") Signed-off-by: Anatoly Burakov --- drivers/net/ice/ice_ethdev.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c index 29509b4329..304f959b7e 100644 --- a/drivers/net/ice/ice_ethdev.c +++ b/drivers/net/ice/ice_ethdev.c @@ -2496,10 +2496,8 @@ ice_dev_init(struct rte_eth_dev *dev) /* Initialize TM configuration */ ice_tm_conf_init(dev); - if (ice_is_e810(hw)) - hw->phy_model = ICE_PHY_E810; - else - hw->phy_model = ICE_PHY_E822; + /* Initialize PHY model */ + ice_ptp_init_phy_model(hw); if (hw->phy_model == ICE_PHY_E822) { ret = ice_start_phy_timer_e822(hw, hw->pf_id); -- 2.43.5
[RFC PATCH v1 0/1] Add Visual Studio Code configuration script
Lots of developers (myself included) use Visual Studio Code as their primary IDE for DPDK development. I have been successfully using various incarnations of this script internally to quickly set up my development trees whenever I need a new configuration, so this script is being shared in hopes that it will be useful both to new developers starting with DPDK, and to seasoned DPDK developers who are already using Visual Studio Code. It makes starting working on DPDK in Visual Studio Code so much easier! Philosophy behind this script is as follows: - The assumption is made that a developer will not be using wildly different configurations from build to build - usually, they build the same things, work with the same set of apps/drivers for a while, then switch to something else, at which point a new configuration is needed - Some configurations I consider to be "common" are included: debug build, debug optimized build, release build with docs, and ASan build (feel free to make suggestions here!) - By default, the script will suggest enabling test, testpmd, and helloworld example - No drivers are being enabled by default - the user needs to explicitly enable them (another option could be to leave things as default and build everything, but I rather prefer minimalistic builds as they're faster to compile, and it would be semantically weird to not have any drivers selected yet all of them being built) - All parameters that can be adjusted by TUI are also available as command line arguments, so while user interaction is the default (using whiptail), it's actually not required and can be bypassed. - I usually work as a local user not as root, so by default the script will attempt to use "gdbsudo" (a "sudo gdb $@" script in /usr/local/bin) for launch tasks, and stop if it is not available. Currently, it is only possible to define custom per-build configurations, while any "global" meson settings would have to involve editing settings.json file. 
This can be changed easily if required, but I've never needed this functionality. Please feel free to make any suggestions! Anatoly Burakov (1): devtools: add vscode configuration generator devtools/gen-vscode-config.py | 640 ++ 1 file changed, 640 insertions(+) create mode 100755 devtools/gen-vscode-config.py -- 2.43.5
[RFC PATCH v1 1/1] devtools: add vscode configuration generator
A lot of developers use Visual Studio Code as their primary IDE. This script generates a configuration file for VSCode that sets up basic build tasks, launch tasks, as well as C/C++ code analysis settings that will take into account compile_commands.json that is automatically generated by meson. Files generated by script: - .vscode/settings.json: stores variables needed by other files - .vscode/tasks.json: defines build tasks - .vscode/launch.json: defines launch tasks - .vscode/c_cpp_properties.json: defines code analysis settings The script uses a combination of globbing and meson file parsing to discover available apps, examples, and drivers, and generates a project-wide settings file, so that the user can later switch between debug/release/etc. configurations while keeping their desired apps, examples, and drivers, built by meson, and ensuring launch configurations still work correctly whatever the configuration selected. This script uses whiptail as TUI, which is expected to be universally available as it is shipped by default on most major distributions. However, the script is also designed to be scriptable and can be run without user interaction, and have its configuration supplied from command-line arguments. 
Signed-off-by: Anatoly Burakov --- devtools/gen-vscode-config.py | 640 ++ 1 file changed, 640 insertions(+) create mode 100755 devtools/gen-vscode-config.py diff --git a/devtools/gen-vscode-config.py b/devtools/gen-vscode-config.py new file mode 100755 index 00..0d291b6c17 --- /dev/null +++ b/devtools/gen-vscode-config.py @@ -0,0 +1,640 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2024 Intel Corporation +# + +"""Visual Studio Code configuration generator script.""" + +import os +import json +import argparse +import fnmatch +import shutil +from typing import List, Dict, Tuple, Any +from sys import exit as _exit, stderr +from subprocess import run, CalledProcessError, PIPE +from mesonbuild import mparser +from mesonbuild.mesonlib import MesonException + + +class DPDKBuildTask: +"""A build task for DPDK""" + +def __init__(self, label: str, description: str, param: str): +# label as it appears in build configuration +self.label = label +# description to be given in menu +self.description = description +# task-specific configuration parameters +self.param = param + +def to_json_dict(self) -> Dict[str, Any]: +"""Generate JSON dictionary for this task""" +return { +"label": f"Configure {self.label}", +"detail": self.description, +"type": "shell", +"dependsOn": "Remove builddir", +"command": f"meson setup ${{config:BUILDCONFIG}} {self.param} ${{config:BUILDDIR}}", +"problemMatcher": [], +"group": "build" +} + + +class CmdlineCtx: +"""POD class to set up command line parameters""" + +def __init__(self): +self.use_ui = False +self.use_gdbsudo = False +self.build_dir: str = "" +self.dpdk_dir: str = "" +self.gdb_path: str = "" + +self.avail_configs: List[Tuple[str, str, str]] = [] +self.avail_apps: List[str] = [] +self.avail_examples: List[str] = [] +self.avail_drivers: List[str] = [] + +self.enabled_configs: List[Tuple[str, str, str]] = [] +self.enabled_apps: List[str] = [] +self.enabled_examples: List[str] = [] +self.enabled_drivers: 
List[str] = [] + +self.driver_dep_map: Dict[str, List[str]] = {} + + +class DPDKLaunchTask: +"""A launch task for DPDK""" + +def __init__(self, label: str, exe: str, gdb_path: str): +# label as it appears in launch configuration +self.label = label +# path to executable +self.exe = exe +self.gdb_path = gdb_path + +def to_json_dict(self) -> Dict[str, Any]: +"""Generate JSON dictionary for this task""" +return { +"name": f"Run {self.label}", +"type": "cppdbg", +"request": "launch", +"program": f"${{config:BUILDDIR}}/{self.exe}", +"args": [], +"stopAtEntry": False, +"cwd": "${workspaceFolder}", +"externalConsole": False, +"preLaunchTask": "Build", +"MIMode": "gdb", +"miDebuggerPath": self.gdb_p
[RFC PATCH v2 0/1] Add Visual Studio Code configuration script
against this option. The third option is what I went with, with "smarter" being defined as follows: * User is allowed to use dialogs to edit configuration that is generated from parsing wildcard: if user changed something, we cannot keep wildcard any more and we assume user knows what they're doing and is OK with explicitly requesting compilation for drivers they selected. So, if user didn't change anything in the dialog, we keep the wildcard, otherwise we expand it. * If, by the time we get to resolving driver dependencies, we have wildcards in our driver param string, we see which drivers match this wildcard, and add wildcards for their dependencies. For example, if "net/ice" requires "common/iavf", and we have a "net/*" wildcard, one of the dependencies that we will add is "common/*". This behavior is, IMO, far better than the default one from our build system, where if a driver matches wildcard but cannot be built due to another internal dependency not being enabled (e.g. if "net/ice" is requested but "common/iavf" isn't requested), the build will fail to configure even though it would've been possible to build them otherwise. So, explicitly enabled drivers get explicit dependencies, implicitly enabled drivers get implicit dependencies. The resulting build will be bigger than when using meson command line directly, but if the user is worried about build size, they can customize it via common meson parameters as well as being more granular about requested apps/examples/drivers. Thus, we address the "simple" usecase of "let's build everything by default", we handle some common use cases smarter than we otherwise would have, and we allow user to be as in-depth as they want by allowing to specify explicit meson command strings. I feel like this is a good compromise between usability and robustness. Please feel free to make any suggestions! 
[1] https://code.visualstudio.com/docs/remote/ssh Anatoly Burakov (1): devtools: add vscode configuration generator devtools/gen-vscode-config.py | 871 ++ 1 file changed, 871 insertions(+) create mode 100755 devtools/gen-vscode-config.py -- 2.43.5
[RFC PATCH v2 1/1] devtools: add vscode configuration generator
A lot of developers use Visual Studio Code as their primary IDE. This script generates a configuration file for VSCode that sets up basic build tasks, launch tasks, as well as C/C++ code analysis settings that will take into account compile_commands.json that is automatically generated by meson. Files generated by script: - .vscode/settings.json: stores variables needed by other files - .vscode/tasks.json: defines build tasks - .vscode/launch.json: defines launch tasks - .vscode/c_cpp_properties.json: defines code analysis settings The script uses a combination of globbing and meson file parsing to discover available apps, examples, and drivers, and generates a project-wide settings file, so that the user can later switch between debug/release/etc. configurations while keeping their desired apps, examples, and drivers, built by meson, and ensuring launch configurations still work correctly whatever the configuration selected. This script uses whiptail as TUI, which is expected to be universally available as it is shipped by default on most major distributions. However, the script is also designed to be scriptable and can be run without user interaction, and have its configuration supplied from command-line arguments. 
Signed-off-by: Anatoly Burakov --- Notes: RFCv1 -> RFCv2: - No longer disable apps and drivers if nothing was specified via command line or TUI, and warn user about things being built by default - Generate app launch configuration by default for when no apps are selected - Added parameters: - --force to avoid overwriting existing config - --common-conf to specify global meson flags applicable to all configs - --gdbsudo/--no-gdbsudo to specify gdbsudo behavior - Autodetect gdbsudo/gdb from UID - Updated comments, error messages, fixed issues with user interaction - Improved handling of wildcards and driver dependencies - Fixed a few bugs in dependency detection due to incorrect parsing - [Stephen] flake8 is happy devtools/gen-vscode-config.py | 871 ++ 1 file changed, 871 insertions(+) create mode 100755 devtools/gen-vscode-config.py diff --git a/devtools/gen-vscode-config.py b/devtools/gen-vscode-config.py new file mode 100755 index 00..f0d6044c1b --- /dev/null +++ b/devtools/gen-vscode-config.py @@ -0,0 +1,871 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2024 Intel Corporation +# + +"""Visual Studio Code configuration generator script.""" + +import os +import json +import argparse +import fnmatch +import shutil +from typing import List, Dict, Tuple, Any +from sys import exit as _exit, stderr +from subprocess import run, CalledProcessError, PIPE +from mesonbuild import mparser +from mesonbuild.mesonlib import MesonException + + +class DPDKBuildTask: +"""A build task for DPDK""" + +def __init__(self, label: str, description: str, param: str): +# label as it appears in build configuration +self.label = label +# description to be given in menu +self.description = description +# task-specific configuration parameters +self.param = param + +def to_json_dict(self) -> Dict[str, Any]: +"""Generate JSON dictionary for this task""" +return { +"label": f"Configure {self.label}", +"detail": self.description, +"type": "shell", +"dependsOn": 
"Remove builddir", +# take configuration from settings.json using config: namespace +"command": f"meson setup ${{config:BUILDCONFIG}} " \ + f"{self.param} ${{config:BUILDDIR}}", +"problemMatcher": [], +"group": "build" +} + + +class DPDKLaunchTask: +"""A launch task for DPDK""" + +def __init__(self, label: str, exe: str, gdb_path: str): +# label as it appears in launch configuration +self.label = label +# path to executable +self.exe = exe +self.gdb_path = gdb_path + +def to_json_dict(self) -> Dict[str, Any]: +"""Generate JSON dictionary for this task""" +return { +"name": f"Run {self.label}", +"type": "cppdbg", +"request": "launch", +# take configuration from settings.json using config: namespace +"program": f"${{config:BUILDDIR}}/{self.exe}", +"args": [], +"stopAtEntry": False, +"cwd": "${workspaceFolder}", +"externalConsole&
[RFC PATCH v3 0/1] Add Visual Studio Code configuration script
Lots of developers (myself included) use Visual Studio Code as their primary IDE for DPDK development. I have been successfully using various incarnations of this script internally to quickly set up my development trees whenever I need a new configuration, so this script is being shared in hopes that it will be useful both to new developers starting with DPDK, and to seasoned DPDK developers who are already using Visual Studio Code. It makes getting started with DPDK in Visual Studio Code so much easier! ** NOTE: Currently, only x86 configuration is generated as I have no way to test the code analysis configuration on any other platforms. ** NOTE 2: this is not for *Visual Studio* the Windows IDE, this is for *Visual Studio Code* the cross-platform code editor. Specifically, the main target audience for this script is people who either run DPDK directly on their Linux machine, or who use Remote SSH functionality to connect to a remote Linux machine and set up VSCode build there. No other OSes are currently supported by the script. (if you're unaware of what Remote SSH is, I highly suggest checking it out [1]) The philosophy behind this script is as follows: - Any build directory created will automatically add itself to VSCode configuration (ignore mechanism for e.g. test-meson-build.sh is WIP) - Launch configuration is created using `which gdb`, so by default non-root users will have to do additional system configuration for things to work - All of the interactive stuff has now been taken out and is planned to be included in a separate set of scripts, so this script now concerns itself only with adding build/launch targets to user's configuration and not much else Please feel free to make any suggestions! 
[1] https://code.visualstudio.com/docs/remote/ssh Anatoly Burakov (1): buildtools: add vscode configuration generator app/meson.build | 12 +- buildtools/gen-vscode-conf.py | 442 ++ buildtools/meson.build| 5 + examples/meson.build | 13 +- meson.build | 11 + 5 files changed, 481 insertions(+), 2 deletions(-) create mode 100755 buildtools/gen-vscode-conf.py -- 2.43.5
[RFC PATCH v3 1/1] buildtools: add vscode configuration generator
A lot of developers use Visual Studio Code as their primary IDE. This script will be called from within the meson build process, and will generate configuration files for VSCode that set up basic build tasks, launch tasks, as well as C/C++ code analysis settings that will take into account compile_commands.json that is automatically generated by meson. Files generated by script: - .vscode/settings.json: stores variables needed by other files - .vscode/tasks.json: defines build tasks - .vscode/launch.json: defines launch tasks - .vscode/c_cpp_properties.json: defines code analysis settings Multiple, as well as out-of-source-tree, build directories are supported, and the script will generate separate configuration items for each build directory created by user, tagging them for convenience. Signed-off-by: Anatoly Burakov --- Notes: RFCv2 -> RFCv3: - Following feedback from Bruce, reworked to be minimal script run from meson - Moved to buildtools - Support for multiple build directories is now the default - All targets are automatically added to all configuration files RFCv1 -> RFCv2: - No longer disable apps and drivers if nothing was specified via command line or TUI, and warn user about things being built by default - Generate app launch configuration by default for when no apps are selected - Added parameters: - --force to avoid overwriting existing config - --common-conf to specify global meson flags applicable to all configs - --gdbsudo/--no-gdbsudo to specify gdbsudo behavior - Autodetect gdbsudo/gdb from UID - Updated comments, error messages, fixed issues with user interaction - Improved handling of wildcards and driver dependencies - Fixed a few bugs in dependency detection due to incorrect parsing - [Stephen] flake8 is happy app/meson.build | 12 +- buildtools/gen-vscode-conf.py | 442 ++ buildtools/meson.build| 5 + examples/meson.build | 13 +- meson.build | 11 + 5 files changed, 481 insertions(+), 2 deletions(-) create mode 100755 buildtools/gen-vscode-conf.py 
diff --git a/app/meson.build b/app/meson.build index 5b2c80c7a1..cf0eda3d5f 100644 --- a/app/meson.build +++ b/app/meson.build @@ -114,7 +114,17 @@ foreach app:apps link_libs = dpdk_static_libraries + dpdk_drivers endif -exec = executable('dpdk-' + name, +# add to Visual Studio Code launch configuration +exe_name = 'dpdk-' + name +launch_path = join_paths(meson.current_build_dir(), exe_name) +# we don't want to block the build if this command fails +result = run_command(vscode_conf_gen_cmd + ['--launch', launch_path], check: false) +if result.returncode() != 0 +warning('Failed to generate Visual Studio Code launch configuration for "' + name + '"') +message(result.stderr()) +endif + +exec = executable(exe_name, sources, c_args: cflags, link_args: ldflags, diff --git a/buildtools/gen-vscode-conf.py b/buildtools/gen-vscode-conf.py new file mode 100755 index 00..fcc6469065 --- /dev/null +++ b/buildtools/gen-vscode-conf.py @@ -0,0 +1,442 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2024 Intel Corporation +# + +"""Visual Studio Code configuration generator script.""" + +# This script is meant to be run by meson build system to generate build and +# launch commands for a specific build directory for Visual Studio Code IDE. +# +# Even though this script will generate settings/tasks/launch/code analysis +# configuration for VSCode, we can't actually just regenerate the files, +# because we want to support multiple build directories, as well as not +# destroy any configuration user has created between runs of this script. +# Therefore, we need some config file handling infrastructure. Luckily, VSCode +# configs are all JSON, so we can just use json module to handle them. Of +# course, we will lose any user comments in the files, but that's a small price +# to pay for this sort of automation. 
+# +# Since this script will be run by meson, we can forego any parsing or anything +# to do with the build system, and just rely on the fact that we get all of our +# configuration from command-line. + +import argparse +import ast +import json +import os +import shutil +from collections import OrderedDict +from sys import stderr, exit as _exit +from typing import List, Dict, Any + + +class ConfigCtx: +"""POD class to keep data associated with config.""" +def __init__(self, build_dir: str, source_dir: str, launch: List[str]): +self.build_dir = build_dir +self.source_dir = source_dir +self.config_dir = os.path.join(source_dir, '.vscode') +
[PATCH v1 1/2] usertools/cpu_layout: update coding style
Update coding style: - make it PEP-484 compliant - address all flake8, mypy etc. warnings - use f-strings in place of old-style string interpolation - refactor printing to make the code more readable Signed-off-by: Anatoly Burakov --- usertools/cpu_layout.py | 162 ++-- 1 file changed, 104 insertions(+), 58 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index 891b9238fa..843b29a134 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -3,62 +3,108 @@ # Copyright(c) 2010-2014 Intel Corporation # Copyright(c) 2017 Cavium, Inc. All rights reserved. -sockets = [] -cores = [] -core_map = {} -base_path = "/sys/devices/system/cpu" -fd = open("{}/kernel_max".format(base_path)) -max_cpus = int(fd.read()) -fd.close() -for cpu in range(max_cpus + 1): -try: -fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu)) -except IOError: -continue -core = int(fd.read()) -fd.close() -fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu)) -socket = int(fd.read()) -fd.close() -if core not in cores: -cores.append(core) -if socket not in sockets: -sockets.append(socket) -key = (socket, core) -if key not in core_map: -core_map[key] = [] -core_map[key].append(cpu) - -print(format("=" * (47 + len(base_path -print("Core and Socket Information (as reported by '{}')".format(base_path)) -print("{}\n".format("=" * (47 + len(base_path -print("cores = ", cores) -print("sockets = ", sockets) -print("") - -max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1)) -max_thread_count = len(list(core_map.values())[0]) -max_core_map_len = (max_processor_len * max_thread_count) \ - + len(", ") * (max_thread_count - 1) \ - + len('[]') + len('Socket ') -max_core_id_len = len(str(max(cores))) - -output = " ".ljust(max_core_id_len + len('Core ')) -for s in sockets: -output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket ')) -print(output) - -output = " ".ljust(max_core_id_len + len('Core ')) -for s in sockets: -output += " 
".ljust(max_core_map_len) -output += " " -print(output) - -for c in cores: -output = "Core %s" % str(c).ljust(max_core_id_len) -for s in sockets: -if (s, c) in core_map: -output += " " + str(core_map[(s, c)]).ljust(max_core_map_len) +from typing import List, Set, Dict, Tuple + + +def _range_expand(rstr: str) -> List[int]: +"""Expand a range string into a list of integers.""" +# 0,1-3 => [0, 1-3] +ranges = rstr.split(",") +valset: List[int] = [] +for r in ranges: +# 1-3 => [1, 2, 3] +if "-" in r: +start, end = r.split("-") +valset.extend(range(int(start), int(end) + 1)) else: -output += " " * (max_core_map_len + 1) -print(output) +valset.append(int(r)) +return valset + + +def _read_sysfs(path: str) -> str: +with open(path) as fd: +return fd.read().strip() + + +def _print_row(row: Tuple[str, ...], col_widths: List[int]) -> None: +first, *rest = row +w_first, *w_rest = col_widths +first_end = " " * 4 +rest_end = " " * 10 + +print(first.ljust(w_first), end=first_end) +for cell, width in zip(rest, w_rest): +print(cell.rjust(width), end=rest_end) +print() + + +def _print_section(heading: str) -> None: +sep = "=" * len(heading) +print(sep) +print(heading) +print(sep) +print() + + +def _main() -> None: +sockets_s: Set[int] = set() +cores_s: Set[int] = set() +core_map: Dict[Tuple[int, int], List[int]] = {} +base_path = "/sys/devices/system/cpu" + +cpus = _range_expand(_read_sysfs(f"{base_path}/online")) + +for cpu in cpus: +lcore_base = f"{base_path}/cpu{cpu}" +core = int(_read_sysfs(f"{lcore_base}/topology/core_id")) +socket = int(_read_sysfs(f"{lcore_base}/topology/physical_package_id")) + +cores_s.add(core) +sockets_s.add(socket) +key = (socket, core) +core_map.setdefault(key, []) +core_map[key].append(cpu) + +cores = sorted(cores_s) +sockets = sorted(sockets_s) + +_print_section("Core and Socket Information " + f"(as reported by '{base_path}')") + +print("cores = ", cores) +print("sockets = ", sockets) +print() + +# Core, [Socket, Socket, ...] +heading_strs =
[PATCH v1 2/2] usertools/cpu_layout: print out NUMA nodes
In traditional NUMA case, NUMA nodes and physical sockets were used interchangeably, but there are cases where there can be multiple NUMA nodes per socket, as well as all CPU's being assigned NUMA node 0 even in cases of multiple sockets. Use sysfs to print out NUMA information. Signed-off-by: Anatoly Burakov --- usertools/cpu_layout.py | 35 ++- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index 843b29a134..be89909464 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -4,6 +4,7 @@ # Copyright(c) 2017 Cavium, Inc. All rights reserved. from typing import List, Set, Dict, Tuple +import glob def _range_expand(rstr: str) -> List[int]: @@ -26,11 +27,19 @@ def _read_sysfs(path: str) -> str: return fd.read().strip() +def _read_numa_node(base: str) -> int: +node_glob = f"{base}/node*" +node_dirs = glob.glob(node_glob) +if not node_dirs: +return 0 # default to node 0 +return int(node_dirs[0].split("node")[1]) + + def _print_row(row: Tuple[str, ...], col_widths: List[int]) -> None: first, *rest = row w_first, *w_rest = col_widths first_end = " " * 4 -rest_end = " " * 10 +rest_end = " " * 4 print(first.ljust(w_first), end=first_end) for cell, width in zip(rest, w_rest): @@ -50,6 +59,7 @@ def _main() -> None: sockets_s: Set[int] = set() cores_s: Set[int] = set() core_map: Dict[Tuple[int, int], List[int]] = {} +numa_map: Dict[int, int] = {} base_path = "/sys/devices/system/cpu" cpus = _range_expand(_read_sysfs(f"{base_path}/online")) @@ -58,12 +68,14 @@ def _main() -> None: lcore_base = f"{base_path}/cpu{cpu}" core = int(_read_sysfs(f"{lcore_base}/topology/core_id")) socket = int(_read_sysfs(f"{lcore_base}/topology/physical_package_id")) +node = _read_numa_node(lcore_base) cores_s.add(core) sockets_s.add(socket) key = (socket, core) core_map.setdefault(key, []) core_map[key].append(cpu) +numa_map[cpu] = node cores = sorted(cores_s) sockets = sorted(sockets_s) @@ -73,24 +85,37 @@ def _main() -> 
None: print("cores = ", cores) print("sockets = ", sockets) +print("numa = ", sorted(set(numa_map.values( print() -# Core, [Socket, Socket, ...] -heading_strs = "", *[f"Socket {s}" for s in sockets] +# Core, [NUMA, Socket, NUMA, Socket, ...] +heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")] sep_strs = tuple("-" * len(hstr) for hstr in heading_strs) rows: List[Tuple[str, ...]] = [] +prev_numa = None for c in cores: # Core, row: Tuple[str, ...] = (f"Core {c}",) -# [lcores, lcores, ...] +# assume NUMA changes symmetrically +first_lcore = core_map[(0, c)][0] +cur_numa = numa_map[first_lcore] +numa_changed = prev_numa != cur_numa +prev_numa = cur_numa + +# [NUMA, lcores, NUMA, lcores, ...] for s in sockets: try: lcores = core_map[(s, c)] +numa = numa_map[lcores[0]] +if numa_changed: +row += (f"NUMA {numa}",) +else: +row += ("",) row += (f"{lcores}",) except KeyError: -row += ("",) +row += ("", "") rows += [row] # find max widths for each column, including header and rows -- 2.43.5
[PATCH v2 1/4] usertools/cpu_layout: update coding style
Update coding style: - make it PEP-484 compliant - address all flake8, mypy etc. warnings - use f-strings in place of old-style string interpolation - refactor printing to make the code more readable Signed-off-by: Anatoly Burakov --- usertools/cpu_layout.py | 162 ++-- 1 file changed, 104 insertions(+), 58 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index 891b9238fa..be86f06938 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -3,62 +3,108 @@ # Copyright(c) 2010-2014 Intel Corporation # Copyright(c) 2017 Cavium, Inc. All rights reserved. -sockets = [] -cores = [] -core_map = {} -base_path = "/sys/devices/system/cpu" -fd = open("{}/kernel_max".format(base_path)) -max_cpus = int(fd.read()) -fd.close() -for cpu in range(max_cpus + 1): -try: -fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu)) -except IOError: -continue -core = int(fd.read()) -fd.close() -fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu)) -socket = int(fd.read()) -fd.close() -if core not in cores: -cores.append(core) -if socket not in sockets: -sockets.append(socket) -key = (socket, core) -if key not in core_map: -core_map[key] = [] -core_map[key].append(cpu) - -print(format("=" * (47 + len(base_path -print("Core and Socket Information (as reported by '{}')".format(base_path)) -print("{}\n".format("=" * (47 + len(base_path -print("cores = ", cores) -print("sockets = ", sockets) -print("") - -max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1)) -max_thread_count = len(list(core_map.values())[0]) -max_core_map_len = (max_processor_len * max_thread_count) \ - + len(", ") * (max_thread_count - 1) \ - + len('[]') + len('Socket ') -max_core_id_len = len(str(max(cores))) - -output = " ".ljust(max_core_id_len + len('Core ')) -for s in sockets: -output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket ')) -print(output) - -output = " ".ljust(max_core_id_len + len('Core ')) -for s in sockets: -output += " 
".ljust(max_core_map_len) -output += " " -print(output) - -for c in cores: -output = "Core %s" % str(c).ljust(max_core_id_len) -for s in sockets: -if (s, c) in core_map: -output += " " + str(core_map[(s, c)]).ljust(max_core_map_len) +from typing import List, Set, Dict, Tuple + + +def _range_expand(rstr: str) -> List[int]: +"""Expand a range string into a list of integers.""" +# 0,1-3 => [0, 1-3] +ranges = rstr.split(",") +valset: List[int] = [] +for r in ranges: +# 1-3 => [1, 2, 3] +if "-" in r: +start, end = r.split("-") +valset.extend(range(int(start), int(end) + 1)) else: -output += " " * (max_core_map_len + 1) -print(output) +valset.append(int(r)) +return valset + + +def _read_sysfs(path: str) -> str: +with open(path, encoding="utf-8") as fd: +return fd.read().strip() + + +def _print_row(row: Tuple[str, ...], col_widths: List[int]) -> None: +first, *rest = row +w_first, *w_rest = col_widths +first_end = " " * 4 +rest_end = " " * 10 + +print(first.ljust(w_first), end=first_end) +for cell, width in zip(rest, w_rest): +print(cell.rjust(width), end=rest_end) +print() + + +def _print_section(heading: str) -> None: +sep = "=" * len(heading) +print(sep) +print(heading) +print(sep) +print() + + +def _main() -> None: +sockets_s: Set[int] = set() +cores_s: Set[int] = set() +core_map: Dict[Tuple[int, int], List[int]] = {} +base_path = "/sys/devices/system/cpu" + +cpus = _range_expand(_read_sysfs(f"{base_path}/online")) + +for cpu in cpus: +lcore_base = f"{base_path}/cpu{cpu}" +core = int(_read_sysfs(f"{lcore_base}/topology/core_id")) +socket = int(_read_sysfs(f"{lcore_base}/topology/physical_package_id")) + +cores_s.add(core) +sockets_s.add(socket) +key = (socket, core) +core_map.setdefault(key, []) +core_map[key].append(cpu) + +cores = sorted(cores_s) +sockets = sorted(sockets_s) + +_print_section("Core and Socket Information " + f"(as reported by '{base_path}')") + +print("cores = ", cores) +print("sockets = ", sockets) +print() + +# Core,
[PATCH v2 2/4] usertools/cpu_layout: print out NUMA nodes
In traditional NUMA case, NUMA nodes and physical sockets were used interchangeably, but there are cases where there can be multiple NUMA nodes per socket, as well as all CPU's being assigned NUMA node 0 even in cases of multiple sockets. Use sysfs to print out NUMA information. Signed-off-by: Anatoly Burakov --- usertools/cpu_layout.py | 35 ++- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index be86f06938..e43bdbf343 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -4,6 +4,7 @@ # Copyright(c) 2017 Cavium, Inc. All rights reserved. from typing import List, Set, Dict, Tuple +import glob def _range_expand(rstr: str) -> List[int]: @@ -26,11 +27,19 @@ def _read_sysfs(path: str) -> str: return fd.read().strip() +def _read_numa_node(base: str) -> int: +node_glob = f"{base}/node*" +node_dirs = glob.glob(node_glob) +if not node_dirs: +return 0 # default to node 0 +return int(node_dirs[0].split("node")[1]) + + def _print_row(row: Tuple[str, ...], col_widths: List[int]) -> None: first, *rest = row w_first, *w_rest = col_widths first_end = " " * 4 -rest_end = " " * 10 +rest_end = " " * 4 print(first.ljust(w_first), end=first_end) for cell, width in zip(rest, w_rest): @@ -50,6 +59,7 @@ def _main() -> None: sockets_s: Set[int] = set() cores_s: Set[int] = set() core_map: Dict[Tuple[int, int], List[int]] = {} +numa_map: Dict[int, int] = {} base_path = "/sys/devices/system/cpu" cpus = _range_expand(_read_sysfs(f"{base_path}/online")) @@ -58,12 +68,14 @@ def _main() -> None: lcore_base = f"{base_path}/cpu{cpu}" core = int(_read_sysfs(f"{lcore_base}/topology/core_id")) socket = int(_read_sysfs(f"{lcore_base}/topology/physical_package_id")) +node = _read_numa_node(lcore_base) cores_s.add(core) sockets_s.add(socket) key = (socket, core) core_map.setdefault(key, []) core_map[key].append(cpu) +numa_map[cpu] = node cores = sorted(cores_s) sockets = sorted(sockets_s) @@ -73,24 +85,37 @@ def _main() -> 
None: print("cores = ", cores) print("sockets = ", sockets) +print("numa = ", sorted(set(numa_map.values( print() -# Core, [Socket, Socket, ...] -heading_strs = "", *[f"Socket {s}" for s in sockets] +# Core, [NUMA, Socket, NUMA, Socket, ...] +heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")] sep_strs = tuple("-" * len(hstr) for hstr in heading_strs) rows: List[Tuple[str, ...]] = [] +prev_numa = None for c in cores: # Core, row: Tuple[str, ...] = (f"Core {c}",) -# [lcores, lcores, ...] +# assume NUMA changes symmetrically +first_lcore = core_map[(0, c)][0] +cur_numa = numa_map[first_lcore] +numa_changed = prev_numa != cur_numa +prev_numa = cur_numa + +# [NUMA, lcores, NUMA, lcores, ...] for s in sockets: try: lcores = core_map[(s, c)] +numa = numa_map[lcores[0]] +if numa_changed: +row += (f"NUMA {numa}",) +else: +row += ("",) row += (str(lcores),) except KeyError: -row += ("",) +row += ("", "") rows += [row] # find max widths for each column, including header and rows -- 2.43.5
[PATCH v2 3/4] usertools/dpdk-hugepages.py: sort by NUMA node
Currently, the list of per-NUMA node hugepages is displayed in glob order, which can be arbitrary. Fix it to sort the glob order. Signed-off-by: Anatoly Burakov --- usertools/dpdk-hugepages.py | 40 ++--- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/usertools/dpdk-hugepages.py b/usertools/dpdk-hugepages.py index bf2575ba36..54232ddf22 100755 --- a/usertools/dpdk-hugepages.py +++ b/usertools/dpdk-hugepages.py @@ -74,21 +74,37 @@ def set_hugepages(path, reqpages): gotpages, reqpages, filename)) +def get_numa_pages_node(node): +'''Read list of hugepage reservations on specific NUMA node''' +hp_path = f'/sys/devices/system/node/node{node}/hugepages' +if not os.path.exists(hp_path): +return +res = [] +for pg_sz_dir in os.listdir(hp_path): +pg_sz = int(pg_sz_dir[10:-2]) +nr_pages = get_hugepages(f'{hp_path}/{pg_sz_dir}') +if nr_pages > 0: +pg_sz_str = fmt_memsize(pg_sz) +total_sz_str = fmt_memsize(nr_pages * pg_sz) +res += [(nr_pages, pg_sz_str, total_sz_str)] +else: +res += [(0, None, None)] +return res + + def show_numa_pages(): '''Show huge page reservations on Numa system''' +# get list of NUMA nodes and sort them by integer order print('Node Pages Size Total') -for numa_path in glob.glob('/sys/devices/system/node/node*'): -node = numa_path[29:] # slice after /sys/devices/system/node/node -path = numa_path + '/hugepages' -if not os.path.exists(path): -continue -for hdir in os.listdir(path): -pages = get_hugepages(path + '/' + hdir) -if pages > 0: -kb = int(hdir[10:-2]) # slice out of hugepages-NNNkB -print('{:<4} {:<5} {:<6} {}'.format(node, pages, -fmt_memsize(kb), -fmt_memsize(pages * kb))) +nodes = sorted(int(node[29:]) + for node in glob.glob('/sys/devices/system/node/node*')) +for node in nodes: +pg_sz_data = get_numa_pages_node(node) +for nr_pages, pg_sz, total_sz in pg_sz_data: +if not nr_pages: +continue +print('{:<4} {:<5} {:<6} {}' + .format(node, nr_pages, pg_sz, total_sz)) def show_non_numa_pages(): -- 2.43.5
[PATCH v2 4/4] usertools/dpdk-devbind: print NUMA node
Currently, devbind does not print out any NUMA information, which makes figuring out which NUMA node device belongs to not trivial. Add printouts for NUMA information if NUMA support is enabled on the system. Signed-off-by: Anatoly Burakov --- usertools/dpdk-devbind.py | 27 +++ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py index b276e8efc8..c0611a501d 100755 --- a/usertools/dpdk-devbind.py +++ b/usertools/dpdk-devbind.py @@ -110,6 +110,11 @@ args = [] +# check if this system has NUMA support +def is_numa(): +return os.path.exists('/sys/devices/system/node') + + # check if a specific kernel module is loaded def module_is_loaded(module): global loaded_modules @@ -579,18 +584,24 @@ def show_device_status(devices_type, device_name, if_field=False): # print each category separately, so we can clearly see what's used by DPDK if dpdk_drv: +extra_param = "drv=%(Driver_str)s unused=%(Module_str)s" +if is_numa(): +extra_param = "numa_node=%(NUMANode)s " + extra_param display_devices("%s devices using DPDK-compatible driver" % device_name, -dpdk_drv, "drv=%(Driver_str)s unused=%(Module_str)s") +dpdk_drv, extra_param) if kernel_drv: -if_text = "" +extra_param = "drv=%(Driver_str)s unused=%(Module_str)s" if if_field: -if_text = "if=%(Interface)s " -display_devices("%s devices using kernel driver" % device_name, kernel_drv, -if_text + "drv=%(Driver_str)s " -"unused=%(Module_str)s %(Active)s") +extra_param = "if=%(Interface)s " + extra_param +if is_numa(): +extra_param = "numa_node=%(NUMANode)s " + extra_param +display_devices("%s devices using kernel driver" % device_name, +kernel_drv, extra_param) if no_drv: -display_devices("Other %s devices" % device_name, no_drv, -"unused=%(Module_str)s") +extra_param = "unused=%(Module_str)s" +if is_numa(): +extra_param = "numa_node=%(NUMANode)s " + extra_param +display_devices("Other %s devices" % device_name, no_drv, extra_param) def show_status(): -- 2.43.5
[PATCH v3 1/4] usertools/cpu_layout: update coding style
Update coding style: - make it PEP-484 compliant - address all flake8, mypy etc. warnings - use f-strings in place of old-style string interpolation - refactor printing to make the code more readable - read valid CPU ID's from "online" sysfs node Signed-off-by: Anatoly Burakov --- Notes: v1,v2 -> v3: - Import typing as T instead of individual types usertools/cpu_layout.py | 162 ++-- 1 file changed, 107 insertions(+), 55 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index 891b9238fa..1c255ff1a1 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -3,62 +3,114 @@ # Copyright(c) 2010-2014 Intel Corporation # Copyright(c) 2017 Cavium, Inc. All rights reserved. -sockets = [] -cores = [] -core_map = {} -base_path = "/sys/devices/system/cpu" -fd = open("{}/kernel_max".format(base_path)) -max_cpus = int(fd.read()) -fd.close() -for cpu in range(max_cpus + 1): -try: -fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu)) -except IOError: -continue -core = int(fd.read()) -fd.close() -fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu)) -socket = int(fd.read()) -fd.close() -if core not in cores: -cores.append(core) -if socket not in sockets: -sockets.append(socket) -key = (socket, core) -if key not in core_map: -core_map[key] = [] -core_map[key].append(cpu) +"""Display CPU topology information.""" -print(format("=" * (47 + len(base_path -print("Core and Socket Information (as reported by '{}')".format(base_path)) -print("{}\n".format("=" * (47 + len(base_path -print("cores = ", cores) -print("sockets = ", sockets) -print("") +import typing as T -max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1)) -max_thread_count = len(list(core_map.values())[0]) -max_core_map_len = (max_processor_len * max_thread_count) \ - + len(", ") * (max_thread_count - 1) \ - + len('[]') + len('Socket ') -max_core_id_len = len(str(max(cores))) -output = " ".ljust(max_core_id_len + len('Core ')) -for s in sockets: 
-output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket ')) -print(output) - -output = " ".ljust(max_core_id_len + len('Core ')) -for s in sockets: -output += " ".ljust(max_core_map_len) -output += " " -print(output) - -for c in cores: -output = "Core %s" % str(c).ljust(max_core_id_len) -for s in sockets: -if (s, c) in core_map: -output += " " + str(core_map[(s, c)]).ljust(max_core_map_len) +def range_expand(rstr: str) -> T.List[int]: +"""Expand a range string into a list of integers.""" +# 0,1-3 => [0, 1-3] +ranges = rstr.split(",") +valset: T.List[int] = [] +for r in ranges: +# 1-3 => [1, 2, 3] +if "-" in r: +start, end = r.split("-") +valset.extend(range(int(start), int(end) + 1)) else: -output += " " * (max_core_map_len + 1) -print(output) +valset.append(int(r)) +return valset + + +def read_sysfs(path: str) -> str: +"""Read a sysfs file and return its contents.""" +with open(path, encoding="utf-8") as fd: +return fd.read().strip() + + +def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None: +"""Print a row of a table with the given column widths.""" +first, *rest = row +w_first, *w_rest = col_widths +first_end = " " * 4 +rest_end = " " * 10 + +print(first.ljust(w_first), end=first_end) +for cell, width in zip(rest, w_rest): +print(cell.rjust(width), end=rest_end) +print() + + +def print_section(heading: str) -> None: +"""Print a section heading.""" +sep = "=" * len(heading) +print(sep) +print(heading) +print(sep) +print() + + +def main() -> None: +"""Print CPU topology information.""" +sockets_s: T.Set[int] = set() +cores_s: T.Set[int] = set() +core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {} +base_path = "/sys/devices/system/cpu" + +cpus = range_expand(read_sysfs(f"{base_path}/online")) + +for cpu in cpus: +lcore_base = f"{base_path}/cpu{cpu}" +core = int(read_sysfs(f"{lcore_base}/topology/core_id")) +socket = int(read_sysfs(f"{lcore_base}/topolog
[PATCH v3 2/4] usertools/cpu_layout: print out NUMA nodes
In traditional NUMA case, NUMA nodes and physical sockets were used interchangeably, but there are cases where there can be multiple NUMA nodes per socket, as well as all CPU's being assigned NUMA node 0 even in cases of multiple sockets. Use sysfs to print out NUMA information. Signed-off-by: Anatoly Burakov --- Notes: v2 -> v3: - Sort imports alphabetically usertools/cpu_layout.py | 36 +++- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index 1c255ff1a1..78b119d729 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -5,6 +5,7 @@ """Display CPU topology information.""" +import glob import typing as T @@ -29,12 +30,21 @@ def read_sysfs(path: str) -> str: return fd.read().strip() +def read_numa_node(base: str) -> int: +"""Read the NUMA node of a CPU.""" +node_glob = f"{base}/node*" +node_dirs = glob.glob(node_glob) +if not node_dirs: +return 0 # default to node 0 +return int(node_dirs[0].split("node")[1]) + + def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None: """Print a row of a table with the given column widths.""" first, *rest = row w_first, *w_rest = col_widths first_end = " " * 4 -rest_end = " " * 10 +rest_end = " " * 4 print(first.ljust(w_first), end=first_end) for cell, width in zip(rest, w_rest): @@ -56,6 +66,7 @@ def main() -> None: sockets_s: T.Set[int] = set() cores_s: T.Set[int] = set() core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {} +numa_map: T.Dict[int, int] = {} base_path = "/sys/devices/system/cpu" cpus = range_expand(read_sysfs(f"{base_path}/online")) @@ -64,12 +75,14 @@ def main() -> None: lcore_base = f"{base_path}/cpu{cpu}" core = int(read_sysfs(f"{lcore_base}/topology/core_id")) socket = int(read_sysfs(f"{lcore_base}/topology/physical_package_id")) +node = read_numa_node(lcore_base) cores_s.add(core) sockets_s.add(socket) key = (socket, core) core_map.setdefault(key, []) core_map[key].append(cpu) +numa_map[cpu] = node cores = 
sorted(cores_s) sockets = sorted(sockets_s) @@ -79,24 +92,37 @@ def main() -> None: print("cores = ", cores) print("sockets = ", sockets) +print("numa = ", sorted(set(numa_map.values( print() -# Core, [Socket, Socket, ...] -heading_strs = "", *[f"Socket {s}" for s in sockets] +# Core, [NUMA, Socket, NUMA, Socket, ...] +heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")] sep_strs = tuple("-" * len(hstr) for hstr in heading_strs) rows: T.List[T.Tuple[str, ...]] = [] +prev_numa = None for c in cores: # Core, row: T.Tuple[str, ...] = (f"Core {c}",) -# [lcores, lcores, ...] +# assume NUMA changes symmetrically +first_lcore = core_map[(0, c)][0] +cur_numa = numa_map[first_lcore] +numa_changed = prev_numa != cur_numa +prev_numa = cur_numa + +# [NUMA, lcores, NUMA, lcores, ...] for s in sockets: try: lcores = core_map[(s, c)] +numa = numa_map[lcores[0]] +if numa_changed: +row += (f"NUMA {numa}",) +else: +row += ("",) row += (str(lcores),) except KeyError: -row += ("",) +row += ("", "") rows += [row] # find max widths for each column, including header and rows -- 2.43.5
[PATCH v3 3/4] usertools/dpdk-hugepages.py: update coding style
Update coding style: - Make the code PEP-484 compliant - Add more comments, improve readability, use f-strings everywhere - Use quotes consistently - Address all Python static analysis (e.g. mypy, pylint) warnings - Improve error handling - Refactor printing and sysfs/procfs access functions - Sort output by NUMA node Signed-off-by: Anatoly Burakov --- Notes: v1 -> v2: - Added commit that sorted output by NUMA node v2 -> v3: - Rewrite of the script as suggested by reviewers usertools/dpdk-hugepages.py | 456 +--- 1 file changed, 273 insertions(+), 183 deletions(-) diff --git a/usertools/dpdk-hugepages.py b/usertools/dpdk-hugepages.py index bf2575ba36..510822af60 100755 --- a/usertools/dpdk-hugepages.py +++ b/usertools/dpdk-hugepages.py @@ -1,167 +1,136 @@ #! /usr/bin/env python3 # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2020 Microsoft Corporation -"""Script to query and setup huge pages for DPDK applications.""" + +'''Script to query and setup huge pages for DPDK applications.''' import argparse -import glob import os import re +import subprocess import sys +import typing as T from math import log2 # Standard binary prefix -BINARY_PREFIX = "KMG" +BINARY_PREFIX = 'KMG' # systemd mount point for huge pages -HUGE_MOUNT = "/dev/hugepages" +HUGE_MOUNT = '/dev/hugepages' +# default directory for non-NUMA huge pages +NO_NUMA_HUGE_DIR = '/sys/kernel/mm/hugepages' +# default base directory for NUMA nodes +NUMA_NODE_BASE_DIR = '/sys/devices/system/node' +# procfs paths +MEMINFO_PATH = '/proc/meminfo' +MOUNTS_PATH = '/proc/mounts' -def fmt_memsize(kb): +class HugepageMount: +'''Mount operations for huge page filesystem.''' + +def __init__(self, path: str, mounted: bool): +self.path = path +# current mount status +self.mounted = mounted + +def mount(self, pagesize_kb: int, + user: T.Optional[str], group: T.Optional[str]) -> None: +'''Mount the huge TLB file system''' +if self.mounted: +return +cmd = ['mount', '-t', 'hugetlbfs'] +cmd += ['-o', 
f'pagesize={pagesize_kb * 1024}'] +if user is not None: +cmd += ['-o', f'uid={user}'] +if group is not None: +cmd += ['-o', f'gid={group}'] +cmd += ['nodev', self.path] + +subprocess.run(cmd, check=True) +self.mounted = True + +def unmount(self) -> None: +'''Unmount the huge TLB file system (if mounted)''' +if self.mounted: +subprocess.run(['umount', self.path], check=True) +self.mounted = False + + +class HugepageRes: +'''Huge page reserve operations. Can be NUMA-node-specific.''' + +def __init__(self, path: str, node: T.Optional[int] = None): +self.path = path +# if this is a per-NUMA node huge page dir, store the node number +self.node = node +self.valid_page_sizes = self._get_valid_page_sizes() + +def _get_valid_page_sizes(self) -> T.List[int]: +'''Extract valid huge page sizes''' +return [get_memsize(d.split('-')[1]) +for d in os.listdir(self.path)] + +def _nr_pages_path(self, sz: int) -> str: +if sz not in self.valid_page_sizes: +raise ValueError(f'Invalid page size {sz}. ' + f'Valid sizes: {self.valid_page_sizes}') +return os.path.join(self.path, f'hugepages-{sz}kB', 'nr_hugepages') + +def __getitem__(self, sz: int) -> int: +'''Get current number of reserved pages of specified size''' +with open(self._nr_pages_path(sz), encoding='utf-8') as f: +return int(f.read()) + +def __setitem__(self, sz: int, nr_pages: int) -> None: +'''Set number of reserved pages of specified size''' +with open(self._nr_pages_path(sz), 'w', encoding='utf-8') as f: +f.write(f'{nr_pages}\n') + + +def fmt_memsize(kb: int) -> str: '''Format memory size in kB into conventional format''' logk = int(log2(kb) / 10) suffix = BINARY_PREFIX[logk] unit = 2**(logk * 10) -return '{}{}b'.format(int(kb / unit), suffix) +return f'{int(kb / unit)}{suffix}b' -def get_memsize(arg): +def get_memsize(arg: str) -> int: '''Convert memory size with suffix to kB'
[PATCH v3 4/4] usertools/dpdk-devbind: print NUMA node
Currently, devbind does not print out any NUMA information, which makes it non-trivial to figure out which NUMA node a device belongs to. Add printouts for NUMA information if NUMA support is enabled on the system. Signed-off-by: Anatoly Burakov Acked-by: Robin Jarry --- Notes: v1 -> v2: - Added commit to print out NUMA information in devbind usertools/dpdk-devbind.py | 29 + 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py index b276e8efc8..078e8c387b 100755 --- a/usertools/dpdk-devbind.py +++ b/usertools/dpdk-devbind.py @@ -110,6 +110,11 @@ args = [] +# check if this system has NUMA support +def is_numa(): +return os.path.exists('/sys/devices/system/node') + + # check if a specific kernel module is loaded def module_is_loaded(module): global loaded_modules @@ -577,20 +582,28 @@ def show_device_status(devices_type, device_name, if_field=False): print("".join('=' * len(msg))) return +print_numa = is_numa() + # print each category separately, so we can clearly see what's used by DPDK if dpdk_drv: +extra_param = "drv=%(Driver_str)s unused=%(Module_str)s" +if print_numa: +extra_param = "numa_node=%(NUMANode)s " + extra_param display_devices("%s devices using DPDK-compatible driver" % device_name, -dpdk_drv, "drv=%(Driver_str)s unused=%(Module_str)s") +dpdk_drv, extra_param) if kernel_drv: -if_text = "" +extra_param = "drv=%(Driver_str)s unused=%(Module_str)s" if if_field: -if_text = "if=%(Interface)s " -display_devices("%s devices using kernel driver" % device_name, kernel_drv, -if_text + "drv=%(Driver_str)s " -"unused=%(Module_str)s %(Active)s") +extra_param = "if=%(Interface)s " + extra_param +if print_numa: +extra_param = "numa_node=%(NUMANode)s " + extra_param +display_devices("%s devices using kernel driver" % device_name, +kernel_drv, extra_param) if no_drv: -display_devices("Other %s devices" % device_name, no_drv, -"unused=%(Module_str)s") +extra_param = "unused=%(Module_str)s" +if print_numa: 
+extra_param = "numa_node=%(NUMANode)s " + extra_param +display_devices("Other %s devices" % device_name, no_drv, extra_param) def show_status(): -- 2.43.5
[PATCH v4 2/4] usertools/cpu_layout: print out NUMA nodes
In the traditional NUMA case, NUMA nodes and physical sockets were used interchangeably, but there are cases where there can be multiple NUMA nodes per socket, as well as all CPUs being assigned NUMA node 0 even in cases of multiple sockets. Use sysfs to print out NUMA information. Signed-off-by: Anatoly Burakov --- Notes: v2 -> v3: - Sort imports alphabetically usertools/cpu_layout.py | 36 +++- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index 8812ea286b..e4720e27db 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -5,6 +5,7 @@ """Display CPU topology information.""" +import glob import typing as T @@ -29,12 +30,21 @@ def read_sysfs(path: str) -> str: return fd.read().strip() +def read_numa_node(base: str) -> int: +"""Read the NUMA node of a CPU.""" +node_glob = f"{base}/node*" +node_dirs = glob.glob(node_glob) +if not node_dirs: +return 0 # default to node 0 +return int(node_dirs[0].split("node")[1]) + + def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None: """Print a row of a table with the given column widths.""" first, *rest = row w_first, *w_rest = col_widths first_end = " " * 4 -rest_end = " " * 10 +rest_end = " " * 4 print(first.ljust(w_first), end=first_end) for cell, width in zip(rest, w_rest): @@ -56,6 +66,7 @@ def main() -> None: sockets_s: T.Set[int] = set() cores_s: T.Set[int] = set() core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {} +numa_map: T.Dict[int, int] = {} base_path = "/sys/devices/system/cpu" cpus = range_expand(read_sysfs(f"{base_path}/online")) @@ -64,12 +75,14 @@ def main() -> None: lcore_base = f"{base_path}/cpu{cpu}" core = int(read_sysfs(f"{lcore_base}/topology/core_id")) socket = int(read_sysfs(f"{lcore_base}/topology/physical_package_id")) +node = read_numa_node(lcore_base) cores_s.add(core) sockets_s.add(socket) key = (socket, core) core_map.setdefault(key, []) core_map[key].append(cpu) +numa_map[cpu] = node cores = 
sorted(cores_s) sockets = sorted(sockets_s) @@ -80,24 +93,37 @@ def main() -> None: print("cores = ", cores) print("sockets = ", sockets) +print("numa = ", sorted(set(numa_map.values()))) print() -# Core, [Socket, Socket, ...] -heading_strs = "", *[f"Socket {s}" for s in sockets] +# Core, [NUMA, Socket, NUMA, Socket, ...] +heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")] sep_strs = tuple("-" * len(hstr) for hstr in heading_strs) rows: T.List[T.Tuple[str, ...]] = [] +prev_numa = None for c in cores: # Core, row: T.Tuple[str, ...] = (f"Core {c}",) -# [lcores, lcores, ...] +# assume NUMA changes symmetrically +first_lcore = core_map[(0, c)][0] +cur_numa = numa_map[first_lcore] +numa_changed = prev_numa != cur_numa +prev_numa = cur_numa + +# [NUMA, lcores, NUMA, lcores, ...] for s in sockets: try: lcores = core_map[(s, c)] +numa = numa_map[lcores[0]] +if numa_changed: +row += (f"NUMA {numa}",) +else: +row += ("",) row += (str(lcores),) except KeyError: -row += ("",) +row += ("", "") rows += [row] # find max widths for each column, including header and rows -- 2.43.5
[PATCH v4 4/4] usertools/dpdk-devbind: print NUMA node
Currently, devbind does not print out any NUMA information, which makes it non-trivial to figure out which NUMA node a device belongs to. Add printouts for NUMA information if NUMA support is enabled on the system. Signed-off-by: Anatoly Burakov Acked-by: Robin Jarry --- Notes: v1 -> v2: - Added commit to print out NUMA information in devbind usertools/dpdk-devbind.py | 29 + 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py index b276e8efc8..078e8c387b 100755 --- a/usertools/dpdk-devbind.py +++ b/usertools/dpdk-devbind.py @@ -110,6 +110,11 @@ args = [] +# check if this system has NUMA support +def is_numa(): +return os.path.exists('/sys/devices/system/node') + + # check if a specific kernel module is loaded def module_is_loaded(module): global loaded_modules @@ -577,20 +582,28 @@ def show_device_status(devices_type, device_name, if_field=False): print("".join('=' * len(msg))) return +print_numa = is_numa() + # print each category separately, so we can clearly see what's used by DPDK if dpdk_drv: +extra_param = "drv=%(Driver_str)s unused=%(Module_str)s" +if print_numa: +extra_param = "numa_node=%(NUMANode)s " + extra_param display_devices("%s devices using DPDK-compatible driver" % device_name, -dpdk_drv, "drv=%(Driver_str)s unused=%(Module_str)s") +dpdk_drv, extra_param) if kernel_drv: -if_text = "" +extra_param = "drv=%(Driver_str)s unused=%(Module_str)s" if if_field: -if_text = "if=%(Interface)s " -display_devices("%s devices using kernel driver" % device_name, kernel_drv, -if_text + "drv=%(Driver_str)s " -"unused=%(Module_str)s %(Active)s") +extra_param = "if=%(Interface)s " + extra_param +if print_numa: +extra_param = "numa_node=%(NUMANode)s " + extra_param +display_devices("%s devices using kernel driver" % device_name, +kernel_drv, extra_param) if no_drv: -display_devices("Other %s devices" % device_name, no_drv, -"unused=%(Module_str)s") +extra_param = "unused=%(Module_str)s" +if print_numa: 
+extra_param = "numa_node=%(NUMANode)s " + extra_param +display_devices("Other %s devices" % device_name, no_drv, extra_param) def show_status(): -- 2.43.5
[PATCH v4 3/4] usertools/dpdk-hugepages.py: update coding style
Update coding style: - Make the code PEP-484 compliant - Add more comments, improve readability, use f-strings everywhere - Address all Python static analysis (e.g. mypy, pylint) warnings - Format code with Ruff - Improve error handling - Refactor printing and sysfs/procfs access functions - Sort output by NUMA node Signed-off-by: Anatoly Burakov Acked-by: Stephen Hemminger --- Notes: v3 -> v4: - Format code with Ruff, line width 79 to avoid flake8 warnings (Flake8 is by default configured with line width 79 on my system) v2 -> v3: - Rewrite of the script as suggested by reviewers v1 -> v2: - Added commit that sorted output by NUMA node usertools/dpdk-hugepages.py | 524 ++-- 1 file changed, 315 insertions(+), 209 deletions(-) diff --git a/usertools/dpdk-hugepages.py b/usertools/dpdk-hugepages.py index bf2575ba36..4c99682848 100755 --- a/usertools/dpdk-hugepages.py +++ b/usertools/dpdk-hugepages.py @@ -1,13 +1,15 @@ #! /usr/bin/env python3 # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2020 Microsoft Corporation + """Script to query and setup huge pages for DPDK applications.""" import argparse -import glob import os import re +import subprocess import sys +import typing as T from math import log2 # Standard binary prefix @@ -15,194 +17,268 @@ # systemd mount point for huge pages HUGE_MOUNT = "/dev/hugepages" +# default directory for non-NUMA huge pages +NO_NUMA_HUGE_DIR = "/sys/kernel/mm/hugepages" +# default base directory for NUMA nodes +NUMA_NODE_BASE_DIR = "/sys/devices/system/node" +# procfs paths +MEMINFO_PATH = "/proc/meminfo" +MOUNTS_PATH = "/proc/mounts" -def fmt_memsize(kb): -'''Format memory size in kB into conventional format''' +class HugepageMount: +"""Mount operations for huge page filesystem.""" + +def __init__(self, path: str, mounted: bool): +self.path = path +# current mount status +self.mounted = mounted + +def mount( +self, pagesize_kb: int, user: T.Optional[str], group: T.Optional[str] +) -> None: +"""Mount the huge TLB file system""" 
+if self.mounted: +return +cmd = ["mount", "-t", "hugetlbfs"] +cmd += ["-o", f"pagesize={pagesize_kb * 1024}"] +if user is not None: +cmd += ["-o", f"uid={user}"] +if group is not None: +cmd += ["-o", f"gid={group}"] +cmd += ["nodev", self.path] + +subprocess.run(cmd, check=True) +self.mounted = True + +def unmount(self) -> None: +"""Unmount the huge TLB file system (if mounted)""" +if self.mounted: +subprocess.run(["umount", self.path], check=True) +self.mounted = False + + +class HugepageRes: +"""Huge page reserve operations. Can be NUMA-node-specific.""" + +def __init__(self, path: str, node: T.Optional[int] = None): +self.path = path +# if this is a per-NUMA node huge page dir, store the node number +self.node = node +self.valid_page_sizes = self._get_valid_page_sizes() + +def _get_valid_page_sizes(self) -> T.List[int]: +"""Extract valid huge page sizes""" +return [get_memsize(d.split("-")[1]) for d in os.listdir(self.path)] + +def _nr_pages_path(self, sz: int) -> str: +if sz not in self.valid_page_sizes: +raise ValueError( +f"Invalid page size {sz}. " +f"Valid sizes: {self.valid_page_sizes}" +) +return os.path.join(self.path, f"hugepages-{sz}kB", "nr_hugepages") + +def __getitem__(self, sz: int) -> int: +"""Get current number of reserved pages of specified size""" +with open(self._nr_pages_path(sz), encoding="utf-8") as f: +return int(f.read()) + +def __setitem__(self, sz: int, nr_pages: int) -> None: +"""Set number of reserved pages of specified size""" +with open(self._nr_pages_path(sz), "w", encoding="utf-8") as f: +f.write(f"{nr_pages}\n") + + +def fmt_memsize(kb: int) -> str: +"""Format memory size in kB into conventional format""" logk = int(log2(kb) / 10) suffix = BINARY_PREFIX[logk] -unit = 2**(logk * 10) -return '{}{}b'.format(int(kb / unit), suffix) +unit = 2 ** (logk * 10) +return f"{int(kb / unit)}{suffix}b"
[PATCH v4 1/4] usertools/cpu_layout: update coding style
Update coding style: - make it PEP-484 compliant - address all flake8, mypy etc. warnings - use f-strings in place of old-style string interpolation - refactor printing to make the code more readable - read valid CPU IDs from "online" sysfs node Signed-off-by: Anatoly Burakov --- Notes: v3->v4: - Format with Ruff, line width 79 v1,v2 -> v3: - Import typing as T instead of individual types usertools/cpu_layout.py | 163 ++-- 1 file changed, 108 insertions(+), 55 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index 891b9238fa..8812ea286b 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -3,62 +3,115 @@ # Copyright(c) 2010-2014 Intel Corporation # Copyright(c) 2017 Cavium, Inc. All rights reserved. -sockets = [] -cores = [] -core_map = {} -base_path = "/sys/devices/system/cpu" -fd = open("{}/kernel_max".format(base_path)) -max_cpus = int(fd.read()) -fd.close() -for cpu in range(max_cpus + 1): -try: -fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu)) -except IOError: -continue -core = int(fd.read()) -fd.close() -fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu)) -socket = int(fd.read()) -fd.close() -if core not in cores: -cores.append(core) -if socket not in sockets: -sockets.append(socket) -key = (socket, core) -if key not in core_map: -core_map[key] = [] -core_map[key].append(cpu) +"""Display CPU topology information.""" -print(format("=" * (47 + len(base_path)))) -print("Core and Socket Information (as reported by '{}')".format(base_path)) -print("{}\n".format("=" * (47 + len(base_path)))) -print("cores = ", cores) -print("sockets = ", sockets) -print("") +import typing as T -max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1)) -max_thread_count = len(list(core_map.values())[0]) -max_core_map_len = (max_processor_len * max_thread_count) \ - + len(", ") * (max_thread_count - 1) \ - + len('[]') + len('Socket ') -max_core_id_len = len(str(max(cores))) -output = " 
".ljust(max_core_id_len + len('Core ')) -for s in sockets: -output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket ')) -print(output) - -output = " ".ljust(max_core_id_len + len('Core ')) -for s in sockets: -output += " ".ljust(max_core_map_len) -output += " " -print(output) - -for c in cores: -output = "Core %s" % str(c).ljust(max_core_id_len) -for s in sockets: -if (s, c) in core_map: -output += " " + str(core_map[(s, c)]).ljust(max_core_map_len) +def range_expand(rstr: str) -> T.List[int]: +"""Expand a range string into a list of integers.""" +# 0,1-3 => [0, 1-3] +ranges = rstr.split(",") +valset: T.List[int] = [] +for r in ranges: +# 1-3 => [1, 2, 3] +if "-" in r: +start, end = r.split("-") +valset.extend(range(int(start), int(end) + 1)) else: -output += " " * (max_core_map_len + 1) -print(output) +valset.append(int(r)) +return valset + + +def read_sysfs(path: str) -> str: +"""Read a sysfs file and return its contents.""" +with open(path, encoding="utf-8") as fd: +return fd.read().strip() + + +def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None: +"""Print a row of a table with the given column widths.""" +first, *rest = row +w_first, *w_rest = col_widths +first_end = " " * 4 +rest_end = " " * 10 + +print(first.ljust(w_first), end=first_end) +for cell, width in zip(rest, w_rest): +print(cell.rjust(width), end=rest_end) +print() + + +def print_section(heading: str) -> None: +"""Print a section heading.""" +sep = "=" * len(heading) +print(sep) +print(heading) +print(sep) +print() + + +def main() -> None: +"""Print CPU topology information.""" +sockets_s: T.Set[int] = set() +cores_s: T.Set[int] = set() +core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {} +base_path = "/sys/devices/system/cpu" + +cpus = range_expand(read_sysfs(f"{base_path}/online")) + +for cpu in cpus: +lcore_base = f"{base_path}/cpu{cpu}" +core = int(read_sysfs(f"{lcore_base}/topology/core_id")) +s
[PATCH v5 1/4] usertools/cpu_layout: update coding style
Update coding style: - make it PEP-484 compliant - format code with Ruff - address all mypy etc. warnings - use f-strings in place of old-style string interpolation - refactor printing to make the code more readable - read valid CPU IDs from "online" sysfs node Signed-off-by: Anatoly Burakov --- Notes: v4-v5: - Format with Ruff on default settings v3->v4: - Format with Ruff, line width 79 v1,v2 -> v3: - Import typing as T instead of individual types usertools/cpu_layout.py | 161 ++-- 1 file changed, 106 insertions(+), 55 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index 891b9238fa..e133fb8ad3 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -3,62 +3,113 @@ # Copyright(c) 2010-2014 Intel Corporation # Copyright(c) 2017 Cavium, Inc. All rights reserved. -sockets = [] -cores = [] -core_map = {} -base_path = "/sys/devices/system/cpu" -fd = open("{}/kernel_max".format(base_path)) -max_cpus = int(fd.read()) -fd.close() -for cpu in range(max_cpus + 1): -try: -fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu)) -except IOError: -continue -core = int(fd.read()) -fd.close() -fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu)) -socket = int(fd.read()) -fd.close() -if core not in cores: -cores.append(core) -if socket not in sockets: -sockets.append(socket) -key = (socket, core) -if key not in core_map: -core_map[key] = [] -core_map[key].append(cpu) +"""Display CPU topology information.""" -print(format("=" * (47 + len(base_path)))) -print("Core and Socket Information (as reported by '{}')".format(base_path)) -print("{}\n".format("=" * (47 + len(base_path)))) -print("cores = ", cores) -print("sockets = ", sockets) -print("") +import typing as T -max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1)) -max_thread_count = len(list(core_map.values())[0]) -max_core_map_len = (max_processor_len * max_thread_count) \ - + len(", ") * (max_thread_count - 1) \ - + len('[]') + len('Socket ') 
-max_core_id_len = len(str(max(cores))) -output = " ".ljust(max_core_id_len + len('Core ')) -for s in sockets: -output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket ')) -print(output) - -output = " ".ljust(max_core_id_len + len('Core ')) -for s in sockets: -output += " ".ljust(max_core_map_len) -output += " " -print(output) - -for c in cores: -output = "Core %s" % str(c).ljust(max_core_id_len) -for s in sockets: -if (s, c) in core_map: -output += " " + str(core_map[(s, c)]).ljust(max_core_map_len) +def range_expand(rstr: str) -> T.List[int]: +"""Expand a range string into a list of integers.""" +# 0,1-3 => [0, 1-3] +ranges = rstr.split(",") +valset: T.List[int] = [] +for r in ranges: +# 1-3 => [1, 2, 3] +if "-" in r: +start, end = r.split("-") +valset.extend(range(int(start), int(end) + 1)) else: -output += " " * (max_core_map_len + 1) -print(output) +valset.append(int(r)) +return valset + + +def read_sysfs(path: str) -> str: +"""Read a sysfs file and return its contents.""" +with open(path, encoding="utf-8") as fd: +return fd.read().strip() + + +def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None: +"""Print a row of a table with the given column widths.""" +first, *rest = row +w_first, *w_rest = col_widths +first_end = " " * 4 +rest_end = " " * 10 + +print(first.ljust(w_first), end=first_end) +for cell, width in zip(rest, w_rest): +print(cell.rjust(width), end=rest_end) +print() + + +def print_section(heading: str) -> None: +"""Print a section heading.""" +sep = "=" * len(heading) +print(sep) +print(heading) +print(sep) +print() + + +def main() -> None: +"""Print CPU topology information.""" +sockets_s: T.Set[int] = set() +cores_s: T.Set[int] = set() +core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {} +base_path = "/sys/devices/system/cpu" + +cpus = range_expand(read_sysfs(f"{base_path}/online")) + +for cpu in cpus: +lcore_base = f"{base_path}/cpu{cpu}" +
[PATCH v5 2/4] usertools/cpu_layout: print out NUMA nodes
In the traditional NUMA case, NUMA nodes and physical sockets were used interchangeably, but there are cases where there can be multiple NUMA nodes per socket, as well as all CPUs being assigned NUMA node 0 even in cases of multiple sockets. Use sysfs to print out NUMA information. Signed-off-by: Anatoly Burakov --- Notes: v2 -> v3: - Sort imports alphabetically usertools/cpu_layout.py | 36 +++- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index e133fb8ad3..976be1f8b2 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -5,6 +5,7 @@ """Display CPU topology information.""" +import glob import typing as T @@ -29,12 +30,21 @@ def read_sysfs(path: str) -> str: return fd.read().strip() +def read_numa_node(base: str) -> int: +"""Read the NUMA node of a CPU.""" +node_glob = f"{base}/node*" +node_dirs = glob.glob(node_glob) +if not node_dirs: +return 0 # default to node 0 +return int(node_dirs[0].split("node")[1]) + + def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None: """Print a row of a table with the given column widths.""" first, *rest = row w_first, *w_rest = col_widths first_end = " " * 4 -rest_end = " " * 10 +rest_end = " " * 4 print(first.ljust(w_first), end=first_end) for cell, width in zip(rest, w_rest): @@ -56,6 +66,7 @@ def main() -> None: sockets_s: T.Set[int] = set() cores_s: T.Set[int] = set() core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {} +numa_map: T.Dict[int, int] = {} base_path = "/sys/devices/system/cpu" cpus = range_expand(read_sysfs(f"{base_path}/online")) @@ -64,12 +75,14 @@ def main() -> None: lcore_base = f"{base_path}/cpu{cpu}" core = int(read_sysfs(f"{lcore_base}/topology/core_id")) socket = int(read_sysfs(f"{lcore_base}/topology/physical_package_id")) +node = read_numa_node(lcore_base) cores_s.add(core) sockets_s.add(socket) key = (socket, core) core_map.setdefault(key, []) core_map[key].append(cpu) +numa_map[cpu] = node cores = 
sorted(cores_s) sockets = sorted(sockets_s) @@ -78,24 +91,37 @@ def main() -> None: print("cores = ", cores) print("sockets = ", sockets) +print("numa = ", sorted(set(numa_map.values()))) print() -# Core, [Socket, Socket, ...] -heading_strs = "", *[f"Socket {s}" for s in sockets] +# Core, [NUMA, Socket, NUMA, Socket, ...] +heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")] sep_strs = tuple("-" * len(hstr) for hstr in heading_strs) rows: T.List[T.Tuple[str, ...]] = [] +prev_numa = None for c in cores: # Core, row: T.Tuple[str, ...] = (f"Core {c}",) -# [lcores, lcores, ...] +# assume NUMA changes symmetrically +first_lcore = core_map[(0, c)][0] +cur_numa = numa_map[first_lcore] +numa_changed = prev_numa != cur_numa +prev_numa = cur_numa + +# [NUMA, lcores, NUMA, lcores, ...] for s in sockets: try: lcores = core_map[(s, c)] +numa = numa_map[lcores[0]] +if numa_changed: +row += (f"NUMA {numa}",) +else: +row += ("",) row += (str(lcores),) except KeyError: -row += ("",) +row += ("", "") rows += [row] # find max widths for each column, including header and rows -- 2.43.5
[PATCH v5 3/4] usertools/dpdk-hugepages.py: update coding style
Update coding style: - make the code PEP-484 compliant - add more comments, improve readability, use f-strings everywhere - address all Python static analysis (e.g. mypy, pylint) warnings - format code with Ruff - improve error handling - refactor printing and sysfs/procfs access functions - sort huge page reservation status output by NUMA node Signed-off-by: Anatoly Burakov Acked-by: Stephen Hemminger --- Notes: v4 -> v5: - Format with Ruff on default settings - Replaced all instances of raw path strings with os.path.join v3 -> v4: - Format code with Ruff, line width 79 to avoid flake8 warnings (Flake8 is by default configured with line width 79 on my system) v2 -> v3: - Rewrite of the script as suggested by reviewers v1 -> v2: - Added commit that sorted output by NUMA node usertools/dpdk-hugepages.py | 518 +--- 1 file changed, 310 insertions(+), 208 deletions(-) diff --git a/usertools/dpdk-hugepages.py b/usertools/dpdk-hugepages.py index bf2575ba36..3fc3269c83 100755 --- a/usertools/dpdk-hugepages.py +++ b/usertools/dpdk-hugepages.py @@ -1,13 +1,15 @@ #! 
/usr/bin/env python3 # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2020 Microsoft Corporation + """Script to query and setup huge pages for DPDK applications.""" import argparse -import glob import os import re +import subprocess import sys +import typing as T from math import log2 # Standard binary prefix @@ -15,194 +17,266 @@ # systemd mount point for huge pages HUGE_MOUNT = "/dev/hugepages" +# default directory for non-NUMA huge pages +NO_NUMA_HUGE_DIR = "/sys/kernel/mm/hugepages" +# default base directory for NUMA nodes +NUMA_NODE_BASE_DIR = "/sys/devices/system/node" +# procfs paths +MEMINFO_PATH = "/proc/meminfo" +MOUNTS_PATH = "/proc/mounts" -def fmt_memsize(kb): -'''Format memory size in kB into conventional format''' +class HugepageMount: +"""Mount operations for huge page filesystem.""" + +def __init__(self, path: str, mounted: bool): +self.path = path +# current mount status +self.mounted = mounted + +def mount( +self, pagesize_kb: int, user: T.Optional[str], group: T.Optional[str] +) -> None: +"""Mount the huge TLB file system""" +if self.mounted: +return +cmd = ["mount", "-t", "hugetlbfs"] +cmd += ["-o", f"pagesize={pagesize_kb * 1024}"] +if user is not None: +cmd += ["-o", f"uid={user}"] +if group is not None: +cmd += ["-o", f"gid={group}"] +cmd += ["nodev", self.path] + +subprocess.run(cmd, check=True) +self.mounted = True + +def unmount(self) -> None: +"""Unmount the huge TLB file system (if mounted)""" +if self.mounted: +subprocess.run(["umount", self.path], check=True) +self.mounted = False + + +class HugepageRes: +"""Huge page reserve operations. 
Can be NUMA-node-specific.""" + +def __init__(self, path: str, node: T.Optional[int] = None): +self.path = path +# if this is a per-NUMA node huge page dir, store the node number +self.node = node +self.valid_page_sizes = self._get_valid_page_sizes() + +def _get_valid_page_sizes(self) -> T.List[int]: +"""Extract valid huge page sizes""" +return [get_memsize(d.split("-")[1]) for d in os.listdir(self.path)] + +def _nr_pages_path(self, sz: int) -> str: +if sz not in self.valid_page_sizes: +raise ValueError( +f"Invalid page size {sz}. " f"Valid sizes: {self.valid_page_sizes}" +) +return os.path.join(self.path, f"hugepages-{sz}kB", "nr_hugepages") + +def __getitem__(self, sz: int) -> int: +"""Get current number of reserved pages of specified size""" +with open(self._nr_pages_path(sz), encoding="utf-8") as f: +return int(f.read()) + +def __setitem__(self, sz: int, nr_pages: int) -> None: +"""Set number of reserved pages of specified size""" +with open(self._nr_pages_path(sz), "w", encoding="utf-8") as f: +f.write(f"{nr_pages}\n") + + +def fmt_memsize(kb: int) -> str: +"""Format memory size in kB into conventional format""" logk = int(log2(kb) / 10) suffix = BINARY_PREFIX[logk] -unit = 2**(logk * 10) -re
[PATCH v5 4/4] usertools/dpdk-devbind: print NUMA node
Currently, devbind does not print out any NUMA information, which makes it non-trivial to figure out which NUMA node a device belongs to. Add printouts for NUMA information if NUMA support is enabled on the system. Signed-off-by: Anatoly Burakov Acked-by: Robin Jarry --- Notes: v1 -> v2: - Added commit to print out NUMA information in devbind usertools/dpdk-devbind.py | 29 + 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py index b276e8efc8..078e8c387b 100755 --- a/usertools/dpdk-devbind.py +++ b/usertools/dpdk-devbind.py @@ -110,6 +110,11 @@ args = [] +# check if this system has NUMA support +def is_numa(): +return os.path.exists('/sys/devices/system/node') + + # check if a specific kernel module is loaded def module_is_loaded(module): global loaded_modules @@ -577,20 +582,28 @@ def show_device_status(devices_type, device_name, if_field=False): print("".join('=' * len(msg))) return +print_numa = is_numa() + # print each category separately, so we can clearly see what's used by DPDK if dpdk_drv: +extra_param = "drv=%(Driver_str)s unused=%(Module_str)s" +if print_numa: +extra_param = "numa_node=%(NUMANode)s " + extra_param display_devices("%s devices using DPDK-compatible driver" % device_name, -dpdk_drv, "drv=%(Driver_str)s unused=%(Module_str)s") +dpdk_drv, extra_param) if kernel_drv: -if_text = "" +extra_param = "drv=%(Driver_str)s unused=%(Module_str)s" if if_field: -if_text = "if=%(Interface)s " -display_devices("%s devices using kernel driver" % device_name, kernel_drv, -if_text + "drv=%(Driver_str)s " -"unused=%(Module_str)s %(Active)s") +extra_param = "if=%(Interface)s " + extra_param +if print_numa: +extra_param = "numa_node=%(NUMANode)s " + extra_param +display_devices("%s devices using kernel driver" % device_name, +kernel_drv, extra_param) if no_drv: -display_devices("Other %s devices" % device_name, no_drv, -"unused=%(Module_str)s") +extra_param = "unused=%(Module_str)s" +if print_numa: 
+extra_param = "numa_node=%(NUMANode)s " + extra_param +display_devices("Other %s devices" % device_name, no_drv, extra_param) def show_status(): -- 2.43.5
[PATCH v6 1/4] usertools/cpu_layout: update coding style
Update coding style: - make it PEP-484 compliant - format code with Ruff - address all mypy etc. warnings - use f-strings in place of old-style string interpolation - refactor printing to make the code more readable - read valid CPU ID's from "online" sysfs node Signed-off-by: Anatoly Burakov Acked-by: Robin Jarry --- Notes: v4-v5: - Format with Ruff on default settings v3->v4: - Format with Ruff, line width 79 v1,v2 -> v3: - Import typing as T instead of individual types usertools/cpu_layout.py | 161 ++-- 1 file changed, 106 insertions(+), 55 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index 891b9238fa..e133fb8ad3 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -3,62 +3,113 @@ # Copyright(c) 2010-2014 Intel Corporation # Copyright(c) 2017 Cavium, Inc. All rights reserved. -sockets = [] -cores = [] -core_map = {} -base_path = "/sys/devices/system/cpu" -fd = open("{}/kernel_max".format(base_path)) -max_cpus = int(fd.read()) -fd.close() -for cpu in range(max_cpus + 1): -try: -fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu)) -except IOError: -continue -core = int(fd.read()) -fd.close() -fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu)) -socket = int(fd.read()) -fd.close() -if core not in cores: -cores.append(core) -if socket not in sockets: -sockets.append(socket) -key = (socket, core) -if key not in core_map: -core_map[key] = [] -core_map[key].append(cpu) +"""Display CPU topology information.""" -print(format("=" * (47 + len(base_path -print("Core and Socket Information (as reported by '{}')".format(base_path)) -print("{}\n".format("=" * (47 + len(base_path -print("cores = ", cores) -print("sockets = ", sockets) -print("") +import typing as T -max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1)) -max_thread_count = len(list(core_map.values())[0]) -max_core_map_len = (max_processor_len * max_thread_count) \ - + len(", ") * (max_thread_count - 1) \ - + len('[]') + 
len('Socket ') -max_core_id_len = len(str(max(cores))) -output = " ".ljust(max_core_id_len + len('Core ')) -for s in sockets: -output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket ')) -print(output) - -output = " ".ljust(max_core_id_len + len('Core ')) -for s in sockets: -output += " ".ljust(max_core_map_len) -output += " " -print(output) - -for c in cores: -output = "Core %s" % str(c).ljust(max_core_id_len) -for s in sockets: -if (s, c) in core_map: -output += " " + str(core_map[(s, c)]).ljust(max_core_map_len) +def range_expand(rstr: str) -> T.List[int]: +"""Expand a range string into a list of integers.""" +# 0,1-3 => [0, 1-3] +ranges = rstr.split(",") +valset: T.List[int] = [] +for r in ranges: +# 1-3 => [1, 2, 3] +if "-" in r: +start, end = r.split("-") +valset.extend(range(int(start), int(end) + 1)) else: -output += " " * (max_core_map_len + 1) -print(output) +valset.append(int(r)) +return valset + + +def read_sysfs(path: str) -> str: +"""Read a sysfs file and return its contents.""" +with open(path, encoding="utf-8") as fd: +return fd.read().strip() + + +def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None: +"""Print a row of a table with the given column widths.""" +first, *rest = row +w_first, *w_rest = col_widths +first_end = " " * 4 +rest_end = " " * 10 + +print(first.ljust(w_first), end=first_end) +for cell, width in zip(rest, w_rest): +print(cell.rjust(width), end=rest_end) +print() + + +def print_section(heading: str) -> None: +"""Print a section heading.""" +sep = "=" * len(heading) +print(sep) +print(heading) +print(sep) +print() + + +def main() -> None: +"""Print CPU topology information.""" +sockets_s: T.Set[int] = set() +cores_s: T.Set[int] = set() +core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {} +base_path = "/sys/devices/system/cpu" + +cpus = range_expand(read_sysfs(f"{base_path}/online")) + +for cpu in cpus: +lcore_base = f"{base_path}/
[PATCH v6 2/4] usertools/cpu_layout: print out NUMA nodes
In traditional NUMA case, NUMA nodes and physical sockets were used interchangeably, but there are cases where there can be multiple NUMA nodes per socket, as well as all CPU's being assigned NUMA node 0 even in cases of multiple sockets. Use sysfs to print out NUMA information. Signed-off-by: Anatoly Burakov --- Notes: v5 -> v6: - Track NUMA changes per socket to avoid issues with missing cores v2 -> v3: - Sort imports alphabetically usertools/cpu_layout.py | 35 ++- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py index e133fb8ad3..5972cfecdb 100755 --- a/usertools/cpu_layout.py +++ b/usertools/cpu_layout.py @@ -5,6 +5,7 @@ """Display CPU topology information.""" +import glob import typing as T @@ -29,12 +30,21 @@ def read_sysfs(path: str) -> str: return fd.read().strip() +def read_numa_node(base: str) -> int: +"""Read the NUMA node of a CPU.""" +node_glob = f"{base}/node*" +node_dirs = glob.glob(node_glob) +if not node_dirs: +return 0 # default to node 0 +return int(node_dirs[0].split("node")[1]) + + def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None: """Print a row of a table with the given column widths.""" first, *rest = row w_first, *w_rest = col_widths first_end = " " * 4 -rest_end = " " * 10 +rest_end = " " * 4 print(first.ljust(w_first), end=first_end) for cell, width in zip(rest, w_rest): @@ -56,6 +66,7 @@ def main() -> None: sockets_s: T.Set[int] = set() cores_s: T.Set[int] = set() core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {} +numa_map: T.Dict[int, int] = {} base_path = "/sys/devices/system/cpu" cpus = range_expand(read_sysfs(f"{base_path}/online")) @@ -64,12 +75,14 @@ def main() -> None: lcore_base = f"{base_path}/cpu{cpu}" core = int(read_sysfs(f"{lcore_base}/topology/core_id")) socket = int(read_sysfs(f"{lcore_base}/topology/physical_package_id")) +node = read_numa_node(lcore_base) cores_s.add(core) sockets_s.add(socket) key = (socket, core) 
core_map.setdefault(key, []) core_map[key].append(cpu) +numa_map[cpu] = node cores = sorted(cores_s) sockets = sorted(sockets_s) @@ -78,24 +91,36 @@ def main() -> None: print("cores = ", cores) print("sockets = ", sockets) +print("numa = ", sorted(set(numa_map.values()))) print() -# Core, [Socket, Socket, ...] -heading_strs = "", *[f"Socket {s}" for s in sockets] +# Core, [NUMA, Socket, NUMA, Socket, ...] +heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")] sep_strs = tuple("-" * len(hstr) for hstr in heading_strs) rows: T.List[T.Tuple[str, ...]] = [] +# track NUMA changes per socket +prev_numa: T.Dict[int, T.Optional[int]] = {socket: None for socket in sockets} for c in cores: # Core, row: T.Tuple[str, ...] = (f"Core {c}",) -# [lcores, lcores, ...] +# [NUMA, lcores, NUMA, lcores, ...] for s in sockets: try: lcores = core_map[(s, c)] + +numa = numa_map[lcores[0]] +numa_changed = prev_numa[s] != numa +prev_numa[s] = numa + +if numa_changed: +row += (f"NUMA {numa}",) +else: +row += ("",) row += (str(lcores),) except KeyError: -row += ("",) +row += ("", "") rows += [row] # find max widths for each column, including header and rows -- 2.43.5
[PATCH v6 3/4] usertools/dpdk-hugepages.py: update coding style
Update coding style: - make the code PEP-484 compliant - add more comments, improve readability, use f-strings everywhere - address all Python static analysis (e.g. mypy, pylint) warnings - format code with Ruff - improve error handling - refactor printing and sysfs/procfs access functions - sort huge page reservation status output by NUMA node Signed-off-by: Anatoly Burakov Acked-by: Stephen Hemminger Acked-by: Robin Jarry --- Notes: v4 -> v5: - Format with Ruff on default settings - Replaced all instances of raw path strings with os.path.join v3 -> v4: - Format code with Ruff, line width 79 to avoid flake8 warnings (Flake8 is by default configured with line width 79 on my system) v2 -> v3: - Rewrite of the script as suggested by reviewers v1 -> v2: - Added commit that sorted output by NUMA node usertools/dpdk-hugepages.py | 518 +--- 1 file changed, 310 insertions(+), 208 deletions(-) diff --git a/usertools/dpdk-hugepages.py b/usertools/dpdk-hugepages.py index bf2575ba36..3fc3269c83 100755 --- a/usertools/dpdk-hugepages.py +++ b/usertools/dpdk-hugepages.py @@ -1,13 +1,15 @@ #! 
/usr/bin/env python3 # SPDX-License-Identifier: BSD-3-Clause # Copyright (c) 2020 Microsoft Corporation + """Script to query and setup huge pages for DPDK applications.""" import argparse -import glob import os import re +import subprocess import sys +import typing as T from math import log2 # Standard binary prefix @@ -15,194 +17,266 @@ # systemd mount point for huge pages HUGE_MOUNT = "/dev/hugepages" +# default directory for non-NUMA huge pages +NO_NUMA_HUGE_DIR = "/sys/kernel/mm/hugepages" +# default base directory for NUMA nodes +NUMA_NODE_BASE_DIR = "/sys/devices/system/node" +# procfs paths +MEMINFO_PATH = "/proc/meminfo" +MOUNTS_PATH = "/proc/mounts" -def fmt_memsize(kb): -'''Format memory size in kB into conventional format''' +class HugepageMount: +"""Mount operations for huge page filesystem.""" + +def __init__(self, path: str, mounted: bool): +self.path = path +# current mount status +self.mounted = mounted + +def mount( +self, pagesize_kb: int, user: T.Optional[str], group: T.Optional[str] +) -> None: +"""Mount the huge TLB file system""" +if self.mounted: +return +cmd = ["mount", "-t", "hugetlbfs"] +cmd += ["-o", f"pagesize={pagesize_kb * 1024}"] +if user is not None: +cmd += ["-o", f"uid={user}"] +if group is not None: +cmd += ["-o", f"gid={group}"] +cmd += ["nodev", self.path] + +subprocess.run(cmd, check=True) +self.mounted = True + +def unmount(self) -> None: +"""Unmount the huge TLB file system (if mounted)""" +if self.mounted: +subprocess.run(["umount", self.path], check=True) +self.mounted = False + + +class HugepageRes: +"""Huge page reserve operations. 
Can be NUMA-node-specific.""" + +def __init__(self, path: str, node: T.Optional[int] = None): +self.path = path +# if this is a per-NUMA node huge page dir, store the node number +self.node = node +self.valid_page_sizes = self._get_valid_page_sizes() + +def _get_valid_page_sizes(self) -> T.List[int]: +"""Extract valid huge page sizes""" +return [get_memsize(d.split("-")[1]) for d in os.listdir(self.path)] + +def _nr_pages_path(self, sz: int) -> str: +if sz not in self.valid_page_sizes: +raise ValueError( +f"Invalid page size {sz}. " f"Valid sizes: {self.valid_page_sizes}" +) +return os.path.join(self.path, f"hugepages-{sz}kB", "nr_hugepages") + +def __getitem__(self, sz: int) -> int: +"""Get current number of reserved pages of specified size""" +with open(self._nr_pages_path(sz), encoding="utf-8") as f: +return int(f.read()) + +def __setitem__(self, sz: int, nr_pages: int) -> None: +"""Set number of reserved pages of specified size""" +with open(self._nr_pages_path(sz), "w", encoding="utf-8") as f: +f.write(f"{nr_pages}\n") + + +def fmt_memsize(kb: int) -> str: +"""Format memory size in kB into conventional format""" logk = int(log2(kb) / 10) suffix = BINARY_PREFIX[logk] -unit = 2**(l
[PATCH v6 4/4] usertools/dpdk-devbind: print NUMA node
Currently, devbind does not print out any NUMA information, which makes it non-trivial to figure out which NUMA node a device belongs to. Add printouts for NUMA information if NUMA support is enabled on the system. Signed-off-by: Anatoly Burakov Acked-by: Robin Jarry --- Notes: v1 -> v2: - Added commit to print out NUMA information in devbind usertools/dpdk-devbind.py | 29 + 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py index b276e8efc8..078e8c387b 100755 --- a/usertools/dpdk-devbind.py +++ b/usertools/dpdk-devbind.py @@ -110,6 +110,11 @@ args = [] +# check if this system has NUMA support +def is_numa(): +return os.path.exists('/sys/devices/system/node') + + # check if a specific kernel module is loaded def module_is_loaded(module): global loaded_modules @@ -577,20 +582,28 @@ def show_device_status(devices_type, device_name, if_field=False): print("".join('=' * len(msg))) return +print_numa = is_numa() + # print each category separately, so we can clearly see what's used by DPDK if dpdk_drv: +extra_param = "drv=%(Driver_str)s unused=%(Module_str)s" +if print_numa: +extra_param = "numa_node=%(NUMANode)s " + extra_param display_devices("%s devices using DPDK-compatible driver" % device_name, -dpdk_drv, "drv=%(Driver_str)s unused=%(Module_str)s") +dpdk_drv, extra_param) if kernel_drv: -if_text = "" +extra_param = "drv=%(Driver_str)s unused=%(Module_str)s" if if_field: -if_text = "if=%(Interface)s " -display_devices("%s devices using kernel driver" % device_name, kernel_drv, -if_text + "drv=%(Driver_str)s " -"unused=%(Module_str)s %(Active)s") +extra_param = "if=%(Interface)s " + extra_param +if print_numa: +extra_param = "numa_node=%(NUMANode)s " + extra_param +display_devices("%s devices using kernel driver" % device_name, +kernel_drv, extra_param) if no_drv: -display_devices("Other %s devices" % device_name, no_drv, -"unused=%(Module_str)s") +extra_param = "unused=%(Module_str)s" +if print_numa: 
+extra_param = "numa_node=%(NUMANode)s " + extra_param +display_devices("Other %s devices" % device_name, no_drv, extra_param) def show_status(): -- 2.43.5
[PATCH v1 1/3] fbarray: rename tests to be more meaningful
Some tests reference internal implementation details of fbarray, as well as equivocate between mask and index. Most other test names are not very descriptive. Rename them, and adjust comments to explain things in terms of what the tests actually do, instead of referring to internal implementation details. Also, add more tests that fill up exactly one mask, with and without neighbouring set bits. Signed-off-by: Anatoly Burakov --- app/test/test_fbarray.c | 99 - 1 file changed, 67 insertions(+), 32 deletions(-) diff --git a/app/test/test_fbarray.c b/app/test/test_fbarray.c index 09f6907fb1..6ca509b898 100644 --- a/app/test/test_fbarray.c +++ b/app/test/test_fbarray.c @@ -104,7 +104,7 @@ static int first_msk_test_setup(void) return init_aligned(); } -static int cross_msk_test_setup(void) +static int contig_test_setup(void) { /* put all within second and third mask */ param.start = 70; @@ -112,7 +112,7 @@ static int cross_msk_test_setup(void) return init_aligned(); } -static int multi_msk_test_setup(void) +static int large_contig_test_setup(void) { /* put all within first and last mask */ param.start = 3; @@ -128,15 +128,39 @@ static int last_msk_test_setup(void) return init_aligned(); } +static int full_index_test_setup(void) +{ + /* fill entire index */ + param.start = 0; + param.end = FBARRAY_TEST_LEN - 1; + return init_aligned(); +} + static int full_msk_test_setup(void) { - /* fill entire mask */ + /* fill one mask */ param.start = 0; - param.end = FBARRAY_TEST_LEN - 1; + param.end = 63; return init_aligned(); } -static int lookahead_test_setup(void) +static int full_msk_contig_fwd_test_setup(void) +{ + /* fill one mask plus one item */ + param.start = 64; + param.end = 128; + return init_aligned(); +} + +static int full_msk_contig_rev_test_setup(void) +{ + /* fill one mask plus one item */ + param.start = 63; + param.end = 127; + return init_aligned(); +} + +static int cross_msk_test_setup(void) { /* set index 64 as used */ param.start = 64; @@ -144,7 +168,7 @@ 
static int lookahead_test_setup(void) return init_aligned(); } -static int lookbehind_test_setup(void) +static int cross_msk_rev_test_setup(void) { /* set index 63 as used */ param.start = 63; @@ -160,6 +184,13 @@ static int unaligned_test_setup(void) return init_unaligned(); } +static int full_unaligned_test_setup(void) +{ + unaligned.start = 0; + unaligned.end = FBARRAY_UNALIGNED_TEST_LEN - 1; + return init_unaligned(); +} + static int test_invalid(void) { struct rte_fbarray dummy; @@ -786,7 +817,7 @@ static int test_empty(void) return TEST_SUCCESS; } -static int test_lookahead(void) +static int test_cross_msk(void) { int ret; @@ -801,7 +832,7 @@ static int test_lookahead(void) return TEST_SUCCESS; } -static int test_lookbehind(void) +static int test_cross_rev_msk(void) { int ret, free_len = 2; @@ -816,19 +847,19 @@ static int test_lookbehind(void) return TEST_SUCCESS; } -static int test_lookahead_mask(void) +static int test_broken_run(void) { /* -* There is a certain type of lookahead behavior we want to test here, -* namely masking of bits that were scanned with lookahead but that we -* know do not match our criteria. This is achieved in following steps: +* There is a certain type of search behavior we want to test here, +* namely starting cross-mask runs and failing to find them. This is +* achieved when these conditions happen: * * 0. Look for a big enough chunk of free space (say, 62 elements) -* 1. Trigger lookahead by breaking a run somewhere inside mask 0 -* (indices 0-63) -* 2. Fail lookahead by breaking the run somewhere inside mask 1 -* (indices 64-127) -* 3. Ensure that we can still find free space in mask 1 afterwards +* 1. Break a run somewhere inside mask 0 (indices 0-63) but leave +* some free elements at the end of mask 0 to start a run +* 2. Break the run somewhere inside mask 1 (indices 64-127) +* 3. 
Ensure that we can still find a free space run right after the +* second broken run */ /* break run on first mask */ @@ -842,19 +873,19 @@ static int test_lookahead_mask(void) return TEST_SUCCESS; } -static int test_lookbehind_mask(void) +static int test_rev_broken_run(void) { /* -* There is a certain type of lookbehind behavior we want to test here, -* namely masking of bits that were scanned with lookbehind but that we -* know do not match our criteria. This is achieved in two steps: +* There is a certain type
[PATCH v1 2/3] fbarray: rework find_next_n to flatten the loop
Currently, find_next_n() is implemented as a nested loop due to lookahead functionality. This is not very efficient because when doing lookahead, we essentially scan some of the indices twice, and in general the lookahead functionality has been a source of bugs because it is overcomplicated. The bit ignore feature on lookahead is also unnecessary because we don't really win anything by ignoring bits we have already scanned, as they would not trigger any matches anyway. This patch reworks find_next_n() to flatten the loop and remove the lookahead and bit-ignore functionality, instead replacing it with state machine-like behavior. This makes the code simpler to reason about. Signed-off-by: Anatoly Burakov --- lib/eal/common/eal_common_fbarray.c | 195 +--- 1 file changed, 93 insertions(+), 102 deletions(-) diff --git a/lib/eal/common/eal_common_fbarray.c b/lib/eal/common/eal_common_fbarray.c index 22b43073c6..d38a43d8d1 100644 --- a/lib/eal/common/eal_common_fbarray.c +++ b/lib/eal/common/eal_common_fbarray.c @@ -117,9 +117,11 @@ find_next_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, { const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, arr->len); - unsigned int msk_idx, lookahead_idx, first, first_mod; + unsigned int msk_idx, first, first_mod; unsigned int last, last_mod; - uint64_t last_msk, ignore_msk; + uint64_t last_msk, first_msk; + unsigned int run_start, left = 0; + bool run_started = false; /* * mask only has granularity of MASK_ALIGN, but start may not be aligned @@ -128,7 +130,7 @@ find_next_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, */ first = MASK_LEN_TO_IDX(start); first_mod = MASK_LEN_TO_MOD(start); - ignore_msk = ~((1ULL << first_mod) - 1); + first_msk = ~((1ULL << first_mod) - 1); /* array length may not be aligned, so calculate ignore mask for last * mask index. 
@@ -137,131 +139,120 @@ find_next_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, last_mod = MASK_LEN_TO_MOD(arr->len); last_msk = ~(UINT64_MAX << last_mod); + left = n; + for (msk_idx = first; msk_idx < msk->n_masks; msk_idx++) { - uint64_t cur_msk, lookahead_msk; - unsigned int run_start, clz, left; - bool found = false; + unsigned int s_idx, clz, need; + uint64_t cur_msk, tmp_msk; + /* -* The process of getting n consecutive bits for arbitrary n is -* a bit involved, but here it is in a nutshell: +* In order to find N consecutive bits for arbitrary N, we need +* to be aware of the following: * -* 1. let n be the number of consecutive bits we're looking for -* 2. check if n can fit in one mask, and if so, do n-1 -* rshift-ands to see if there is an appropriate run inside -* our current mask -*2a. if we found a run, bail out early -*2b. if we didn't find a run, proceed -* 3. invert the mask and count leading zeroes (that is, count -* how many consecutive set bits we had starting from the -* end of current mask) as k -*3a. if k is 0, continue to next mask -*3b. if k is not 0, we have a potential run -* 4. to satisfy our requirements, next mask must have n-k -* consecutive set bits right at the start, so we will do -* (n-k-1) rshift-ands and check if first bit is set. +* 1. To find N number of consecutive bits within a mask, we +* need to do N-1 rshift-ands and see if we still have set +* bits anywhere in the mask +* 2. N may be larger than mask size, in which case we need to +* do a search in multiple consecutive masks +* 3. For multi-mask search to be meaningful, we need to anchor +* our searches, i.e. first we find a run of M bits at the +* end of current mask, then we look for N-M bits at the +* beginning of next mask (or multiple masks) * -* Step 4 will need to be repeated if (n-k) > MASK_ALIGN until -* we either run out of masks, lose the run, or find what we -* were looking for. 
+* With all of the above, the algorithm looks as follows: +* +* 1. let N be the number of consecutive bits we're looking for +* 2. if we already started a run, check
[PATCH v1 3/3] fbarray: rework find_prev_n to flatten the loop
Currently, find_prev_n() is implemented as a nested loop due to lookbehind functionality. This is not very efficient because when doing lookbehind, we essentially scan some of the indices twice, and in general the lookbehind functionality has been a source of bugs because it is overcomplicated. The bit ignore feature on lookbehind is also unnecessary because we don't really win anything by ignoring bits we have already scanned, as they would not trigger any matches anyway. This patch reworks find_prev_n() to flatten the loop and remove the lookbehind and bit-ignore functionality, instead replacing it with state machine-like behavior. This makes the code simpler to reason about. Signed-off-by: Anatoly Burakov --- lib/eal/common/eal_common_fbarray.c | 223 +--- 1 file changed, 101 insertions(+), 122 deletions(-) diff --git a/lib/eal/common/eal_common_fbarray.c b/lib/eal/common/eal_common_fbarray.c index d38a43d8d1..a2a19af14a 100644 --- a/lib/eal/common/eal_common_fbarray.c +++ b/lib/eal/common/eal_common_fbarray.c @@ -382,164 +382,143 @@ find_prev_n(const struct rte_fbarray *arr, unsigned int start, unsigned int n, { const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz, arr->len); - unsigned int msk_idx, lookbehind_idx, first, first_mod; - uint64_t ignore_msk; + /* we're going backwards so we need negative space */ + int64_t msk_idx; + unsigned int first, first_mod; + uint64_t first_msk; + unsigned int run_end, left; + bool run_started = false; /* * mask only has granularity of MASK_ALIGN, but start may not be aligned * on that boundary, so construct a special mask to exclude anything we -* don't want to see to avoid confusing ctz. +* don't want to see to avoid confusing clz. this "first" mask is +* actually our last because we're going backwards, so no second mask +* is required like in find_next_n case. 
*/ first = MASK_LEN_TO_IDX(start); first_mod = MASK_LEN_TO_MOD(start); /* we're going backwards, so mask must start from the top */ - ignore_msk = first_mod == MASK_ALIGN - 1 ? + first_msk = first_mod == MASK_ALIGN - 1 ? UINT64_MAX : /* prevent overflow */ ~(UINT64_MAX << (first_mod + 1)); + left = n; + /* go backwards, include zero */ - msk_idx = first; - do { - uint64_t cur_msk, lookbehind_msk; - unsigned int run_start, run_end, ctz, left; - bool found = false; + for (msk_idx = first; msk_idx >= 0; msk_idx--) { + unsigned int s_idx, ctz, need; + uint64_t cur_msk, tmp_msk; + /* -* The process of getting n consecutive bits from the top for -* arbitrary n is a bit involved, but here it is in a nutshell: +* In order to find N consecutive bits for arbitrary N, we need +* to be aware of the following: * -* 1. let n be the number of consecutive bits we're looking for -* 2. check if n can fit in one mask, and if so, do n-1 -* lshift-ands to see if there is an appropriate run inside -* our current mask -*2a. if we found a run, bail out early -*2b. if we didn't find a run, proceed -* 3. invert the mask and count trailing zeroes (that is, count -* how many consecutive set bits we had starting from the -* start of current mask) as k -*3a. if k is 0, continue to next mask -*3b. if k is not 0, we have a potential run -* 4. to satisfy our requirements, next mask must have n-k -* consecutive set bits at the end, so we will do (n-k-1) -* lshift-ands and check if last bit is set. +* 1. To find N number of consecutive bits within a mask, we +* need to do N-1 lshift-ands and see if we still have set +* bits anywhere in the mask +* 2. N may be larger than mask size, in which case we need to +* do a search in multiple consecutive masks +* 3. For multi-mask search to be meaningful, we need to anchor +* our searches, i.e. 
first we find a run of M bits at the +* beginning of current mask, then we look for N-M bits at +* the end of previous mask (or multiple masks) * -* Step 4 will need to be repeated if (n-k) > MASK_ALIGN until -* we either run out of masks, lose the run, or find what we
[PATCH v1 01/15] net/ixgbe/base: remove minsrevs code from DPDK
Commit add44414762c ("net/ixgbe/base: add E610 NVM-related operations") has added code to read minimum security revision from NVM. This code was not meant to be included in DPDK, and was only meant for other drivers derived from shared base code, but was present due to the way shared driver code snapshot was generated. Remove this code from DPDK driver. Fixes: add44414762c ("net/ixgbe/base: add E610 NVM-related operations") Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_e610.c | 54 - drivers/net/ixgbe/base/ixgbe_e610.h | 1 - 2 files changed, 55 deletions(-) diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c b/drivers/net/ixgbe/base/ixgbe_e610.c index ac71980630..3e2be07731 100644 --- a/drivers/net/ixgbe/base/ixgbe_e610.c +++ b/drivers/net/ixgbe/base/ixgbe_e610.c @@ -2488,60 +2488,6 @@ static s32 ixgbe_read_nvm_sr_copy(struct ixgbe_hw *hw, return ixgbe_read_nvm_module(hw, bank, hdr_len + offset, data); } -/** - * ixgbe_get_nvm_minsrevs - Get the minsrevs values from flash - * @hw: pointer to the HW struct - * @minsrevs: structure to store NVM and OROM minsrev values - * - * Read the Minimum Security Revision TLV and extract - * the revision values from the flash image - * into a readable structure for processing. - * - * Return: the exit code of the operation. 
- */ -s32 ixgbe_get_nvm_minsrevs(struct ixgbe_hw *hw, - struct ixgbe_minsrev_info *minsrevs) -{ - struct ixgbe_aci_cmd_nvm_minsrev data; - s32 status; - u16 valid; - - status = ixgbe_acquire_nvm(hw, IXGBE_RES_READ); - if (status) - return status; - - status = ixgbe_aci_read_nvm(hw, IXGBE_ACI_NVM_MINSREV_MOD_ID, - 0, sizeof(data), &data, - true, false); - - ixgbe_release_nvm(hw); - - if (status) - return status; - - valid = IXGBE_LE16_TO_CPU(data.validity); - - /* Extract NVM minimum security revision */ - if (valid & IXGBE_ACI_NVM_MINSREV_NVM_VALID) { - u16 minsrev_l = IXGBE_LE16_TO_CPU(data.nvm_minsrev_l); - u16 minsrev_h = IXGBE_LE16_TO_CPU(data.nvm_minsrev_h); - - minsrevs->nvm = minsrev_h << 16 | minsrev_l; - minsrevs->nvm_valid = true; - } - - /* Extract the OROM minimum security revision */ - if (valid & IXGBE_ACI_NVM_MINSREV_OROM_VALID) { - u16 minsrev_l = IXGBE_LE16_TO_CPU(data.orom_minsrev_l); - u16 minsrev_h = IXGBE_LE16_TO_CPU(data.orom_minsrev_h); - - minsrevs->orom = minsrev_h << 16 | minsrev_l; - minsrevs->orom_valid = true; - } - - return IXGBE_SUCCESS; -} - /** * ixgbe_get_nvm_srev - Read the security revision from the NVM CSS header * @hw: pointer to the HW struct diff --git a/drivers/net/ixgbe/base/ixgbe_e610.h b/drivers/net/ixgbe/base/ixgbe_e610.h index 33c683d1c1..4babee821e 100644 --- a/drivers/net/ixgbe/base/ixgbe_e610.h +++ b/drivers/net/ixgbe/base/ixgbe_e610.h @@ -85,7 +85,6 @@ s32 ixgbe_aci_read_nvm(struct ixgbe_hw *hw, u16 module_typeid, u32 offset, s32 ixgbe_nvm_validate_checksum(struct ixgbe_hw *hw); s32 ixgbe_nvm_recalculate_checksum(struct ixgbe_hw *hw); -s32 ixgbe_get_nvm_minsrevs(struct ixgbe_hw *hw, struct ixgbe_minsrev_info *minsrevs); s32 ixgbe_get_inactive_nvm_ver(struct ixgbe_hw *hw, struct ixgbe_nvm_info *nvm); s32 ixgbe_get_active_nvm_ver(struct ixgbe_hw *hw, struct ixgbe_nvm_info *nvm); s32 ixgbe_init_nvm(struct ixgbe_hw *hw); -- 2.43.5
[PATCH v1 02/15] net/ixgbe/base: add missing ACI definitions
When adding Admin Command Interface and E610 device support, some ACI capabilities definition code was missed due to the way shared driver code snapshot was generated. Add missing code paths. Fixes: 25b48e569f2f ("net/ixgbe/base: add E610 Admin Command Interface") Fixes: 7c3bfffda43d ("net/ixgbe/base: detect E610 device capabilities") Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_e610.c | 49 drivers/net/ixgbe/base/ixgbe_type_e610.h | 4 ++ 2 files changed, 53 insertions(+) diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c b/drivers/net/ixgbe/base/ixgbe_e610.c index 3e2be07731..1f3a4532da 100644 --- a/drivers/net/ixgbe/base/ixgbe_e610.c +++ b/drivers/net/ixgbe/base/ixgbe_e610.c @@ -671,6 +671,9 @@ ixgbe_parse_common_caps(struct ixgbe_hw *hw, struct ixgbe_hw_common_caps *caps, case IXGBE_ACI_CAPS_VALID_FUNCTIONS: caps->valid_functions = number; break; + case IXGBE_ACI_CAPS_SRIOV: + caps->sr_iov_1_1 = (number == 1); + break; case IXGBE_ACI_CAPS_VMDQ: caps->vmdq = (number == 1); break; @@ -833,6 +836,25 @@ ixgbe_parse_valid_functions_cap(struct ixgbe_hw *hw, hw->logical_pf_id = ixgbe_func_id_to_logical_id(number, hw->pf_id); } +/** + * ixgbe_parse_vf_dev_caps - Parse IXGBE_ACI_CAPS_VF device caps + * @hw: pointer to the HW struct + * @dev_p: pointer to device capabilities structure + * @cap: capability element to parse + * + * Parse IXGBE_ACI_CAPS_VF for device capabilities. 
+ */ +static void ixgbe_parse_vf_dev_caps(struct ixgbe_hw *hw, + struct ixgbe_hw_dev_caps *dev_p, + struct ixgbe_aci_cmd_list_caps_elem *cap) +{ + u32 number = IXGBE_LE32_TO_CPU(cap->number); + + UNREFERENCED_1PARAMETER(hw); + + dev_p->num_vfs_exposed = number; +} + /** * ixgbe_parse_vsi_dev_caps - Parse IXGBE_ACI_CAPS_VSI device caps * @hw: pointer to the HW struct @@ -944,6 +966,9 @@ static void ixgbe_parse_dev_caps(struct ixgbe_hw *hw, ixgbe_parse_valid_functions_cap(hw, dev_p, &cap_resp[i]); break; + case IXGBE_ACI_CAPS_VF: + ixgbe_parse_vf_dev_caps(hw, dev_p, &cap_resp[i]); + break; case IXGBE_ACI_CAPS_VSI: ixgbe_parse_vsi_dev_caps(hw, dev_p, &cap_resp[i]); break; @@ -962,6 +987,27 @@ static void ixgbe_parse_dev_caps(struct ixgbe_hw *hw, } +/** + * ixgbe_parse_vf_func_caps - Parse IXGBE_ACI_CAPS_VF function caps + * @hw: pointer to the HW struct + * @func_p: pointer to function capabilities structure + * @cap: pointer to the capability element to parse + * + * Extract function capabilities for IXGBE_ACI_CAPS_VF. 
+ */ +static void ixgbe_parse_vf_func_caps(struct ixgbe_hw *hw, +struct ixgbe_hw_func_caps *func_p, +struct ixgbe_aci_cmd_list_caps_elem *cap) +{ + u32 logical_id = IXGBE_LE32_TO_CPU(cap->logical_id); + u32 number = IXGBE_LE32_TO_CPU(cap->number); + + UNREFERENCED_1PARAMETER(hw); + + func_p->num_allocd_vfs = number; + func_p->vf_base_id = logical_id; +} + /** * ixgbe_get_num_per_func - determine number of resources per PF * @hw: pointer to the HW structure @@ -1073,6 +1119,9 @@ static void ixgbe_parse_func_caps(struct ixgbe_hw *hw, &cap_resp[i], "func caps"); switch (cap) { + case IXGBE_ACI_CAPS_VF: + ixgbe_parse_vf_func_caps(hw, func_p, &cap_resp[i]); + break; case IXGBE_ACI_CAPS_VSI: ixgbe_parse_vsi_func_caps(hw, func_p, &cap_resp[i]); break; diff --git a/drivers/net/ixgbe/base/ixgbe_type_e610.h b/drivers/net/ixgbe/base/ixgbe_type_e610.h index 9e72053e2a..dcb874e42e 100644 --- a/drivers/net/ixgbe/base/ixgbe_type_e610.h +++ b/drivers/net/ixgbe/base/ixgbe_type_e610.h @@ -672,6 +672,8 @@ struct ixgbe_aci_cmd_list_caps_elem { __le16 cap; #define IXGBE_ACI_CAPS_VALID_FUNCTIONS 0x0005 #define IXGBE_ACI_MAX_VALID_FUNCTIONS 0x8 +#define IXGBE_ACI_CAPS_SRIOV 0x0012 +#define IXGBE_ACI_CAPS_VF 0x0013 #define IXGBE_ACI_CAPS_VMDQ0x0014 #define IXGBE_ACI_CAPS_VSI 0x0017 #define IXGBE_ACI_CAPS_DCB 0x0018 @@ -1954,6 +1956,8 @@ struct ixgbe_hw_common_caps { #define IXGBE_MAX_SUPPORTED_GPIO_SDP 8 u8 led[IXGBE_MAX_SUPPORTED_GPIO_LED]; u8 sdp[IXGBE_MAX_SUPPORTED_GPIO_SDP]; +
[PATCH v1 03/15] net/ixgbe/base: add missing E610 definitions
When adding support for E610 bringup, some definitions and code paths were accidentally omitted due to the way the shared driver snapshot was created. Add missing definitions and code paths. Fixes: 316637762a5f ("net/ixgbe/base: enable E610 device") Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_api.c | 5 + drivers/net/ixgbe/base/ixgbe_type.h | 2 ++ drivers/net/ixgbe/base/ixgbe_type_e610.h | 3 +++ 3 files changed, 10 insertions(+) diff --git a/drivers/net/ixgbe/base/ixgbe_api.c b/drivers/net/ixgbe/base/ixgbe_api.c index c8f9a6d9f1..b4920867bc 100644 --- a/drivers/net/ixgbe/base/ixgbe_api.c +++ b/drivers/net/ixgbe/base/ixgbe_api.c @@ -87,6 +87,7 @@ s32 ixgbe_init_shared_code(struct ixgbe_hw *hw) case ixgbe_mac_X550_vf: case ixgbe_mac_X550EM_x_vf: case ixgbe_mac_X550EM_a_vf: + case ixgbe_mac_E610_vf: status = ixgbe_init_ops_vf(hw); break; case ixgbe_mac_E610: @@ -219,6 +220,10 @@ s32 ixgbe_set_mac_type(struct ixgbe_hw *hw) hw->mac.type = ixgbe_mac_E610; hw->mvals = ixgbe_mvals_X550EM_a; break; + case IXGBE_DEV_ID_E610_VF: + hw->mac.type = ixgbe_mac_E610_vf; + hw->mvals = ixgbe_mvals_X550EM_a; + break; default: ret_val = IXGBE_ERR_DEVICE_NOT_SUPPORTED; ERROR_REPORT2(IXGBE_ERROR_UNSUPPORTED, diff --git a/drivers/net/ixgbe/base/ixgbe_type.h b/drivers/net/ixgbe/base/ixgbe_type.h index d86049426e..f6d5052c65 100644 --- a/drivers/net/ixgbe/base/ixgbe_type.h +++ b/drivers/net/ixgbe/base/ixgbe_type.h @@ -130,6 +130,7 @@ #define IXGBE_DEV_ID_E610_10G_T0x57B0 #define IXGBE_DEV_ID_E610_2_5G_T 0x57B1 #define IXGBE_DEV_ID_E610_SGMII0x57B2 +#define IXGBE_DEV_ID_E610_VF 0x57AD #define IXGBE_CAT(r, m) IXGBE_##r##m @@ -3676,6 +3677,7 @@ enum ixgbe_mac_type { ixgbe_mac_X550EM_x_vf, ixgbe_mac_X550EM_a_vf, ixgbe_mac_E610, + ixgbe_mac_E610_vf, ixgbe_num_macs }; diff --git a/drivers/net/ixgbe/base/ixgbe_type_e610.h b/drivers/net/ixgbe/base/ixgbe_type_e610.h index dcb874e42e..ab57852f19 100644 --- a/drivers/net/ixgbe/base/ixgbe_type_e610.h +++ 
b/drivers/net/ixgbe/base/ixgbe_type_e610.h @@ -2080,6 +2080,8 @@ struct ixgbe_orom_civd_info { /* Function specific capabilities */ struct ixgbe_hw_func_caps { struct ixgbe_hw_common_caps common_cap; + u32 num_allocd_vfs; /* Number of allocated VFs */ + u32 vf_base_id; /* Logical ID of the first VF */ u32 guar_num_vsi; struct ixgbe_ts_func_info ts_func_info; bool no_drop_policy_ena; @@ -2088,6 +2090,7 @@ struct ixgbe_hw_func_caps { /* Device wide capabilities */ struct ixgbe_hw_dev_caps { struct ixgbe_hw_common_caps common_cap; + u32 num_vfs_exposed;/* Total number of VFs exposed */ u32 num_vsi_allocd_to_host; /* Excluding EMP VSI */ u32 num_flow_director_fltr; /* Number of FD filters available */ struct ixgbe_ts_dev_info ts_dev_info; -- 2.43.5
[PATCH v1 04/15] net/ixgbe/base: add missing legacy mailbox API
When the new mailbox API was introduced, the legacy mailbox API was also provided, but was missing from the patches due to the way the patches were generated. This patch adds the missing legacy mailbox API to the driver. Fixes: 6d243d2caf2c ("net/ixgbe/base: introduce new mailbox API") Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_mbx.c | 44 ++ drivers/net/ixgbe/base/ixgbe_mbx.h | 1 + 2 files changed, 45 insertions(+) diff --git a/drivers/net/ixgbe/base/ixgbe_mbx.c b/drivers/net/ixgbe/base/ixgbe_mbx.c index 444a0d339d..23659266d0 100644 --- a/drivers/net/ixgbe/base/ixgbe_mbx.c +++ b/drivers/net/ixgbe/base/ixgbe_mbx.c @@ -169,6 +169,26 @@ s32 ixgbe_check_for_rst(struct ixgbe_hw *hw, u16 mbx_id) return ret_val; } +/** + * ixgbe_clear_mbx - Clear Mailbox Memory + * @hw: pointer to the HW structure + * @mbx_id: id of mailbox to write + * + * Set VFMBMEM of given VF to 0x0. + **/ +s32 ixgbe_clear_mbx(struct ixgbe_hw *hw, u16 mbx_id) +{ + struct ixgbe_mbx_info *mbx = &hw->mbx; + s32 ret_val = IXGBE_ERR_CONFIG; + + DEBUGFUNC("ixgbe_clear_mbx"); + + if (mbx->ops[mbx_id].clear) + ret_val = mbx->ops[mbx_id].clear(hw, mbx_id); + + return ret_val; +} + /** * ixgbe_poll_for_msg - Wait for message notification * @hw: pointer to the HW structure @@ -628,6 +648,7 @@ void ixgbe_init_mbx_params_vf(struct ixgbe_hw *hw) mbx->ops[0].check_for_msg = ixgbe_check_for_msg_vf; mbx->ops[0].check_for_ack = ixgbe_check_for_ack_vf; mbx->ops[0].check_for_rst = ixgbe_check_for_rst_vf; + mbx->ops[0].clear = NULL; mbx->stats.msgs_tx = 0; mbx->stats.msgs_rx = 0; @@ -1024,6 +1045,27 @@ STATIC s32 ixgbe_read_mbx_pf(struct ixgbe_hw *hw, u32 *msg, u16 size, return IXGBE_SUCCESS; } +/** + * ixgbe_clear_mbx_pf - Clear Mailbox Memory + * @hw: pointer to the HW structure + * @vf_id: the VF index + * + * Set VFMBMEM of given VF to 0x0. 
+ **/ +STATIC s32 ixgbe_clear_mbx_pf(struct ixgbe_hw *hw, u16 vf_id) +{ + u16 mbx_size = hw->mbx.size; + u16 i; + + if (vf_id > 63) + return IXGBE_ERR_PARAM; + + for (i = 0; i < mbx_size; ++i) + IXGBE_WRITE_REG_ARRAY(hw, IXGBE_PFMBMEM(vf_id), i, 0x0); + + return IXGBE_SUCCESS; +} + /** * ixgbe_init_mbx_params_pf_id - set initial values for pf mailbox * @hw: pointer to the HW structure @@ -1042,6 +1084,7 @@ void ixgbe_init_mbx_params_pf_id(struct ixgbe_hw *hw, u16 vf_id) mbx->ops[vf_id].check_for_msg = ixgbe_check_for_msg_pf; mbx->ops[vf_id].check_for_ack = ixgbe_check_for_ack_pf; mbx->ops[vf_id].check_for_rst = ixgbe_check_for_rst_pf; + mbx->ops[vf_id].clear = ixgbe_clear_mbx_pf; } /** @@ -1119,6 +1162,7 @@ void ixgbe_upgrade_mbx_params_pf(struct ixgbe_hw *hw, u16 vf_id) mbx->ops[vf_id].check_for_msg = ixgbe_check_for_msg_pf; mbx->ops[vf_id].check_for_ack = ixgbe_check_for_ack_pf; mbx->ops[vf_id].check_for_rst = ixgbe_check_for_rst_pf; + mbx->ops[vf_id].clear = ixgbe_clear_mbx_pf; mbx->stats.msgs_tx = 0; mbx->stats.msgs_rx = 0; diff --git a/drivers/net/ixgbe/base/ixgbe_mbx.h b/drivers/net/ixgbe/base/ixgbe_mbx.h index 56ab435286..434f7c6a69 100644 --- a/drivers/net/ixgbe/base/ixgbe_mbx.h +++ b/drivers/net/ixgbe/base/ixgbe_mbx.h @@ -168,6 +168,7 @@ s32 ixgbe_write_mbx(struct ixgbe_hw *hw, u32 *msg, u16 size, u16 mbx_id); s32 ixgbe_check_for_msg(struct ixgbe_hw *hw, u16 mbx_id); s32 ixgbe_check_for_ack(struct ixgbe_hw *hw, u16 mbx_id); s32 ixgbe_check_for_rst(struct ixgbe_hw *hw, u16 mbx_id); +s32 ixgbe_clear_mbx(struct ixgbe_hw *hw, u16 vf_number); void ixgbe_init_mbx_params_vf(struct ixgbe_hw *hw); void ixgbe_upgrade_mbx_params_vf(struct ixgbe_hw *hw); void ixgbe_init_mbx_params_pf(struct ixgbe_hw *hw); -- 2.43.5
[PATCH v1 05/15] net/ixgbe/base: add E610 VF HV macro
From: Jedrzej Jagielski At this point there is no macro specific to E610 VF HV. Add it to ixgbe_type.h. Signed-off-by: Jedrzej Jagielski Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_type.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ixgbe/base/ixgbe_type.h b/drivers/net/ixgbe/base/ixgbe_type.h index f6d5052c65..cc49eace91 100644 --- a/drivers/net/ixgbe/base/ixgbe_type.h +++ b/drivers/net/ixgbe/base/ixgbe_type.h @@ -131,6 +131,7 @@ #define IXGBE_DEV_ID_E610_2_5G_T 0x57B1 #define IXGBE_DEV_ID_E610_SGMII0x57B2 #define IXGBE_DEV_ID_E610_VF 0x57AD +#define IXGBE_SUBDEV_ID_E610_VF_HV 0x0001 #define IXGBE_CAT(r, m) IXGBE_##r##m -- 2.43.5
[PATCH v1 06/15] net/ixgbe/base: fix unchecked return value
From: Barbara Skobiej The return value of ixgbe_read_eeprom was not checked in the ixgbe_stop_mac_link_on_d3_82599 function. Check the return value and only act on the EEPROM data when the read succeeded. Signed-off-by: Barbara Skobiej Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_82599.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/ixgbe/base/ixgbe_82599.c b/drivers/net/ixgbe/base/ixgbe_82599.c index c4ad906f0f..3110477700 100644 --- a/drivers/net/ixgbe/base/ixgbe_82599.c +++ b/drivers/net/ixgbe/base/ixgbe_82599.c @@ -556,13 +556,15 @@ enum ixgbe_media_type ixgbe_get_media_type_82599(struct ixgbe_hw *hw) **/ void ixgbe_stop_mac_link_on_d3_82599(struct ixgbe_hw *hw) { - u32 autoc2_reg; u16 ee_ctrl_2 = 0; + u32 autoc2_reg; + u32 status; DEBUGFUNC("ixgbe_stop_mac_link_on_d3_82599"); - ixgbe_read_eeprom(hw, IXGBE_EEPROM_CTRL_2, &ee_ctrl_2); + status = ixgbe_read_eeprom(hw, IXGBE_EEPROM_CTRL_2, &ee_ctrl_2); - if (!ixgbe_mng_present(hw) && !hw->wol_enabled && + if (status == IXGBE_SUCCESS && + !ixgbe_mng_present(hw) && !hw->wol_enabled && ee_ctrl_2 & IXGBE_EEPROM_CCD_BIT) { autoc2_reg = IXGBE_READ_REG(hw, IXGBE_AUTOC2); autoc2_reg |= IXGBE_AUTOC2_LINK_DISABLE_ON_D3_MASK; -- 2.43.5
[PATCH v1 07/15] net/ixgbe/base: fix media type handling for E610
From: Krzysztof Galazka Media type information should not be updated by ixgbe_aci_get_link_info function because it will be incorrectly set as unknown when link is down. Do it only in ixgbe_get_media_type_E610. Signed-off-by: Krzysztof Galazka Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_e610.c | 8 +++- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c b/drivers/net/ixgbe/base/ixgbe_e610.c index 1f3a4532da..b9b1ba32c3 100644 --- a/drivers/net/ixgbe/base/ixgbe_e610.c +++ b/drivers/net/ixgbe/base/ixgbe_e610.c @@ -1683,7 +1683,6 @@ s32 ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool ena_lse, struct ixgbe_aci_cmd_get_link_status *resp; struct ixgbe_link_status *li_old, *li; struct ixgbe_fc_info *hw_fc_info; - enum ixgbe_media_type *hw_media_type; struct ixgbe_aci_desc desc; bool tx_pause, rx_pause; u8 cmd_flags; @@ -1693,7 +1692,6 @@ s32 ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool ena_lse, return IXGBE_ERR_PARAM; li_old = &hw->link.link_info_old; - hw_media_type = &hw->phy.media_type; li = &hw->link.link_info; hw_fc_info = &hw->fc; @@ -1714,7 +1712,6 @@ s32 ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool ena_lse, li->link_speed = IXGBE_LE16_TO_CPU(link_data.link_speed); li->phy_type_low = IXGBE_LE64_TO_CPU(link_data.phy_type_low); li->phy_type_high = IXGBE_LE64_TO_CPU(link_data.phy_type_high); - *hw_media_type = ixgbe_get_media_type_from_phy_type(hw); li->link_info = link_data.link_info; li->link_cfg_err = link_data.link_cfg_err; li->an_info = link_data.an_info; @@ -3664,10 +3661,11 @@ enum ixgbe_media_type ixgbe_get_media_type_E610(struct ixgbe_hw *hw) } } - /* Based on search above try to discover media type */ - hw->phy.media_type = ixgbe_get_media_type_from_phy_type(hw); } + /* Based on link status or search above try to discover media type */ + hw->phy.media_type = ixgbe_get_media_type_from_phy_type(hw); + return hw->phy.media_type; } -- 2.43.5
[PATCH v1 08/15] net/ixgbe/base: fix speed autonegotiation on E610
From: Krzysztof Galazka When the user changed the advertised speed settings and the link was already up, the driver asked FW only for the active PHY configuration. This prevented it from adding speeds which are supported but were earlier disabled by the user. Get all speeds supported by HW to allow the user to enable any of them. Signed-off-by: Krzysztof Galazka Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_e610.c | 16 +--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c b/drivers/net/ixgbe/base/ixgbe_e610.c index b9b1ba32c3..6eaf377f4a 100644 --- a/drivers/net/ixgbe/base/ixgbe_e610.c +++ b/drivers/net/ixgbe/base/ixgbe_e610.c @@ -4342,7 +4342,8 @@ s32 ixgbe_setup_phy_link_E610(struct ixgbe_hw *hw) { struct ixgbe_aci_cmd_get_phy_caps_data pcaps; struct ixgbe_aci_cmd_set_phy_cfg_data pcfg; - u8 rmode = IXGBE_ACI_REPORT_ACTIVE_CFG; + u8 rmode = IXGBE_ACI_REPORT_TOPO_CAP_MEDIA; + u64 sup_phy_type_low, sup_phy_type_high; s32 rc; rc = ixgbe_aci_get_link_info(hw, false, NULL); @@ -4359,6 +4360,15 @@ s32 ixgbe_setup_phy_link_E610(struct ixgbe_hw *hw) goto err; } + sup_phy_type_low = pcaps.phy_type_low; + sup_phy_type_high = pcaps.phy_type_high; + + /* Get Active configuration to avoid unintended changes */ + rc = ixgbe_aci_get_phy_caps(hw, false, IXGBE_ACI_REPORT_ACTIVE_CFG, + &pcaps); + if (rc) { + goto err; + } ixgbe_copy_phy_caps_to_cfg(&pcaps, &pcfg); /* Set default PHY types for a given speed */ @@ -4406,8 +4416,8 @@ s32 ixgbe_setup_phy_link_E610(struct ixgbe_hw *hw) } /* Mask the set values to avoid requesting unsupported link types */ - pcfg.phy_type_low &= pcaps.phy_type_low; - pcfg.phy_type_high &= pcaps.phy_type_high; + pcfg.phy_type_low &= sup_phy_type_low; + pcfg.phy_type_high &= sup_phy_type_high; if (pcfg.phy_type_high != pcaps.phy_type_high || pcfg.phy_type_low != pcaps.phy_type_low || -- 2.43.5
[PATCH v1 09/15] net/ixgbe/base: FW API version update
From: Pawel Malinowski Update FW API version to 1.7. Signed-off-by: Pawel Malinowski Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_type_e610.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ixgbe/base/ixgbe_type_e610.h b/drivers/net/ixgbe/base/ixgbe_type_e610.h index ab57852f19..bad332c6b8 100644 --- a/drivers/net/ixgbe/base/ixgbe_type_e610.h +++ b/drivers/net/ixgbe/base/ixgbe_type_e610.h @@ -351,7 +351,7 @@ */ #define IXGBE_FW_API_VER_BRANCH0x00 #define IXGBE_FW_API_VER_MAJOR 0x01 -#define IXGBE_FW_API_VER_MINOR 0x05 +#define IXGBE_FW_API_VER_MINOR 0x07 #define IXGBE_FW_API_VER_DIFF_ALLOWED 0x02 #define IXGBE_ACI_DESC_SIZE32 -- 2.43.5
[PATCH v1 10/15] net/ixgbe/base: handle 5G link speed for E610
From: Piotr Kwapulinski When detecting the 5G link speed take into account the E610 VF MAC type in ixgbe_check_mac_link_vf(). Signed-off-by: Piotr Kwapulinski Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_vf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ixgbe/base/ixgbe_vf.c b/drivers/net/ixgbe/base/ixgbe_vf.c index 0d5b29ba50..37556a9300 100644 --- a/drivers/net/ixgbe/base/ixgbe_vf.c +++ b/drivers/net/ixgbe/base/ixgbe_vf.c @@ -628,7 +628,8 @@ s32 ixgbe_check_mac_link_vf(struct ixgbe_hw *hw, ixgbe_link_speed *speed, break; case IXGBE_LINKS_SPEED_100_82599: *speed = IXGBE_LINK_SPEED_100_FULL; - if (hw->mac.type == ixgbe_mac_X550_vf) { + if (hw->mac.type == ixgbe_mac_X550_vf || + hw->mac.type == ixgbe_mac_E610_vf) { if (links_reg & IXGBE_LINKS_SPEED_NON_STD) *speed = IXGBE_LINK_SPEED_5GB_FULL; } -- 2.43.5
[PATCH v1 11/15] net/ixgbe/base: remove FW API version check
From: Krzysztof Galazka Only certain variants of drivers rely on FW API version check in shared code. Other drivers implement their own logic due to differences in requirements. DPDK does not require the FW API check. Signed-off-by: Krzysztof Galazka Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_e610.c | 31 + 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c b/drivers/net/ixgbe/base/ixgbe_e610.c index 6eaf377f4a..7ea495db97 100644 --- a/drivers/net/ixgbe/base/ixgbe_e610.c +++ b/drivers/net/ixgbe/base/ixgbe_e610.c @@ -3541,32 +3541,7 @@ s32 ixgbe_reset_hw_E610(struct ixgbe_hw *hw) reset_hw_out: return status; } -/** - * ixgbe_fw_ver_check - Check the reported FW API version - * @hw: pointer to the hardware structure - * - * Checks if the driver should load on a given FW API version. - * - * Return: 'true' if the driver should attempt to load. 'false' otherwise. - */ -static bool ixgbe_fw_ver_check(struct ixgbe_hw *hw) -{ - if (hw->api_maj_ver > IXGBE_FW_API_VER_MAJOR) { - ERROR_REPORT1(IXGBE_ERROR_UNSUPPORTED, "The driver for the device stopped because the NVM image is newer than expected. You must install the most recent version of the network driver.\n"); - return false; - } else if (hw->api_maj_ver == IXGBE_FW_API_VER_MAJOR) { - if (hw->api_min_ver > - (IXGBE_FW_API_VER_MINOR + IXGBE_FW_API_VER_DIFF_ALLOWED)) { - ERROR_REPORT1(IXGBE_ERROR_CAUTION, "The driver for the device detected a newer version of the NVM image than expected. Please install the most recent version of the network driver.\n"); - } else if ((hw->api_min_ver + IXGBE_FW_API_VER_DIFF_ALLOWED) < - IXGBE_FW_API_VER_MINOR) { - ERROR_REPORT1(IXGBE_ERROR_CAUTION, "The driver for the device detected an older version of the NVM image than expected. Please update the NVM image.\n"); - } - } else { - ERROR_REPORT1(IXGBE_ERROR_CAUTION, "The driver for the device detected an older version of the NVM image than expected. 
Please update the NVM image.\n"); - } - return true; -} + /** * ixgbe_start_hw_E610 - Prepare hardware for Tx/Rx * @hw: pointer to hardware structure @@ -3584,10 +3559,6 @@ s32 ixgbe_start_hw_E610(struct ixgbe_hw *hw) if (ret_val) goto out; - if (!ixgbe_fw_ver_check(hw)) { - ret_val = IXGBE_ERR_FW_API_VER; - goto out; - } ret_val = ixgbe_start_hw_generic(hw); if (ret_val != IXGBE_SUCCESS) goto out; -- 2.43.5
[PATCH v1 12/15] net/ixgbe/base: disable thermal sensor ops for E610
From: Andrzej Wilczynski According to data sheet, E610 doesn't expose current reading from thermal sensors. Currently, E610 sensor ops are the same as for X540, which will include the unsupported op. This patch disables those ops for E610 to avoid attempts to read those sensors. Signed-off-by: Andrzej Wilczynski Co-authored-by: RemigiuszX Konca Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_e610.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c b/drivers/net/ixgbe/base/ixgbe_e610.c index 7ea495db97..ab02b11d6a 100644 --- a/drivers/net/ixgbe/base/ixgbe_e610.c +++ b/drivers/net/ixgbe/base/ixgbe_e610.c @@ -3431,6 +3431,8 @@ s32 ixgbe_init_ops_E610(struct ixgbe_hw *hw) mac->ops.get_fw_tsam_mode = ixgbe_get_fw_tsam_mode_E610; mac->ops.get_fw_version = ixgbe_aci_get_fw_ver; mac->ops.get_nvm_version = ixgbe_get_active_nvm_ver; + mac->ops.get_thermal_sensor_data = NULL; + mac->ops.init_thermal_sensor_thresh = NULL; /* PHY */ phy->ops.init = ixgbe_init_phy_ops_E610; -- 2.43.5
[PATCH v1 13/15] net/ixgbe/base: fix mailbox ACK handling
From: NorbertX Ciosek Check if CTS bit is set in the mailbox message before waiting for ACK. Otherwise ACK will never be received causing the function to timeout. Add a note for ixgbe_write_mbx that it should be called while holding a lock. Signed-off-by: NorbertX Ciosek Signed-off-by: Anatoly Burakov --- drivers/net/ixgbe/base/ixgbe_mbx.c | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ixgbe/base/ixgbe_mbx.c b/drivers/net/ixgbe/base/ixgbe_mbx.c index 23659266d0..fb8ea8ca68 100644 --- a/drivers/net/ixgbe/base/ixgbe_mbx.c +++ b/drivers/net/ixgbe/base/ixgbe_mbx.c @@ -82,6 +82,9 @@ s32 ixgbe_poll_mbx(struct ixgbe_hw *hw, u32 *msg, u16 size, u16 mbx_id) * * returns SUCCESS if it successfully copied message into the buffer and * received an ACK to that message within specified period + * + * Note that the caller to this function must lock before calling, since + * multiple threads can destroy each other messages. **/ s32 ixgbe_write_mbx(struct ixgbe_hw *hw, u32 *msg, u16 size, u16 mbx_id) { @@ -836,6 +839,11 @@ STATIC s32 ixgbe_obtain_mbx_lock_pf(struct ixgbe_hw *hw, u16 vf_id) while (countdown--) { /* Reserve mailbox for PF use */ pf_mailbox = IXGBE_READ_REG(hw, IXGBE_PFMAILBOX(vf_id)); + + /* Check if other thread holds the PF lock already */ + if (pf_mailbox & IXGBE_PFMAILBOX_PFU) + goto retry; + pf_mailbox |= IXGBE_PFMAILBOX_PFU; IXGBE_WRITE_REG(hw, IXGBE_PFMAILBOX(vf_id), pf_mailbox); @@ -846,6 +854,7 @@ STATIC s32 ixgbe_obtain_mbx_lock_pf(struct ixgbe_hw *hw, u16 vf_id) break; } + retry: /* Wait a bit before trying again */ usec_delay(mbx->usec_delay); } @@ -948,13 +957,14 @@ STATIC s32 ixgbe_write_mbx_pf(struct ixgbe_hw *hw, u32 *msg, u16 size, for (i = 0; i < size; i++) IXGBE_WRITE_REG_ARRAY(hw, IXGBE_PFMBMEM(vf_id), i, msg[i]); - /* Interrupt VF to tell it a message has been sent */ + /* interrupt VF to tell it a message has been sent */ pf_mailbox = IXGBE_READ_REG(hw, IXGBE_PFMAILBOX(vf_id)); pf_mailbox |= IXGBE_PFMAILBOX_STS; 
IXGBE_WRITE_REG(hw, IXGBE_PFMAILBOX(vf_id), pf_mailbox); /* if msg sent wait until we receive an ack */ - ixgbe_poll_for_ack(hw, vf_id); + if (msg[0] & IXGBE_VT_MSGTYPE_CTS) + ixgbe_poll_for_ack(hw, vf_id); /* update stats */ hw->mbx.stats.msgs_tx++; -- 2.43.5