from:"Anatoly Burakov"

[dpdk-dev] [21.08 PATCH v3 1/1] power: refactor pstate sysfs handling

2021-04-22 Thread Anatoly Burakov

Currently, pstate sysfs handling code is a bit of an unmaintainable
mess, which has contributed to various errors leading to bugs. Refactor
the code in a way that makes it more maintainable and less error prone.

Signed-off-by: Anatoly Burakov 
---
 lib/power/meson.build|   7 +
 lib/power/power_pstate_cpufreq.c | 357 ---
 2 files changed, 191 insertions(+), 173 deletions(-)

diff --git a/lib/power/meson.build b/lib/power/meson.build
index a2cc9fe2ef..85324d48d2 100644
--- a/lib/power/meson.build
+++ b/lib/power/meson.build
@@ -5,6 +5,13 @@ if not is_linux
 build = false
 reason = 'only supported on Linux'
 endif
+
+# we do some snprintf magic so silence format-nonliteral
+flag_nonliteral = '-Wno-format-nonliteral'
+if cc.has_argument(flag_nonliteral)
+   cflags += flag_nonliteral
+endif
+
 sources = files(
 'guest_channel.c',
 'power_acpi_cpufreq.c',
diff --git a/lib/power/power_pstate_cpufreq.c b/lib/power/power_pstate_cpufreq.c
index 2cfc54acf3..4357ac4920 100644
--- a/lib/power/power_pstate_cpufreq.c
+++ b/lib/power/power_pstate_cpufreq.c
@@ -37,6 +37,13 @@
} \
 } while (0)
 
+#define FOPEN_OR_ERR_GOTO(f, label) do { \
+   if ((f) == NULL) { \
+   RTE_LOG(ERR, POWER, "File not opened\n"); \
+   goto label; \
+   } \
+} while (0)
+
 #define FOPS_OR_NULL_GOTO(ret, label) do { \
if ((ret) == NULL) { \
RTE_LOG(ERR, POWER, "fgets returns nothing\n"); \
@@ -148,97 +155,145 @@ out: close(fd);
return ret;
 }
 
+static int
+open_core_sysfs_file(const char *template, unsigned int core, const char *mode,
+   FILE **f)
+{
+   char fullpath[PATH_MAX];
+   FILE *tmpf;
+
+   /* silenced -Wformat-nonliteral here */
+   snprintf(fullpath, sizeof(fullpath), template, core);
+   tmpf = fopen(fullpath, mode);
+   if (tmpf == NULL)
+   return -1;
+   *f = tmpf;
+
+   return 0;
+}
+
+static int
+read_core_sysfs_u32(FILE *f, uint32_t *val)
+{
+   char buf[BUFSIZ];
+   uint32_t fval;
+   char *s;
+
+   s = fgets(buf, sizeof(buf), f);
+   if (s == NULL)
+   return -1;
+
+   /* fgets puts null terminator in, but do this just in case */
+   buf[BUFSIZ - 1] = '\0';
+
+   /* strip off any terminating newlines */
+   *strchrnul(buf, '\n') = '\0';
+
+   fval = strtoul(buf, NULL, POWER_CONVERT_TO_DECIMAL);
+
+   /* write the value */
+   *val = fval;
+
+   return 0;
+}
+
+static int
+read_core_sysfs_s(FILE *f, char *buf, unsigned int len)
+{
+   char *s;
+
+   s = fgets(buf, len, f);
+   if (s == NULL)
+   return -1;
+
+   /* fgets puts null terminator in, but do this just in case */
+   buf[len - 1] = '\0';
+
+   /* strip off any terminating newlines */
+   *strchrnul(buf, '\n') = '\0';
+
+   return 0;
+}
+
+static int
+write_core_sysfs_s(FILE *f, const char *str)
+{
+   int ret;
+
+   ret = fseek(f, 0, SEEK_SET);
+   if (ret != 0)
+   return -1;
+
+   ret = fputs(str, f);
+   if (ret != 0)
+   return -1;
+
+   /* flush the output */
+   ret = fflush(f);
+   if (ret != 0)
+   return -1;
+
+   return 0;
+}
+
 /**
  * It is to fopen the sys file for the future setting the lcore frequency.
  */
 static int
 power_init_for_setting_freq(struct pstate_power_info *pi)
 {
-   FILE *f_min, *f_max, *f_base = NULL, *f_base_max;
-   char fullpath_min[PATH_MAX];
-   char fullpath_max[PATH_MAX];
-   char fullpath_base[PATH_MAX];
-   char fullpath_base_max[PATH_MAX];
-   char buf_base[BUFSIZ];
-   char *s_base;
-   char *s_base_max;
-   uint32_t base_ratio = 0;
-   uint32_t base_max_ratio = 0;
-   uint64_t max_non_turbo = 0;
-   int  ret_val = 0;
-
-   snprintf(fullpath_base_max,
-   sizeof(fullpath_base_max),
-   POWER_SYSFILE_BASE_MAX_FREQ,
-   pi->lcore_id);
-   f_base_max = fopen(fullpath_base_max, "r");
-   FOPEN_OR_ERR_RET(f_base_max, -1);
-   if (f_base_max != NULL) {
-   s_base_max = fgets(buf_base, sizeof(buf_base), f_base_max);
-
-   /* close the file unconditionally */
-   fclose(f_base_max);
-   f_base_max = NULL;
-
-   FOPS_OR_NULL_GOTO(s_base_max, out);
-
-   buf_base[BUFSIZ-1] = '\0';
-   if (strlen(buf_base))
-   /* Strip off terminating '\n' */
-   strtok(buf_base, "\n");
-
-   base_max_ratio =
-   strtoul(buf_base, NULL, POWER_CONVERT_TO_DECIMAL)
-

[dpdk-dev] [PATCH v1 1/1] power: do not skip saving original acpi governor

2021-04-23 Thread Anatoly Burakov

Currently, when we set the acpi governor to "userspace", we check if
it is already set to this value, and if it is, we skip setting it.

However, we never save this value anywhere, so that next time we come
back and request the governor to be set to its original value, the
original value is empty.

Fix it by saving the original pstate governor first. While we're at it,
replace `strlcpy` with `rte_strscpy`.

Fixes: 445c6528b55f ("power: common interface for guest and host")
Cc: david.h...@intel.com

Signed-off-by: Anatoly Burakov 
---
 lib/power/power_acpi_cpufreq.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/power/power_acpi_cpufreq.c b/lib/power/power_acpi_cpufreq.c
index 84a9d75207..d028a9947f 100644
--- a/lib/power/power_acpi_cpufreq.c
+++ b/lib/power/power_acpi_cpufreq.c
@@ -152,6 +152,9 @@ power_set_governor_userspace(struct rte_power_info *pi)
/* Strip off terminating '\n' */
strtok(buf, "\n");
 
+   /* Save the original governor */
+   rte_strscpy(pi->governor_ori, buf, sizeof(pi->governor_ori));
+
/* Check if current governor is userspace */
if (strncmp(buf, POWER_GOVERNOR_USERSPACE,
sizeof(POWER_GOVERNOR_USERSPACE)) == 0) {
@@ -160,8 +163,6 @@ power_set_governor_userspace(struct rte_power_info *pi)
"already userspace\n", pi->lcore_id);
goto out;
}
-   /* Save the original governor */
-   strlcpy(pi->governor_ori, buf, sizeof(pi->governor_ori));
 
/* Write 'userspace' to the governor */
val = fseek(f, 0, SEEK_SET);
-- 
2.25.1

[dpdk-dev] [21.08 PATCH v4 1/2] power: don't use rte prefix in internal code

2021-04-23 Thread Anatoly Burakov

Currently, ACPI code uses rte_power_info as the struct name, which
gives the appearance that this is an externally visible API. Fix to
use internal namespace.

Signed-off-by: Anatoly Burakov 
---
 lib/power/power_acpi_cpufreq.c | 34 +-
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/lib/power/power_acpi_cpufreq.c b/lib/power/power_acpi_cpufreq.c
index d028a9947f..1b8c69cc8b 100644
--- a/lib/power/power_acpi_cpufreq.c
+++ b/lib/power/power_acpi_cpufreq.c
@@ -78,7 +78,7 @@ enum power_state {
 /**
  * Power info per lcore.
  */
-struct rte_power_info {
+struct acpi_power_info {
unsigned int lcore_id;   /**< Logical core id */
uint32_t freqs[RTE_MAX_LCORE_FREQS]; /**< Frequency array */
uint32_t nb_freqs;   /**< number of available freqs */
@@ -90,14 +90,14 @@ struct rte_power_info {
uint16_t turbo_enable;   /**< Turbo Boost enable/disable */
 } __rte_cache_aligned;
 
-static struct rte_power_info lcore_power_info[RTE_MAX_LCORE];
+static struct acpi_power_info lcore_power_info[RTE_MAX_LCORE];
 
 /**
  * It is to set specific freq for specific logical core, according to the index
  * of supported frequencies.
  */
 static int
-set_freq_internal(struct rte_power_info *pi, uint32_t idx)
+set_freq_internal(struct acpi_power_info *pi, uint32_t idx)
 {
if (idx >= RTE_MAX_LCORE_FREQS || idx >= pi->nb_freqs) {
RTE_LOG(ERR, POWER, "Invalid frequency index %u, which "
@@ -133,7 +133,7 @@ set_freq_internal(struct rte_power_info *pi, uint32_t idx)
  * governor will be saved for rolling back.
  */
 static int
-power_set_governor_userspace(struct rte_power_info *pi)
+power_set_governor_userspace(struct acpi_power_info *pi)
 {
FILE *f;
int ret = -1;
@@ -189,7 +189,7 @@ power_set_governor_userspace(struct rte_power_info *pi)
  * sys file.
  */
 static int
-power_get_available_freqs(struct rte_power_info *pi)
+power_get_available_freqs(struct acpi_power_info *pi)
 {
FILE *f;
int ret = -1, i, count;
@@ -259,7 +259,7 @@ power_get_available_freqs(struct rte_power_info *pi)
  * It is to fopen the sys file for the future setting the lcore frequency.
  */
 static int
-power_init_for_setting_freq(struct rte_power_info *pi)
+power_init_for_setting_freq(struct acpi_power_info *pi)
 {
FILE *f;
char fullpath[PATH_MAX];
@@ -299,7 +299,7 @@ power_acpi_cpufreq_check_supported(void)
 int
 power_acpi_cpufreq_init(unsigned int lcore_id)
 {
-   struct rte_power_info *pi;
+   struct acpi_power_info *pi;
uint32_t exp_state;
 
if (lcore_id >= RTE_MAX_LCORE) {
@@ -374,7 +374,7 @@ power_acpi_cpufreq_init(unsigned int lcore_id)
  * needed by writing the sys file.
  */
 static int
-power_set_governor_original(struct rte_power_info *pi)
+power_set_governor_original(struct acpi_power_info *pi)
 {
FILE *f;
int ret = -1;
@@ -420,7 +420,7 @@ power_set_governor_original(struct rte_power_info *pi)
 int
 power_acpi_cpufreq_exit(unsigned int lcore_id)
 {
-   struct rte_power_info *pi;
+   struct acpi_power_info *pi;
uint32_t exp_state;
 
if (lcore_id >= RTE_MAX_LCORE) {
@@ -475,7 +475,7 @@ power_acpi_cpufreq_exit(unsigned int lcore_id)
 uint32_t
 power_acpi_cpufreq_freqs(unsigned int lcore_id, uint32_t *freqs, uint32_t num)
 {
-   struct rte_power_info *pi;
+   struct acpi_power_info *pi;
 
if (lcore_id >= RTE_MAX_LCORE) {
RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
@@ -522,7 +522,7 @@ power_acpi_cpufreq_set_freq(unsigned int lcore_id, uint32_t 
index)
 int
 power_acpi_cpufreq_freq_down(unsigned int lcore_id)
 {
-   struct rte_power_info *pi;
+   struct acpi_power_info *pi;
 
if (lcore_id >= RTE_MAX_LCORE) {
RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
@@ -540,7 +540,7 @@ power_acpi_cpufreq_freq_down(unsigned int lcore_id)
 int
 power_acpi_cpufreq_freq_up(unsigned int lcore_id)
 {
-   struct rte_power_info *pi;
+   struct acpi_power_info *pi;
 
if (lcore_id >= RTE_MAX_LCORE) {
RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
@@ -581,7 +581,7 @@ power_acpi_cpufreq_freq_max(unsigned int lcore_id)
 int
 power_acpi_cpufreq_freq_min(unsigned int lcore_id)
 {
-   struct rte_power_info *pi;
+   struct acpi_power_info *pi;
 
if (lcore_id >= RTE_MAX_LCORE) {
RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
@@ -598,7 +598,7 @@ power_acpi_cpufreq_freq_min(unsigned int lcore_id)
 int
 power_acpi_turbo_status(unsigned int lcore_id)
 {
-   struct rte_power_info *pi;
+   struct acpi_power_info *pi;
 
if (lcore_id >= RTE_MAX_LCORE) {
RTE_LOG(ERR, POWER, "Invalid lcore ID\n");
@@ -614,7 +614,7 @@ power_acpi_turbo_status(unsigned int lcore_id)
 int
 power_acpi_enable

[dpdk-dev] [21.08 PATCH v4 2/2] power: refactor pstate and acpi code

2021-04-23 Thread Anatoly Burakov

Currently, ACPI and PSTATE modes have lots of code duplication,
confusing logic, and a bunch of other issues that can, and have, led to
various bugs and resource leaks.

This commit factors out the common parts of sysfs reading/writing for
ACPI and PSTATE drivers.

Signed-off-by: Anatoly Burakov 
---
 lib/power/meson.build|   7 +
 lib/power/power_acpi_cpufreq.c   | 178 +++--
 lib/power/power_common.c | 133 +
 lib/power/power_common.h |  46 +
 lib/power/power_pstate_cpufreq.c | 332 ---
 5 files changed, 293 insertions(+), 403 deletions(-)

diff --git a/lib/power/meson.build b/lib/power/meson.build
index a2cc9fe2ef..85324d48d2 100644
--- a/lib/power/meson.build
+++ b/lib/power/meson.build
@@ -5,6 +5,13 @@ if not is_linux
 build = false
 reason = 'only supported on Linux'
 endif
+
+# we do some snprintf magic so silence format-nonliteral
+flag_nonliteral = '-Wno-format-nonliteral'
+if cc.has_argument(flag_nonliteral)
+   cflags += flag_nonliteral
+endif
+
 sources = files(
 'guest_channel.c',
 'power_acpi_cpufreq.c',
diff --git a/lib/power/power_acpi_cpufreq.c b/lib/power/power_acpi_cpufreq.c
index 1b8c69cc8b..97f1d302c9 100644
--- a/lib/power/power_acpi_cpufreq.c
+++ b/lib/power/power_acpi_cpufreq.c
@@ -19,41 +19,10 @@
 #include "power_acpi_cpufreq.h"
 #include "power_common.h"
 
-#ifdef RTE_LIBRTE_POWER_DEBUG
-#define POWER_DEBUG_TRACE(fmt, args...) do { \
-   RTE_LOG(ERR, POWER, "%s: " fmt, __func__, ## args); \
-} while (0)
-#else
-#define POWER_DEBUG_TRACE(fmt, args...)
-#endif
-
-#define FOPEN_OR_ERR_RET(f, retval) do { \
-   if ((f) == NULL) { \
-   RTE_LOG(ERR, POWER, "File not opened\n"); \
-   return retval; \
-   } \
-} while (0)
-
-#define FOPS_OR_NULL_GOTO(ret, label) do { \
-   if ((ret) == NULL) { \
-   RTE_LOG(ERR, POWER, "fgets returns nothing\n"); \
-   goto label; \
-   } \
-} while (0)
-
-#define FOPS_OR_ERR_GOTO(ret, label) do { \
-   if ((ret) < 0) { \
-   RTE_LOG(ERR, POWER, "File operations failed\n"); \
-   goto label; \
-   } \
-} while (0)
-
 #define STR_SIZE 1024
 #define POWER_CONVERT_TO_DECIMAL 10
 
 #define POWER_GOVERNOR_USERSPACE "userspace"
-#define POWER_SYSFILE_GOVERNOR   \
-   "/sys/devices/system/cpu/cpu%u/cpufreq/scaling_governor"
 #define POWER_SYSFILE_AVAIL_FREQ \

"/sys/devices/system/cpu/cpu%u/cpufreq/scaling_available_frequencies"
 #define POWER_SYSFILE_SETSPEED   \
@@ -135,53 +104,18 @@ set_freq_internal(struct acpi_power_info *pi, uint32_t 
idx)
 static int
 power_set_governor_userspace(struct acpi_power_info *pi)
 {
-   FILE *f;
-   int ret = -1;
-   char buf[BUFSIZ];
-   char fullpath[PATH_MAX];
-   char *s;
-   int val;
-
-   snprintf(fullpath, sizeof(fullpath), POWER_SYSFILE_GOVERNOR,
-   pi->lcore_id);
-   f = fopen(fullpath, "rw+");
-   FOPEN_OR_ERR_RET(f, ret);
-
-   s = fgets(buf, sizeof(buf), f);
-   FOPS_OR_NULL_GOTO(s, out);
-   /* Strip off terminating '\n' */
-   strtok(buf, "\n");
-
-   /* Save the original governor */
-   rte_strscpy(pi->governor_ori, buf, sizeof(pi->governor_ori));
-
-   /* Check if current governor is userspace */
-   if (strncmp(buf, POWER_GOVERNOR_USERSPACE,
-   sizeof(POWER_GOVERNOR_USERSPACE)) == 0) {
-   ret = 0;
-   POWER_DEBUG_TRACE("Power management governor of lcore %u is "
-   "already userspace\n", pi->lcore_id);
-   goto out;
-   }
-
-   /* Write 'userspace' to the governor */
-   val = fseek(f, 0, SEEK_SET);
-   FOPS_OR_ERR_GOTO(val, out);
-
-   val = fputs(POWER_GOVERNOR_USERSPACE, f);
-   FOPS_OR_ERR_GOTO(val, out);
-
-   /* We need to flush to see if the fputs succeeds */
-   val = fflush(f);
-   FOPS_OR_ERR_GOTO(val, out);
-
-   ret = 0;
-   RTE_LOG(INFO, POWER, "Power management governor of lcore %u has been "
-   "set to user space successfully\n", pi->lcore_id);
-out:
-   fclose(f);
-
-   return ret;
+   return power_set_governor(pi->lcore_id, POWER_GOVERNOR_USERSPACE,
+   pi->governor_ori, sizeof(pi->governor_ori));
+}
+
+/**
+ * It is to check the governor and then set the original governor back if
+ * needed by writing the sys file.
+ */
+static int
+power_set_governor_original(struct acpi_power_info *pi)
+{
+   return power_set_governor(pi->lcore_id, p

[dpdk-dev] [PATCH v1 2/2] net/i40e: allow get_monitor_addr for VF driver

2021-04-26 Thread Anatoly Burakov

When .get_monitor_addr API was introduced, it was implemented in the
i40e driver, but only for the physical function; the virtual function
portion of the driver does not support that API.

Add the missing function pointer to VF device structure.

The i40e driver is not meant to use the VF portion any more, as
currently i40e VF devices are supposed to be managed by the iavf driver, but
add this just in case it needs backporting later.

Fixes: a683abf90a22 ("net/i40e: implement power management API")

Signed-off-by: Anatoly Burakov 
---
 drivers/net/i40e/i40e_ethdev_vf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/i40e/i40e_ethdev_vf.c 
b/drivers/net/i40e/i40e_ethdev_vf.c
index 3c258ba7cf..156ad9ab96 100644
--- a/drivers/net/i40e/i40e_ethdev_vf.c
+++ b/drivers/net/i40e/i40e_ethdev_vf.c
@@ -216,6 +216,7 @@ static const struct eth_dev_ops i40evf_eth_dev_ops = {
.mtu_set  = i40evf_dev_mtu_set,
.mac_addr_set = i40evf_set_default_mac_addr,
.tx_done_cleanup  = i40e_tx_done_cleanup,
+   .get_monitor_addr = i40e_get_monitor_addr
 };
 
 /*
-- 
2.25.1

[dpdk-dev] [PATCH v1 1/2] net/ixgbe: allow get_monitor_addr for VF driver

2021-04-26 Thread Anatoly Burakov

When .get_monitor_addr API was introduced, it was implemented in the
ixgbe driver, but only for the physical function; the virtual function
portion of the driver does not support that API.

Add the missing function pointer to VF device structure.

Fixes: 3982b7967bb7 ("net/ixgbe: implement power management API")

Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/ixgbe_ethdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ixgbe/ixgbe_ethdev.c b/drivers/net/ixgbe/ixgbe_ethdev.c
index ff65145f55..6cca039a11 100644
--- a/drivers/net/ixgbe/ixgbe_ethdev.c
+++ b/drivers/net/ixgbe/ixgbe_ethdev.c
@@ -605,6 +605,7 @@ static const struct eth_dev_ops ixgbevf_eth_dev_ops = {
.rss_hash_update  = ixgbe_dev_rss_hash_update,
.rss_hash_conf_get= ixgbe_dev_rss_hash_conf_get,
.tx_done_cleanup  = ixgbe_dev_tx_done_cleanup,
+   .get_monitor_addr = ixgbe_get_monitor_addr,
 };
 
 /* store statistics names and its offset in stats structure */
-- 
2.25.1

[dpdk-dev] [PATCH v2 0/7] Enhancements for PMD power management

2021-06-25 Thread Anatoly Burakov

This patchset introduces several changes related to PMD power management:

- Changed monitoring intrinsics to use callbacks as a comparison function, based
  on previous patchset [1] but incorporating feedback [2] - this hopefully will
  make it possible to add support for .get_monitor_addr in virtio
- Add a new intrinsic to monitor multiple addresses, based on RTM instruction
  set and the TPAUSE instruction
- Add support for PMD power management on multiple queues, as well as all
  accompanying infrastructure and example apps changes

v2:
- Changed check inversion to callbacks
- Addressed feedback from Konstantin
- Added doc updates where necessary

[1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=*
[2] 
http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274

Anatoly Burakov (7):
  power_intrinsics: use callbacks for comparison
  net/af_xdp: add power monitor support
  eal: add power monitor for multiple events
  power: remove thread safety from PMD power API's
  power: support callbacks for multiple Rx queues
  power: support monitoring multiple Rx queues
  l3fwd-power: support multiqueue in PMD pmgmt modes

 doc/guides/prog_guide/power_man.rst   |  83 ++-
 doc/guides/rel_notes/release_21_08.rst|  11 +
 drivers/event/dlb2/dlb2.c |  16 +-
 drivers/net/af_xdp/rte_eth_af_xdp.c   |  33 +
 drivers/net/i40e/i40e_rxtx.c  |  19 +-
 drivers/net/iavf/iavf_rxtx.c  |  19 +-
 drivers/net/ice/ice_rxtx.c|  19 +-
 drivers/net/ixgbe/ixgbe_rxtx.c|  19 +-
 drivers/net/mlx5/mlx5_rx.c|  16 +-
 examples/l3fwd-power/main.c   |  39 +-
 lib/eal/arm/rte_power_intrinsics.c|  11 +
 lib/eal/include/generic/rte_cpuflags.h|   2 +
 .../include/generic/rte_power_intrinsics.h|  64 +-
 lib/eal/ppc/rte_power_intrinsics.c|  11 +
 lib/eal/version.map   |   3 +
 lib/eal/x86/rte_cpuflags.c|   2 +
 lib/eal/x86/rte_power_intrinsics.c|  78 ++-
 lib/power/meson.build |   3 +
 lib/power/rte_power_pmd_mgmt.c| 574 +-
 lib/power/rte_power_pmd_mgmt.h|  40 ++
 lib/power/version.map |   3 +
 21 files changed, 841 insertions(+), 224 deletions(-)

-- 
2.25.1

[dpdk-dev] [PATCH v2 1/7] power_intrinsics: use callbacks for comparison

2021-06-25 Thread Anatoly Burakov

Previously, the semantics of power monitor were such that we were
checking current value against the expected value, and if they matched,
then the sleep was aborted. This is somewhat inflexible, because it only
allowed us to check for a specific value.

This commit replaces the comparison with a user callback mechanism, so
that any PMD (or other code) using `rte_power_monitor()` can define
their own comparison semantics and decision making on how to detect the
need to abort the entering of power optimized state.

Existing implementations are adjusted to follow the new semantics.

Suggested-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Use callback mechanism for more flexibility
- Address feedback from Konstantin

 doc/guides/rel_notes/release_21_08.rst|  1 +
 drivers/event/dlb2/dlb2.c | 16 --
 drivers/net/i40e/i40e_rxtx.c  | 19 
 drivers/net/iavf/iavf_rxtx.c  | 19 
 drivers/net/ice/ice_rxtx.c| 19 
 drivers/net/ixgbe/ixgbe_rxtx.c| 19 
 drivers/net/mlx5/mlx5_rx.c| 16 --
 .../include/generic/rte_power_intrinsics.h| 29 ++-
 lib/eal/x86/rte_power_intrinsics.c|  9 ++
 9 files changed, 106 insertions(+), 41 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index a6ecfdf3ce..c84ac280f5 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -84,6 +84,7 @@ API Changes
Also, make sure to start the actual text at the margin.
===
 
+* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
 
 ABI Changes
 ---
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index eca183753f..14dfac257c 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -3154,6 +3154,15 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num)
}
 }
 
+#define CLB_MASK_IDX 0
+#define CLB_VAL_IDX 1
+static int
+dlb2_monitor_callback(const uint64_t val, const uint64_t opaque[4])
+{
+   /* abort if the value matches */
+   return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 : 0;
+}
+
 static inline int
 dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
  struct dlb2_eventdev_port *ev_port,
@@ -3194,8 +3203,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
expected_value = 0;
 
pmc.addr = monitor_addr;
-   pmc.val = expected_value;
-   pmc.mask = qe_mask.raw_qe[1];
+   /* store expected value and comparison mask in opaque data */
+   pmc.opaque[CLB_VAL_IDX] = expected_value;
+   pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1];
+   /* set up callback */
+   pmc.fn = dlb2_monitor_callback;
pmc.size = sizeof(uint64_t);
 
rte_power_monitor(&pmc, timeout + start_ticks);
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 6c58decece..45f3fbf4ec 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -81,6 +81,17 @@
 #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \
(PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK)
 
+static int
+i40e_monitor_callback(const uint64_t value, const uint64_t arg[4] __rte_unused)
+{
+   const uint64_t m = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+   /*
+* we expect the DD bit to be set to 1 if this descriptor was already
+* written to.
+*/
+   return (value & m) == m ? -1 : 0;
+}
+
 int
 i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
 {
@@ -93,12 +104,8 @@ i40e_get_monitor_addr(void *rx_queue, struct 
rte_power_monitor_cond *pmc)
/* watch for changes in status bit */
pmc->addr = &rxdp->wb.qword1.status_error_len;
 
-   /*
-* we expect the DD bit to be set to 1 if this descriptor was already
-* written to.
-*/
-   pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
-   pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+   /* comparison callback */
+   pmc->fn = i40e_monitor_callback;
 
/* registers are 64-bit */
pmc->size = sizeof(uint64_t);
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 0361af0d85..6e12ecce07 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -57,6 +57,17 @@ iavf_proto_xtr_type_to_rxdid(uint8_t flex_type)
rxdid_map[flex_type] : IAVF_RXDID_COMMS_OVS_1;
 }
 
+static int
+iavf_monitor_callback(const uint64_t value, const uint64_t arg[4] __rte_unus

[dpdk-dev] [PATCH v2 2/7] net/af_xdp: add power monitor support

2021-06-25 Thread Anatoly Burakov

Implement support for .get_monitor_addr in AF_XDP driver.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Rewrite using the callback mechanism

 drivers/net/af_xdp/rte_eth_af_xdp.c | 33 +
 1 file changed, 33 insertions(+)

diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c 
b/drivers/net/af_xdp/rte_eth_af_xdp.c
index eb5660a3dc..8b9c89c3e8 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "compat.h"
 
@@ -788,6 +789,37 @@ eth_dev_configure(struct rte_eth_dev *dev)
return 0;
 }
 
+#define CLB_VAL_IDX 0
+static int
+eth_monitor_callback(const uint64_t value, const uint64_t opaque[4])
+{
+   const uint64_t v = opaque[CLB_VAL_IDX];
+   const uint64_t m = (uint32_t)~0;
+
+   /* if the value has changed, abort entering power optimized state */
+   return (value & m) == v ? 0 : -1;
+}
+
+static int
+eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
+{
+   struct pkt_rx_queue *rxq = rx_queue;
+   unsigned int *prod = rxq->rx.producer;
+   const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */
+
+   /* watch for changes in producer ring */
+   pmc->addr = (void*)prod;
+
+   /* store current value */
+   pmc->opaque[CLB_VAL_IDX] = cur_val;
+   pmc->fn = eth_monitor_callback;
+
+   /* AF_XDP producer ring index is 32-bit */
+   pmc->size = sizeof(uint32_t);
+
+   return 0;
+}
+
 static int
 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
@@ -1448,6 +1480,7 @@ static const struct eth_dev_ops ops = {
.link_update = eth_link_update,
.stats_get = eth_stats_get,
.stats_reset = eth_stats_reset,
+   .get_monitor_addr = eth_get_monitor_addr
 };
 
 /** parse busy_budget argument */
-- 
2.25.1

[dpdk-dev] [PATCH v2 3/7] eal: add power monitor for multiple events

2021-06-25 Thread Anatoly Burakov

Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
what UMWAIT does, but without the limitation of having to listen for
just one event. This works because the optimized power state used by the
TPAUSE instruction will cause a wake up on RTM transaction abort, so if
we add the addresses we're interested in to the read-set, any write to
those addresses will wake us up.

Signed-off-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Adapt to callback mechanism

 doc/guides/rel_notes/release_21_08.rst|  2 +
 lib/eal/arm/rte_power_intrinsics.c| 11 +++
 lib/eal/include/generic/rte_cpuflags.h|  2 +
 .../include/generic/rte_power_intrinsics.h| 35 ++
 lib/eal/ppc/rte_power_intrinsics.c| 11 +++
 lib/eal/version.map   |  3 +
 lib/eal/x86/rte_cpuflags.c|  2 +
 lib/eal/x86/rte_power_intrinsics.c| 69 +++
 8 files changed, 135 insertions(+)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index c84ac280f5..9d1cfac395 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -55,6 +55,8 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+* eal: added ``rte_power_monitor_multi`` to support waiting for multiple 
events.
+
 
 Removed Items
 -
diff --git a/lib/eal/arm/rte_power_intrinsics.c 
b/lib/eal/arm/rte_power_intrinsics.c
index e83f04072a..78f55b7203 100644
--- a/lib/eal/arm/rte_power_intrinsics.c
+++ b/lib/eal/arm/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp)
+{
+   RTE_SET_USED(pmc);
+   RTE_SET_USED(num);
+   RTE_SET_USED(tsc_timestamp);
+
+   return -ENOTSUP;
+}
diff --git a/lib/eal/include/generic/rte_cpuflags.h 
b/lib/eal/include/generic/rte_cpuflags.h
index 28a5aecde8..d35551e931 100644
--- a/lib/eal/include/generic/rte_cpuflags.h
+++ b/lib/eal/include/generic/rte_cpuflags.h
@@ -24,6 +24,8 @@ struct rte_cpu_intrinsics {
/**< indicates support for rte_power_monitor function */
uint32_t power_pause : 1;
/**< indicates support for rte_power_pause function */
+   uint32_t power_monitor_multi : 1;
+   /**< indicates support for rte_power_monitor_multi function */
 };
 
 /**
diff --git a/lib/eal/include/generic/rte_power_intrinsics.h 
b/lib/eal/include/generic/rte_power_intrinsics.h
index 046667ade6..877fb282cb 100644
--- a/lib/eal/include/generic/rte_power_intrinsics.h
+++ b/lib/eal/include/generic/rte_power_intrinsics.h
@@ -124,4 +124,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id);
 __rte_experimental
 int rte_power_pause(const uint64_t tsc_timestamp);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Monitor a set of addresses for changes. This will cause the CPU to enter an
+ * architecture-defined optimized power state until either one of the specified
+ * memory addresses is written to, a certain TSC timestamp is reached, or other
+ * reasons cause the CPU to wake up.
+ *
+ * Additionally, `expected` 64-bit values and 64-bit masks are provided. If
+ * mask is non-zero, the current value pointed to by the `p` pointer will be
+ * checked against the expected value, and if they do not match, the entering 
of
+ * optimized power state may be aborted.
+ *
+ * @warning It is responsibility of the user to check if this function is
+ *   supported at runtime using `rte_cpu_get_intrinsics_support()` API call.
+ *   Failing to do so may result in an illegal CPU instruction error.
+ *
+ * @param pmc
+ *   An array of monitoring condition structures.
+ * @param num
+ *   Length of the `pmc` array.
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for. Note that the wait behavior is
+ *   architecture-dependent.
+ *
+ * @return
+ *   0 on success
+ *   -EINVAL on invalid parameters
+ *   -ENOTSUP if unsupported
+ */
+__rte_experimental
+int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp);
+
 #endif /* _RTE_POWER_INTRINSIC_H_ */
diff --git a/lib/eal/ppc/rte_power_intrinsics.c 
b/lib/eal/ppc/rte_power_intrinsics.c
index 7fc9586da7..f00b58ade5 100644
--- a/lib/eal/ppc/rte_power_intrinsics.c
+++ b/lib/eal/ppc/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp)
+{
+   RTE_SET_USED(pmc);
+   RTE_SET_USED(num);
+

[dpdk-dev] [PATCH v2 4/7] power: remove thread safety from PMD power API's

2021-06-25 Thread Anatoly Burakov

Currently, we expect that only one callback can be active at any given
moment, for a particular queue configuration, which is relatively easy
to implement in a thread-safe way. However, we're about to add support
for multiple queues per lcore, which will greatly increase the
possibility of various race conditions.

We could have used something like an RCU for this use case, but absent
a pressing need for thread safety, we'll go the easy way and just
mandate that the APIs are to be called when all affected ports are
stopped, and document this limitation. This greatly simplifies the
`rte_power_monitor`-related code.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Add check for stopped queue
- Clarified doc message
- Added release notes

 doc/guides/rel_notes/release_21_08.rst |   5 +
 lib/power/meson.build  |   3 +
 lib/power/rte_power_pmd_mgmt.c | 133 ++---
 lib/power/rte_power_pmd_mgmt.h |   6 ++
 4 files changed, 67 insertions(+), 80 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index 9d1cfac395..f015c509fc 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -88,6 +88,11 @@ API Changes
 
 * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
 
+* rte_power: The experimental PMD power management API is no longer considered
+  to be thread safe; all Rx queues affected by the API will now need to be
+  stopped before making any changes to the power management scheme.
+
+
 ABI Changes
 ---
 
diff --git a/lib/power/meson.build b/lib/power/meson.build
index c1097d32f1..4f6a242364 100644
--- a/lib/power/meson.build
+++ b/lib/power/meson.build
@@ -21,4 +21,7 @@ headers = files(
 'rte_power_pmd_mgmt.h',
 'rte_power_guest_channel.h',
 )
+if cc.has_argument('-Wno-cast-qual')
+cflags += '-Wno-cast-qual'
+endif
 deps += ['timer', 'ethdev']
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index db03cbf420..9b95cf1794 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -40,8 +40,6 @@ struct pmd_queue_cfg {
/**< Callback mode for this queue */
const struct rte_eth_rxtx_callback *cur_cb;
/**< Callback instance */
-   volatile bool umwait_in_progress;
-   /**< are we currently sleeping? */
uint64_t empty_poll_stats;
/**< Number of empty polls */
 } __rte_cache_aligned;
@@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf 
**pkts __rte_unused,
struct rte_power_monitor_cond pmc;
uint16_t ret;
 
-   /*
-* we might get a cancellation request while being
-* inside the callback, in which case the wakeup
-* wouldn't work because it would've arrived too early.
-*
-* to get around this, we notify the other thread that
-* we're sleeping, so that it can spin until we're done.
-* unsolicited wakeups are perfectly safe.
-*/
-   q_conf->umwait_in_progress = true;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-   /* check if we need to cancel sleep */
-   if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
-   /* use monitoring condition to sleep */
-   ret = rte_eth_get_monitor_addr(port_id, qidx,
-   &pmc);
-   if (ret == 0)
-   rte_power_monitor(&pmc, UINT64_MAX);
-   }
-   q_conf->umwait_in_progress = false;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+   /* use monitoring condition to sleep */
+   ret = rte_eth_get_monitor_addr(port_id, qidx,
+   &pmc);
+   if (ret == 0)
+   rte_power_monitor(&pmc, UINT64_MAX);
}
} else
q_conf->empty_poll_stats = 0;
@@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx,
return nb_rx;
 }
 
+/* 1 if the Rx queue is stopped, 0 if it is not, -1 if its info is unavailable */
+static int
+queue_stopped(const uint16_t port_id, const uint16_t queue_id)
+{
+   struct rte_eth_rxq_info info;
+
+   if (rte_eth_rx_queue_info_get(port_id, queue_id, &info) < 0)
+       return -1;
+   return info.queue_state == RTE_ETH_QUEUE_STATE_STOPPED ? 1 : 0;
+}
+
 int
 rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint

[dpdk-dev] [PATCH v2 6/7] power: support monitoring multiple Rx queues

2021-06-25 Thread Anatoly Burakov

Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
Rx queues while entering the energy efficient power state. The multi
version will be used unconditionally if supported, and the UMWAIT one
will only be used when multi-monitor is not supported by the hardware.

Signed-off-by: Anatoly Burakov 
---
 doc/guides/prog_guide/power_man.rst |  9 ++--
 lib/power/rte_power_pmd_mgmt.c  | 76 -
 2 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index 38f876466a..defb61bdc4 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain 
number.
 The "monitor" mode is only supported in the following configurations and 
scenarios:
 
 * If ``rte_cpu_get_intrinsics_support()`` function indicates that
+  ``rte_power_monitor_multi()`` function is supported by the platform, then
+  monitoring multiple Ethernet Rx queues for traffic will be supported.
+
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
   ``rte_power_monitor()`` is supported by the platform, then monitoring will be
   limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
   monitored from a different lcore).
 
-* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
-  ``rte_power_monitor()`` function is not supported, then monitor mode will not
-  be supported.
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of 
the
+  two monitoring functions are supported, then monitor mode will not be 
supported.
 
 * Not all Ethernet devices support monitoring, even if the underlying
   platform may support the necessary CPU instructions. Support for monitoring 
is
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index 7762cd39b8..aab2d4f1ee 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -155,6 +155,24 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union 
queue *q)
return 0;
 }
 
+/* Fill pmc[] with one monitoring condition per queue in cfg's queue list. */
+static inline int
+get_monitor_addresses(struct pmd_core_cfg *cfg,
+       struct rte_power_monitor_cond *pmc)
+{
+   const struct queue_list_entry *qle;
+   size_t i = 0; /* next unused slot in pmc[] */
+
+   TAILQ_FOREACH(qle, &cfg->head, next) {
+       const union queue *q = &qle->queue;
+       struct rte_power_monitor_cond *cur = &pmc[i++]; /* own slot per queue */
+       const int ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
+       if (ret < 0)
+           return ret; /* propagate the first failure unchanged */
+   }
+   return 0;
+}
+
 static void
 calc_tsc(void)
 {
@@ -183,6 +201,48 @@ calc_tsc(void)
}
 }
 
+/* Rx callback: after more than EMPTYPOLL_MAX uninterrupted empty polls of the
+ * lcore's power save queue, sleep while monitoring all of the lcore's queues. */
+static uint16_t
+clb_multiwait(uint16_t port_id, uint16_t qidx,
+       struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+       uint16_t max_pkts __rte_unused, void *addr __rte_unused)
+{
+   const unsigned int lcore = rte_lcore_id();
+   const union queue q = {.portid = port_id, .qid = qidx};
+   const bool empty = nb_rx == 0;
+   struct pmd_core_cfg *q_conf = &lcore_cfg[lcore];
+
+   /* early exit */
+   if (likely(!empty)) {
+       q_conf->empty_poll_stats = 0;
+   } else {
+       /* do we care about this particular queue? */
+       if (!queue_is_power_save(q_conf, &q))
+           return nb_rx;
+
+       /*
+        * we can increment unconditionally here because if there were
+        * non-empty polls in other queues assigned to this core, we
+        * dropped the counter to zero anyway.
+        */
+       q_conf->empty_poll_stats++;
+       if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+           struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS];
+           int ret = -1; /* int: the helper reports negative error codes */
+
+           /* gather all monitoring conditions, but only if pmc[] can hold them */
+           if (q_conf->n_queues <= RTE_MAX_ETHPORTS)
+               ret = get_monitor_addresses(q_conf, pmc);
+           if (ret == 0)
+               rte_power_monitor_multi(pmc,
+                       q_conf->n_queues, UINT64_MAX);
+       }
+   }
+
+   return nb_rx;
+}
+
 static uint16_t
 clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts 
__rte_unused,
uint16_t nb_rx, uint16_t max_pkts __rte_unused,
@@ -348,14 +408,19 @@ static int
 check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
 {
struct rte_power_monitor_cond dummy;
+   bool multimonitor_supported;
 
/* check if rte_power_monitor is supported */
if (!global_data.intrinsics_support.power_monitor) {
RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not 
supported\n");
ret

[dpdk-dev] [PATCH v2 5/7] power: support callbacks for multiple Rx queues

2021-06-25 Thread Anatoly Burakov

Currently, there is a hard limitation on the PMD power management
support that only allows it to support a single queue per lcore. This is
not ideal as most DPDK use cases will poll multiple queues per core.

The PMD power management mechanism relies on ethdev Rx callbacks, so it
is very difficult to implement such support because callbacks are
effectively stateless and have no visibility into what the other ethdev
devices are doing. This places limitations on what we can do within the
framework of Rx callbacks, but the basics of this implementation are as
follows:

- Replace per-queue structures with per-lcore ones, so that any device
  polled from the same lcore can share data
- Any queue that is going to be polled from a specific lcore has to be
  added to the list of queues to poll, so that the callback is aware of
  other queues being polled by the same lcore
- Both the empty poll counter and the actual power saving mechanism are
  shared between all queues polled on a particular lcore, and are only
  activated when a specially designated "power saving" queue is polled. To
  put it another way, we have no idea which queue the user will poll in
  what order, so we rely on them telling us that queue X is the last one
  in the polling loop, so any power management should happen there.
- A new API is added to mark a specific Rx queue as "power saving".
  Failing to call this API will result in no power management, however
  when having only one queue per core it is obvious which queue is the
  "power saving" one, so things will still work without this new API for
  use cases that were previously working without it.
- The limitation on UMWAIT-based polling is not removed because UMWAIT
  is incapable of monitoring more than one address.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Use a TAILQ for queues instead of a static array
- Address feedback from Konstantin
- Add additional checks for stopped queues

 doc/guides/prog_guide/power_man.rst|  80 --
 doc/guides/rel_notes/release_21_08.rst |   3 +
 lib/power/rte_power_pmd_mgmt.c | 381 -
 lib/power/rte_power_pmd_mgmt.h |  34 +++
 lib/power/version.map  |   3 +
 5 files changed, 407 insertions(+), 94 deletions(-)

diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index c70ae128ac..38f876466a 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -198,34 +198,48 @@ Ethernet PMD Power Management API
 Abstract
 
 
-Existing power management mechanisms require developers
-to change application design or change code to make use of it.
-The PMD power management API provides a convenient alternative
-by utilizing Ethernet PMD RX callbacks,
-and triggering power saving whenever empty poll count reaches a certain number.
-
-Monitor
-   This power saving scheme will put the CPU into optimized power state
-   and use the ``rte_power_monitor()`` function
-   to monitor the Ethernet PMD RX descriptor address,
-   and wake the CPU up whenever there's new traffic.
-
-Pause
-   This power saving scheme will avoid busy polling
-   by either entering power-optimized sleep state
-   with ``rte_power_pause()`` function,
-   or, if it's not available, use ``rte_pause()``.
-
-Frequency scaling
-   This power saving scheme will use ``librte_power`` library
-   functionality to scale the core frequency up/down
-   depending on traffic volume.
-
-.. note::
-
-   Currently, this power management API is limited to mandatory mapping
-   of 1 queue to 1 core (multiple queues are supported,
-   but they must be polled from different cores).
+Existing power management mechanisms require developers to change application
+design or change code to make use of it. The PMD power management API provides 
a
+convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering
+power saving whenever empty poll count reaches a certain number.
+
+* Monitor
+   This power saving scheme will put the CPU into optimized power state and
+   monitor the Ethernet PMD RX descriptor address, waking the CPU up whenever
+   there's new traffic. Support for this scheme may not be available on all
+   platforms, and further limitations may apply (see below).
+
+* Pause
+   This power saving scheme will avoid busy polling by either entering
+   power-optimized sleep state with ``rte_power_pause()`` function, or, if it's
+   not supported by the underlying platform, use ``rte_pause()``.
+
+* Frequency scaling
+   This power saving scheme will use ``librte_power`` library functionality to
+   scale the core frequency up/down depending on traffic volume.
+
+The "monitor" mode is only supported in the following configurations and 
scenarios:
+
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that
+  ``rte_power_monitor()`` is supported by the platform, then monitoring w

[dpdk-dev] [PATCH v2 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes

2021-06-25 Thread Anatoly Burakov

Currently, l3fwd-power enforces the limitation of having one queue per
lcore. This is no longer necessary, so remove the limitation, and always
mark the last queue in qconf as the power save queue.

Signed-off-by: Anatoly Burakov 
---
 examples/l3fwd-power/main.c | 39 +++--
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index f8dfed1634..3057c06936 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -2498,6 +2498,27 @@ mode_to_str(enum appmode mode)
}
 }
 
+/* Enable PMD power management on a queue; mark it "power save" if it is last. */
+static void
+pmd_pmgmt_set_up(unsigned int lcore, uint16_t portid, uint16_t qid, bool last)
+{
+   int rc = rte_power_ethdev_pmgmt_queue_enable(lcore, portid,
+           qid, pmgmt_type);
+
+   if (rc < 0)
+       rte_exit(EXIT_FAILURE,
+           "rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n",
+           rc, portid);
+
+   /* only the last queue of the lcore is designated as the power save queue */
+   if (last) {
+       rc = rte_power_ethdev_pmgmt_queue_set_power_save(lcore, portid, qid);
+       if (rc < 0)
+           rte_exit(EXIT_FAILURE,
+               "rte_power_ethdev_pmgmt_queue_set_power_save: err=%d, port=%d\n", rc, portid);
+   }
+}
+
 int
 main(int argc, char **argv)
 {
@@ -2723,12 +2744,6 @@ main(int argc, char **argv)
printf("\nInitializing rx queues on lcore %u ... ", lcore_id );
fflush(stdout);
 
-   /* PMD power management mode can only do 1 queue per core */
-   if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) {
-   rte_exit(EXIT_FAILURE,
-   "In PMD power management mode, only one queue 
per lcore is allowed\n");
-   }
-
/* init RX queues */
for(queue = 0; queue < qconf->n_rx_queue; ++queue) {
struct rte_eth_rxconf rxq_conf;
@@ -2767,15 +2782,9 @@ main(int argc, char **argv)
 "Fail to add ptype cb\n");
}
 
-   if (app_mode == APP_MODE_PMD_MGMT) {
-   ret = rte_power_ethdev_pmgmt_queue_enable(
-   lcore_id, portid, queueid,
-   pmgmt_type);
-   if (ret < 0)
-   rte_exit(EXIT_FAILURE,
-   
"rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n",
-   ret, portid);
-   }
+   if (app_mode == APP_MODE_PMD_MGMT)
+   pmd_pmgmt_set_up(lcore_id, portid, queueid,
+   queue == (qconf->n_rx_queue - 1));
}
}
 
-- 
2.25.1

[dpdk-dev] [PATCH v3 0/7] Enhancements for PMD power management

2021-06-28 Thread Anatoly Burakov

This patchset introduces several changes related to PMD power management:

- Changed monitoring intrinsics to use callbacks as a comparison function, based
  on previous patchset [1] but incorporating feedback [2] - this hopefully will
  make it possible to add support for .get_monitor_addr in virtio
- Add a new intrinsic to monitor multiple addresses, based on RTM instruction
  set and the TPAUSE instruction
- Add support for PMD power management on multiple queues, as well as all
  accompanying infrastructure and example apps changes

v3:
- Moved some doc updates to NIC features list

v2:
- Changed check inversion to callbacks
- Addressed feedback from Konstantin
- Added doc updates where necessary

[1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=*
[2] 
http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274

Anatoly Burakov (7):
  power_intrinsics: use callbacks for comparison
  net/af_xdp: add power monitor support
  eal: add power monitor for multiple events
  power: remove thread safety from PMD power API's
  power: support callbacks for multiple Rx queues
  power: support monitoring multiple Rx queues
  l3fwd-power: support multiqueue in PMD pmgmt modes

 doc/guides/nics/features.rst  |  10 +
 doc/guides/prog_guide/power_man.rst   |  78 ++-
 doc/guides/rel_notes/release_21_08.rst|  11 +
 drivers/event/dlb2/dlb2.c |  16 +-
 drivers/net/af_xdp/rte_eth_af_xdp.c   |  33 +
 drivers/net/i40e/i40e_rxtx.c  |  19 +-
 drivers/net/iavf/iavf_rxtx.c  |  19 +-
 drivers/net/ice/ice_rxtx.c|  19 +-
 drivers/net/ixgbe/ixgbe_rxtx.c|  19 +-
 drivers/net/mlx5/mlx5_rx.c|  16 +-
 examples/l3fwd-power/main.c   |  39 +-
 lib/eal/arm/rte_power_intrinsics.c|  11 +
 lib/eal/include/generic/rte_cpuflags.h|   2 +
 .../include/generic/rte_power_intrinsics.h|  64 +-
 lib/eal/ppc/rte_power_intrinsics.c|  11 +
 lib/eal/version.map   |   3 +
 lib/eal/x86/rte_cpuflags.c|   2 +
 lib/eal/x86/rte_power_intrinsics.c|  78 ++-
 lib/power/meson.build |   3 +
 lib/power/rte_power_pmd_mgmt.c| 574 +-
 lib/power/rte_power_pmd_mgmt.h|  40 ++
 lib/power/version.map |   3 +
 22 files changed, 846 insertions(+), 224 deletions(-)

-- 
2.25.1

[dpdk-dev] [PATCH v3 1/7] power_intrinsics: use callbacks for comparison

2021-06-28 Thread Anatoly Burakov

Previously, the semantics of power monitor were such that we were
checking current value against the expected value, and if they matched,
then the sleep was aborted. This is somewhat inflexible, because it only
allowed us to check for a specific value.

This commit replaces the comparison with a user callback mechanism, so
that any PMD (or other code) using `rte_power_monitor()` can define
their own comparison semantics and decision making on how to detect the
need to abort the entering of power optimized state.

Existing implementations are adjusted to follow the new semantics.

Suggested-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Use callback mechanism for more flexibility
- Address feedback from Konstantin

 doc/guides/rel_notes/release_21_08.rst|  1 +
 drivers/event/dlb2/dlb2.c | 16 --
 drivers/net/i40e/i40e_rxtx.c  | 19 
 drivers/net/iavf/iavf_rxtx.c  | 19 
 drivers/net/ice/ice_rxtx.c| 19 
 drivers/net/ixgbe/ixgbe_rxtx.c| 19 
 drivers/net/mlx5/mlx5_rx.c| 16 --
 .../include/generic/rte_power_intrinsics.h| 29 ++-
 lib/eal/x86/rte_power_intrinsics.c|  9 ++
 9 files changed, 106 insertions(+), 41 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index a6ecfdf3ce..c84ac280f5 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -84,6 +84,7 @@ API Changes
Also, make sure to start the actual text at the margin.
===
 
+* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
 
 ABI Changes
 ---
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index eca183753f..14dfac257c 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -3154,6 +3154,15 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num)
}
 }
 
+#define CLB_MASK_IDX 0
+#define CLB_VAL_IDX 1
+static int
+dlb2_monitor_callback(const uint64_t val, const uint64_t opaque[4])
+{
+   const uint64_t masked = val & opaque[CLB_MASK_IDX];
+   /* a masked value equal to the stored one requests an abort (-1) */
+   return masked == opaque[CLB_VAL_IDX] ? -1 : 0;
+}
+
 static inline int
 dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
  struct dlb2_eventdev_port *ev_port,
@@ -3194,8 +3203,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
expected_value = 0;
 
pmc.addr = monitor_addr;
-   pmc.val = expected_value;
-   pmc.mask = qe_mask.raw_qe[1];
+   /* store expected value and comparison mask in opaque data */
+   pmc.opaque[CLB_VAL_IDX] = expected_value;
+   pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1];
+   /* set up callback */
+   pmc.fn = dlb2_monitor_callback;
pmc.size = sizeof(uint64_t);
 
rte_power_monitor(&pmc, timeout + start_ticks);
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 6c58decece..45f3fbf4ec 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -81,6 +81,17 @@
 #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \
(PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK)
 
+static int
+i40e_monitor_callback(const uint64_t value, const uint64_t arg[4] __rte_unused)
+{
+   const uint64_t dd = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+
+   /* DD bit set to 1 means this descriptor was already written to */
+   if ((value & dd) == dd)
+       return -1;
+   return 0;
+}
+
 int
 i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
 {
@@ -93,12 +104,8 @@ i40e_get_monitor_addr(void *rx_queue, struct 
rte_power_monitor_cond *pmc)
/* watch for changes in status bit */
pmc->addr = &rxdp->wb.qword1.status_error_len;
 
-   /*
-* we expect the DD bit to be set to 1 if this descriptor was already
-* written to.
-*/
-   pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
-   pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+   /* comparison callback */
+   pmc->fn = i40e_monitor_callback;
 
/* registers are 64-bit */
pmc->size = sizeof(uint64_t);
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 0361af0d85..6e12ecce07 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -57,6 +57,17 @@ iavf_proto_xtr_type_to_rxdid(uint8_t flex_type)
rxdid_map[flex_type] : IAVF_RXDID_COMMS_OVS_1;
 }
 
+static int
+iavf_monitor_callback(const uint64_t value, const uint64_t arg[4] __rte_unus

[dpdk-dev] [PATCH v3 2/7] net/af_xdp: add power monitor support

2021-06-28 Thread Anatoly Burakov

Implement support for .get_monitor_addr in AF_XDP driver.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Rewrite using the callback mechanism

 drivers/net/af_xdp/rte_eth_af_xdp.c | 33 +
 1 file changed, 33 insertions(+)

diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c 
b/drivers/net/af_xdp/rte_eth_af_xdp.c
index eb5660a3dc..8b9c89c3e8 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "compat.h"
 
@@ -788,6 +789,37 @@ eth_dev_configure(struct rte_eth_dev *dev)
return 0;
 }
 
+#define CLB_VAL_IDX 0
+static int
+eth_monitor_callback(const uint64_t value, const uint64_t opaque[4])
+{
+   const uint64_t low32 = value & (uint32_t)~0;
+
+   /* abort entering power optimized state if the value has changed */
+   if (low32 != opaque[CLB_VAL_IDX])
+       return -1;
+   return 0;
+}
+
+static int
+eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
+{
+   struct pkt_rx_queue *rxq = rx_queue;
+   /* snapshot the cached producer value for the comparison callback */
+   const uint32_t cur_val = rxq->rx.cached_prod;
+
+   /* the producer ring index is the location watched for changes */
+   pmc->addr = (void *)rxq->rx.producer;
+
+   /* AF_XDP producer ring index is 32-bit */
+   pmc->size = sizeof(uint32_t);
+
+   /* the callback compares the live value against the stored one */
+   pmc->fn = eth_monitor_callback;
+   pmc->opaque[CLB_VAL_IDX] = cur_val;
+
+   return 0;
+}
+
 static int
 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
@@ -1448,6 +1480,7 @@ static const struct eth_dev_ops ops = {
.link_update = eth_link_update,
.stats_get = eth_stats_get,
.stats_reset = eth_stats_reset,
+   .get_monitor_addr = eth_get_monitor_addr
 };
 
 /** parse busy_budget argument */
-- 
2.25.1

[dpdk-dev] [PATCH v3 3/7] eal: add power monitor for multiple events

2021-06-28 Thread Anatoly Burakov

Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
what UMWAIT does, but without the limitation of having to listen for
just one event. This works because the optimized power state used by the
TPAUSE instruction will cause a wake up on RTM transaction abort, so if
we add the addresses we're interested in to the read-set, any write to
those addresses will wake us up.

Signed-off-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Adapt to callback mechanism

 doc/guides/rel_notes/release_21_08.rst|  2 +
 lib/eal/arm/rte_power_intrinsics.c| 11 +++
 lib/eal/include/generic/rte_cpuflags.h|  2 +
 .../include/generic/rte_power_intrinsics.h| 35 ++
 lib/eal/ppc/rte_power_intrinsics.c| 11 +++
 lib/eal/version.map   |  3 +
 lib/eal/x86/rte_cpuflags.c|  2 +
 lib/eal/x86/rte_power_intrinsics.c| 69 +++
 8 files changed, 135 insertions(+)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index c84ac280f5..9d1cfac395 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -55,6 +55,8 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+* eal: added ``rte_power_monitor_multi`` to support waiting for multiple 
events.
+
 
 Removed Items
 -
diff --git a/lib/eal/arm/rte_power_intrinsics.c 
b/lib/eal/arm/rte_power_intrinsics.c
index e83f04072a..78f55b7203 100644
--- a/lib/eal/arm/rte_power_intrinsics.c
+++ b/lib/eal/arm/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp)
+{
+   RTE_SET_USED(pmc);
+   RTE_SET_USED(num);
+   RTE_SET_USED(tsc_timestamp);
+   /* stub: arguments are ignored and the call always reports -ENOTSUP */
+   return -ENOTSUP;
+}
diff --git a/lib/eal/include/generic/rte_cpuflags.h 
b/lib/eal/include/generic/rte_cpuflags.h
index 28a5aecde8..d35551e931 100644
--- a/lib/eal/include/generic/rte_cpuflags.h
+++ b/lib/eal/include/generic/rte_cpuflags.h
@@ -24,6 +24,8 @@ struct rte_cpu_intrinsics {
/**< indicates support for rte_power_monitor function */
uint32_t power_pause : 1;
/**< indicates support for rte_power_pause function */
+   uint32_t power_monitor_multi : 1;
+   /**< indicates support for rte_power_monitor_multi function */
 };
 
 /**
diff --git a/lib/eal/include/generic/rte_power_intrinsics.h 
b/lib/eal/include/generic/rte_power_intrinsics.h
index 046667ade6..877fb282cb 100644
--- a/lib/eal/include/generic/rte_power_intrinsics.h
+++ b/lib/eal/include/generic/rte_power_intrinsics.h
@@ -124,4 +124,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id);
 __rte_experimental
 int rte_power_pause(const uint64_t tsc_timestamp);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Monitor a set of addresses for changes. This will cause the CPU to enter an
+ * architecture-defined optimized power state until either one of the specified
+ * memory addresses is written to, a certain TSC timestamp is reached, or other
+ * reasons cause the CPU to wake up.
+ *
+ * Additionally, each monitoring condition carries a comparison callback and
+ * opaque data for it (see `rte_power_monitor()`). If a callback, given the
+ * current value at its monitored address, requests an abort, the entering of
+ * optimized power state may be aborted.
+ *
+ * @warning It is responsibility of the user to check if this function is
+ *   supported at runtime using `rte_cpu_get_intrinsics_support()` API call.
+ *   Failing to do so may result in an illegal CPU instruction error.
+ *
+ * @param pmc
+ *   An array of monitoring condition structures.
+ * @param num
+ *   Length of the `pmc` array.
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for. Note that the wait behavior is
+ *   architecture-dependent.
+ *
+ * @return
+ *   0 on success
+ *   -EINVAL on invalid parameters
+ *   -ENOTSUP if unsupported
+ */
+__rte_experimental
+int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp);
+
 #endif /* _RTE_POWER_INTRINSIC_H_ */
diff --git a/lib/eal/ppc/rte_power_intrinsics.c 
b/lib/eal/ppc/rte_power_intrinsics.c
index 7fc9586da7..f00b58ade5 100644
--- a/lib/eal/ppc/rte_power_intrinsics.c
+++ b/lib/eal/ppc/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp)
+{
+   RTE_SET_USED(pmc);
+   RTE_SET_USED(num);
+

[dpdk-dev] [PATCH v3 4/7] power: remove thread safety from PMD power API's

2021-06-28 Thread Anatoly Burakov

Currently, we expect that only one callback can be active at any given
moment, for a particular queue configuration, which is relatively easy
to implement in a thread-safe way. However, we're about to add support
for multiple queues per lcore, which will greatly increase the
possibility of various race conditions.

We could have used something like RCU for this use case, but absent
a pressing need for thread safety we'll go the easy way and just
mandate that the APIs are to be called when all affected ports are
stopped, and document this limitation. This greatly simplifies the
`rte_power_monitor`-related code.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Add check for stopped queue
- Clarified doc message
- Added release notes

 doc/guides/rel_notes/release_21_08.rst |   5 +
 lib/power/meson.build  |   3 +
 lib/power/rte_power_pmd_mgmt.c | 133 ++---
 lib/power/rte_power_pmd_mgmt.h |   6 ++
 4 files changed, 67 insertions(+), 80 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index 9d1cfac395..f015c509fc 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -88,6 +88,11 @@ API Changes
 
 * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
 
+* rte_power: The experimental PMD power management API is no longer considered
+  to be thread safe; all Rx queues affected by the API will now need to be
+  stopped before making any changes to the power management scheme.
+
+
 ABI Changes
 ---
 
diff --git a/lib/power/meson.build b/lib/power/meson.build
index c1097d32f1..4f6a242364 100644
--- a/lib/power/meson.build
+++ b/lib/power/meson.build
@@ -21,4 +21,7 @@ headers = files(
 'rte_power_pmd_mgmt.h',
 'rte_power_guest_channel.h',
 )
+if cc.has_argument('-Wno-cast-qual')
+cflags += '-Wno-cast-qual'
+endif
 deps += ['timer', 'ethdev']
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index db03cbf420..9b95cf1794 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -40,8 +40,6 @@ struct pmd_queue_cfg {
/**< Callback mode for this queue */
const struct rte_eth_rxtx_callback *cur_cb;
/**< Callback instance */
-   volatile bool umwait_in_progress;
-   /**< are we currently sleeping? */
uint64_t empty_poll_stats;
/**< Number of empty polls */
 } __rte_cache_aligned;
@@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf 
**pkts __rte_unused,
struct rte_power_monitor_cond pmc;
uint16_t ret;
 
-   /*
-* we might get a cancellation request while being
-* inside the callback, in which case the wakeup
-* wouldn't work because it would've arrived too early.
-*
-* to get around this, we notify the other thread that
-* we're sleeping, so that it can spin until we're done.
-* unsolicited wakeups are perfectly safe.
-*/
-   q_conf->umwait_in_progress = true;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-   /* check if we need to cancel sleep */
-   if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
-   /* use monitoring condition to sleep */
-   ret = rte_eth_get_monitor_addr(port_id, qidx,
-   &pmc);
-   if (ret == 0)
-   rte_power_monitor(&pmc, UINT64_MAX);
-   }
-   q_conf->umwait_in_progress = false;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+   /* use monitoring condition to sleep */
+   ret = rte_eth_get_monitor_addr(port_id, qidx,
+   &pmc);
+   if (ret == 0)
+   rte_power_monitor(&pmc, UINT64_MAX);
}
} else
q_conf->empty_poll_stats = 0;
@@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx,
return nb_rx;
 }
 
+/*
+ * Look up the Rx queue info for the given port/queue pair and report whether
+ * the queue is in the stopped state.
+ *
+ * Returns 1 if the queue is stopped, 0 if it is not, and -1 if the queue info
+ * could not be retrieved.
+ */
+static int
+queue_stopped(const uint16_t port_id, const uint16_t queue_id)
+{
+	struct rte_eth_rxq_info rxq_info;
+	const int rc = rte_eth_rx_queue_info_get(port_id, queue_id, &rxq_info);
+
+	if (rc < 0)
+		return -1;
+	if (rxq_info.queue_state == RTE_ETH_QUEUE_STATE_STOPPED)
+		return 1;
+	return 0;
+}
+
 int
 rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint

[dpdk-dev] [PATCH v3 5/7] power: support callbacks for multiple Rx queues

2021-06-28 Thread Anatoly Burakov

Currently, there is a hard limitation on the PMD power management
support that only allows it to support a single queue per lcore. This is
not ideal as most DPDK use cases will poll multiple queues per core.

The PMD power management mechanism relies on ethdev Rx callbacks, so it
is very difficult to implement such support because callbacks are
effectively stateless and have no visibility into what the other ethdev
devices are doing. This places limitations on what we can do within the
framework of Rx callbacks, but the basics of this implementation are as
follows:

- Replace per-queue structures with per-lcore ones, so that any device
  polled from the same lcore can share data
- Any queue that is going to be polled from a specific lcore has to be
  added to the list of queues to poll, so that the callback is aware of
  other queues being polled by the same lcore
- Both the empty poll counter and the actual power saving mechanism are
  shared between all queues polled on a particular lcore, and are only
  activated when a special designated "power saving" queue is polled. To
  put it another way, we have no idea which queue the user will poll in
  what order, so we rely on them telling us that queue X is the last one
  in the polling loop, so any power management should happen there.
- A new API is added to mark a specific Rx queue as "power saving".
  Failing to call this API will result in no power management, however
  when having only one queue per core it is obvious which queue is the
  "power saving" one, so things will still work without this new API for
  use cases that were previously working without it.
- The limitation on UMWAIT-based polling is not removed because UMWAIT
  is incapable of monitoring more than one address.

Also, while we're at it, update and improve the docs.

Signed-off-by: Anatoly Burakov 
---

Notes:
v3:
- Move the list of supported NICs to NIC feature table

v2:
- Use a TAILQ for queues instead of a static array
- Address feedback from Konstantin
- Add additional checks for stopped queues

 doc/guides/nics/features.rst   |  10 +
 doc/guides/prog_guide/power_man.rst|  75 +++--
 doc/guides/rel_notes/release_21_08.rst |   3 +
 lib/power/rte_power_pmd_mgmt.c | 381 -
 lib/power/rte_power_pmd_mgmt.h |  34 +++
 lib/power/version.map  |   3 +
 6 files changed, 412 insertions(+), 94 deletions(-)

diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst
index 403c2b03a3..a96e12d155 100644
--- a/doc/guides/nics/features.rst
+++ b/doc/guides/nics/features.rst
@@ -912,6 +912,16 @@ Supports to get Rx/Tx packet burst mode information.
 * **[implements] eth_dev_ops**: ``rx_burst_mode_get``, ``tx_burst_mode_get``.
 * **[related] API**: ``rte_eth_rx_burst_mode_get()``, 
``rte_eth_tx_burst_mode_get()``.
 
+.. _nic_features_get_monitor_addr:
+
+PMD power management using monitor addresses
+
+
+Supports getting a monitoring condition to use together with Ethernet PMD power
+management (see :doc:`../prog_guide/power_man` for more details).
+
+* **[implements] eth_dev_ops**: ``get_monitor_addr``
+
 .. _nic_features_other:
 
 Other dev ops not represented by a Feature
diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index c70ae128ac..fac2c19516 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -198,34 +198,41 @@ Ethernet PMD Power Management API
 Abstract
 
 
-Existing power management mechanisms require developers
-to change application design or change code to make use of it.
-The PMD power management API provides a convenient alternative
-by utilizing Ethernet PMD RX callbacks,
-and triggering power saving whenever empty poll count reaches a certain number.
-
-Monitor
-   This power saving scheme will put the CPU into optimized power state
-   and use the ``rte_power_monitor()`` function
-   to monitor the Ethernet PMD RX descriptor address,
-   and wake the CPU up whenever there's new traffic.
-
-Pause
-   This power saving scheme will avoid busy polling
-   by either entering power-optimized sleep state
-   with ``rte_power_pause()`` function,
-   or, if it's not available, use ``rte_pause()``.
-
-Frequency scaling
-   This power saving scheme will use ``librte_power`` library
-   functionality to scale the core frequency up/down
-   depending on traffic volume.
-
-.. note::
-
-   Currently, this power management API is limited to mandatory mapping
-   of 1 queue to 1 core (multiple queues are supported,
-   but they must be polled from different cores).
+Existing power management mechanisms require developers to change application
+design or change code to make use of it. The PMD power management API provides 
a
+convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering
+power savin

[dpdk-dev] [PATCH v3 6/7] power: support monitoring multiple Rx queues

2021-06-28 Thread Anatoly Burakov

Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
Rx queues while entering the energy efficient power state. The multi
version will be used unconditionally if supported, and the UMWAIT one
will only be used when multi-monitor is not supported by the hardware.

Signed-off-by: Anatoly Burakov 
---
 doc/guides/prog_guide/power_man.rst |  9 ++--
 lib/power/rte_power_pmd_mgmt.c  | 76 -
 2 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index fac2c19516..3245a5ebed 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain 
number.
 The "monitor" mode is only supported in the following configurations and 
scenarios:
 
 * If ``rte_cpu_get_intrinsics_support()`` function indicates that
+  ``rte_power_monitor_multi()`` function is supported by the platform, then
+  monitoring multiple Ethernet Rx queues for traffic will be supported.
+
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
   ``rte_power_monitor()`` is supported by the platform, then monitoring will be
   limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
   monitored from a different lcore).
 
-* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
-  ``rte_power_monitor()`` function is not supported, then monitor mode will not
-  be supported.
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the
+  two monitoring functions is supported, then monitor mode will not be supported.
 
 * Not all Ethernet devices support monitoring, even if the underlying
   platform may support the necessary CPU instructions. Please refer to
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index 7762cd39b8..aab2d4f1ee 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -155,6 +155,24 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union 
queue *q)
return 0;
 }
 
+/*
+ * Fill pmc[] with one monitoring condition per queue in the lcore's queue
+ * list, in list order. The caller must provide room for one entry per queue
+ * in the list.
+ *
+ * Returns 0 on success, or the negative value returned by
+ * rte_eth_get_monitor_addr() for the first queue that fails.
+ */
+static inline int
+get_monitor_addresses(struct pmd_core_cfg *cfg,
+		struct rte_power_monitor_cond *pmc)
+{
+	const struct queue_list_entry *qle;
+	size_t i = 0;
+	int ret;
+
+	TAILQ_FOREACH(qle, &cfg->head, next) {
+		const union queue *q = &qle->queue;
+		/* advance the index so each queue gets its own pmc[] slot */
+		struct rte_power_monitor_cond *cur = &pmc[i++];
+
+		ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
 static void
 calc_tsc(void)
 {
@@ -183,6 +201,48 @@ calc_tsc(void)
}
 }
 
+/*
+ * Rx callback for multi-queue monitor mode.
+ *
+ * A non-empty poll resets the lcore's shared empty poll counter. An empty
+ * poll only counts when queue_is_power_save() accepts this queue; once the
+ * counter exceeds EMPTYPOLL_MAX, the monitoring conditions of all queues on
+ * this lcore are gathered and passed to rte_power_monitor_multi().
+ *
+ * Always returns nb_rx unchanged.
+ */
+static uint16_t
+clb_multiwait(uint16_t port_id, uint16_t qidx,
+		struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+		uint16_t max_pkts __rte_unused, void *addr __rte_unused)
+{
+	const unsigned int lcore = rte_lcore_id();
+	const union queue q = {.portid = port_id, .qid = qidx};
+	const bool empty = nb_rx == 0;
+	struct pmd_core_cfg *q_conf;
+
+	q_conf = &lcore_cfg[lcore];
+
+	/* early exit */
+	if (likely(!empty)) {
+		q_conf->empty_poll_stats = 0;
+	} else {
+		/* do we care about this particular queue? */
+		if (!queue_is_power_save(q_conf, &q))
+			return nb_rx;
+
+		/*
+		 * we can increment unconditionally here because if there were
+		 * non-empty polls in other queues assigned to this core, we
+		 * dropped the counter to zero anyway.
+		 */
+		q_conf->empty_poll_stats++;
+		if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+			/*
+			 * NOTE(review): assumes n_queues never exceeds
+			 * RTE_MAX_ETHPORTS, otherwise pmc[] is too small -
+			 * confirm against the queue-add path.
+			 */
+			struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS];
+			/* int, not uint16_t: the helper returns negative errors */
+			int ret;
+
+			/* gather all monitoring conditions */
+			ret = get_monitor_addresses(q_conf, pmc);
+
+			if (ret == 0)
+				rte_power_monitor_multi(pmc,
+					q_conf->n_queues, UINT64_MAX);
+		}
+	}
+
+	return nb_rx;
+}
+
 static uint16_t
 clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts 
__rte_unused,
uint16_t nb_rx, uint16_t max_pkts __rte_unused,
@@ -348,14 +408,19 @@ static int
 check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
 {
struct rte_power_monitor_cond dummy;
+   bool multimonitor_supported;
 
/* check if rte_power_monitor is supported */
if (!global_data.intrinsics_support.power_monitor) {
RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not 
supported\n");
ret

[dpdk-dev] [PATCH v3 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes

2021-06-28 Thread Anatoly Burakov

Currently, l3fwd-power enforces the limitation of having one queue per
lcore. This is no longer necessary, so remove the limitation, and always
mark the last queue in qconf as the power save queue.

Signed-off-by: Anatoly Burakov 
---
 examples/l3fwd-power/main.c | 39 +++--
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index f8dfed1634..3057c06936 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -2498,6 +2498,27 @@ mode_to_str(enum appmode mode)
}
 }
 
+/*
+ * Enable PMD power management for one Rx queue polled by the given lcore.
+ * When 'last' is true (the caller passes it for the final queue in the
+ * lcore's queue list), the queue is also marked as the power save queue.
+ *
+ * Calls rte_exit() with EXIT_FAILURE if either API call fails.
+ */
+static void
+pmd_pmgmt_set_up(unsigned int lcore, uint16_t portid, uint16_t qid, bool last)
+{
+	int ret;
+
+	ret = rte_power_ethdev_pmgmt_queue_enable(lcore, portid,
+			qid, pmgmt_type);
+	if (ret < 0)
+		rte_exit(EXIT_FAILURE,
+			"rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n",
+			ret, portid);
+
+	if (!last)
+		return;
+	ret = rte_power_ethdev_pmgmt_queue_set_power_save(lcore, portid, qid);
+	if (ret < 0)
+		rte_exit(EXIT_FAILURE,
+			"rte_power_ethdev_pmgmt_queue_set_power_save: err=%d, port=%d\n",
+			ret, portid);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -2723,12 +2744,6 @@ main(int argc, char **argv)
printf("\nInitializing rx queues on lcore %u ... ", lcore_id );
fflush(stdout);
 
-   /* PMD power management mode can only do 1 queue per core */
-   if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) {
-   rte_exit(EXIT_FAILURE,
-   "In PMD power management mode, only one queue 
per lcore is allowed\n");
-   }
-
/* init RX queues */
for(queue = 0; queue < qconf->n_rx_queue; ++queue) {
struct rte_eth_rxconf rxq_conf;
@@ -2767,15 +2782,9 @@ main(int argc, char **argv)
 "Fail to add ptype cb\n");
}
 
-   if (app_mode == APP_MODE_PMD_MGMT) {
-   ret = rte_power_ethdev_pmgmt_queue_enable(
-   lcore_id, portid, queueid,
-   pmgmt_type);
-   if (ret < 0)
-   rte_exit(EXIT_FAILURE,
-   
"rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n",
-   ret, portid);
-   }
+   if (app_mode == APP_MODE_PMD_MGMT)
+   pmd_pmgmt_set_up(lcore_id, portid, queueid,
+   queue == (qconf->n_rx_queue - 1));
}
}
 
-- 
2.25.1

[dpdk-dev] [PATCH v4 0/7] Enhancements for PMD power management

2021-06-28 Thread Anatoly Burakov

This patchset introduces several changes related to PMD power management:

- Changed monitoring intrinsics to use callbacks as a comparison function, based
  on previous patchset [1] but incorporating feedback [2] - this hopefully will
  make it possible to add support for .get_monitor_addr in virtio
- Add a new intrinsic to monitor multiple addresses, based on RTM instruction
  set and the TPAUSE instruction
- Add support for PMD power management on multiple queues, as well as all
  accompanying infrastructure and example apps changes

v4:
- Replaced raw number with a macro
- Fixed all the bugs found by Konstantin
- Some other minor corrections

v3:
- Moved some doc updates to NIC features list

v2:
- Changed check inversion to callbacks
- Addressed feedback from Konstantin
- Added doc updates where necessary

[1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=*
[2] 
http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274

Anatoly Burakov (7):
  power_intrinsics: use callbacks for comparison
  net/af_xdp: add power monitor support
  eal: add power monitor for multiple events
  power: remove thread safety from PMD power API's
  power: support callbacks for multiple Rx queues
  power: support monitoring multiple Rx queues
  l3fwd-power: support multiqueue in PMD pmgmt modes

 doc/guides/nics/features.rst  |  10 +
 doc/guides/prog_guide/power_man.rst   |  78 ++-
 doc/guides/rel_notes/release_21_08.rst|  11 +
 drivers/event/dlb2/dlb2.c |  17 +-
 drivers/net/af_xdp/rte_eth_af_xdp.c   |  34 +
 drivers/net/i40e/i40e_rxtx.c  |  20 +-
 drivers/net/iavf/iavf_rxtx.c  |  20 +-
 drivers/net/ice/ice_rxtx.c|  20 +-
 drivers/net/ixgbe/ixgbe_rxtx.c|  20 +-
 drivers/net/mlx5/mlx5_rx.c|  17 +-
 examples/l3fwd-power/main.c   |  39 +-
 lib/eal/arm/rte_power_intrinsics.c|  11 +
 lib/eal/include/generic/rte_cpuflags.h|   2 +
 .../include/generic/rte_power_intrinsics.h|  68 +-
 lib/eal/ppc/rte_power_intrinsics.c|  11 +
 lib/eal/version.map   |   3 +
 lib/eal/x86/rte_cpuflags.c|   2 +
 lib/eal/x86/rte_power_intrinsics.c|  90 ++-
 lib/power/meson.build |   3 +
 lib/power/rte_power_pmd_mgmt.c| 582 +-
 lib/power/rte_power_pmd_mgmt.h|  40 ++
 lib/power/version.map |   3 +
 22 files changed, 874 insertions(+), 227 deletions(-)

-- 
2.25.1

[dpdk-dev] [PATCH v4 1/7] power_intrinsics: use callbacks for comparison

2021-06-28 Thread Anatoly Burakov

Previously, the semantics of power monitor were such that we were
checking current value against the expected value, and if they matched,
then the sleep was aborted. This is somewhat inflexible, because it only
allowed us to check for a specific value in a specific way.

This commit replaces the comparison with a user callback mechanism, so
that any PMD (or other code) using `rte_power_monitor()` can define
their own comparison semantics and decision making on how to detect the
need to abort the entering of power optimized state.

Existing implementations are adjusted to follow the new semantics.

Suggested-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
Acked-by: Konstantin Ananyev 
---

Notes:
v4:
- Return error if callback is set to NULL
- Replace raw number with a macro in monitor condition opaque data

v2:
- Use callback mechanism for more flexibility
- Address feedback from Konstantin

 doc/guides/rel_notes/release_21_08.rst|  1 +
 drivers/event/dlb2/dlb2.c | 17 --
 drivers/net/i40e/i40e_rxtx.c  | 20 +++
 drivers/net/iavf/iavf_rxtx.c  | 20 +++
 drivers/net/ice/ice_rxtx.c| 20 +++
 drivers/net/ixgbe/ixgbe_rxtx.c| 20 +++
 drivers/net/mlx5/mlx5_rx.c| 17 --
 .../include/generic/rte_power_intrinsics.h| 33 +++
 lib/eal/x86/rte_power_intrinsics.c| 17 +-
 9 files changed, 121 insertions(+), 44 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index a6ecfdf3ce..c84ac280f5 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -84,6 +84,7 @@ API Changes
Also, make sure to start the actual text at the margin.
===
 
+* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
 
 ABI Changes
 ---
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index eca183753f..252bbd8d5e 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -3154,6 +3154,16 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num)
}
 }
 
+#define CLB_MASK_IDX 0
+#define CLB_VAL_IDX 1
+/*
+ * Comparison callback for the monitor condition: apply the mask stored in the
+ * opaque data to the monitored value and compare the result with the stored
+ * expected value. Returns -1 (abort) on a match, 0 otherwise.
+ */
+static int
+dlb2_monitor_callback(const uint64_t val,
+		const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
+{
+	const uint64_t masked = val & opaque[CLB_MASK_IDX];
+
+	if (masked == opaque[CLB_VAL_IDX])
+		return -1;
+	return 0;
+}
+
 static inline int
 dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
  struct dlb2_eventdev_port *ev_port,
@@ -3194,8 +3204,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
expected_value = 0;
 
pmc.addr = monitor_addr;
-   pmc.val = expected_value;
-   pmc.mask = qe_mask.raw_qe[1];
+   /* store expected value and comparison mask in opaque data */
+   pmc.opaque[CLB_VAL_IDX] = expected_value;
+   pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1];
+   /* set up callback */
+   pmc.fn = dlb2_monitor_callback;
pmc.size = sizeof(uint64_t);
 
rte_power_monitor(&pmc, timeout + start_ticks);
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 6c58decece..081682f88b 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -81,6 +81,18 @@
 #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \
(PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK)
 
+/*
+ * Comparison callback for the monitor condition: returns -1 when the DD bit
+ * is set in the monitored value and 0 otherwise. The opaque data is unused.
+ */
+static int
+i40e_monitor_callback(const uint64_t value,
+		const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ] __rte_unused)
+{
+	/*
+	 * the DD bit is expected to be set to 1 if this descriptor was
+	 * already written to.
+	 */
+	const uint64_t dd_bit =
+		rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+
+	if ((value & dd_bit) != dd_bit)
+		return 0;
+	return -1;
+}
+
 int
 i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
 {
@@ -93,12 +105,8 @@ i40e_get_monitor_addr(void *rx_queue, struct 
rte_power_monitor_cond *pmc)
/* watch for changes in status bit */
pmc->addr = &rxdp->wb.qword1.status_error_len;
 
-   /*
-* we expect the DD bit to be set to 1 if this descriptor was already
-* written to.
-*/
-   pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
-   pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+   /* comparison callback */
+   pmc->fn = i40e_monitor_callback;
 
/* registers are 64-bit */
pmc->size = sizeof(uint64_t);
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 0361af0d85..7ed196ec22 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rx

[dpdk-dev] [PATCH v4 2/7] net/af_xdp: add power monitor support

2021-06-28 Thread Anatoly Burakov

Implement support for .get_monitor_addr in AF_XDP driver.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Rewrite using the callback mechanism

 drivers/net/af_xdp/rte_eth_af_xdp.c | 34 +
 1 file changed, 34 insertions(+)

diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c 
b/drivers/net/af_xdp/rte_eth_af_xdp.c
index eb5660a3dc..7830d0c23a 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "compat.h"
 
@@ -788,6 +789,38 @@ eth_dev_configure(struct rte_eth_dev *dev)
return 0;
 }
 
+#define CLB_VAL_IDX 0
+/*
+ * Comparison callback for the monitor condition: compare the low 32 bits of
+ * the monitored value with the value stored in the opaque data. Returns 0
+ * when they are equal and -1 when they differ.
+ */
+static int
+eth_monitor_callback(const uint64_t value,
+		const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
+{
+	const uint64_t low32_mask = (uint32_t)~0;
+	const uint64_t saved = opaque[CLB_VAL_IDX];
+
+	/* an unchanged value lets the sleep proceed; any change aborts it */
+	if ((value & low32_mask) == saved)
+		return 0;
+	return -1;
+}
+
+/*
+ * Fill in a monitoring condition for an AF_XDP Rx queue: watch the Rx
+ * producer index, store the cached producer value in the opaque data and
+ * install eth_monitor_callback as the comparison function.
+ *
+ * Always returns 0.
+ */
+static int
+eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
+{
+	struct pkt_rx_queue *rxq = rx_queue;
+	unsigned int *producer = rxq->rx.producer;
+	/* use cached value */
+	const uint32_t cached = rxq->rx.cached_prod;
+
+	/* watch for changes in producer ring */
+	pmc->addr = (void *)producer;
+	/* AF_XDP producer ring index is 32-bit */
+	pmc->size = sizeof(uint32_t);
+
+	/* hand the current value to the comparison callback */
+	pmc->opaque[CLB_VAL_IDX] = cached;
+	pmc->fn = eth_monitor_callback;
+
+	return 0;
+}
+
 static int
 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
@@ -1448,6 +1481,7 @@ static const struct eth_dev_ops ops = {
.link_update = eth_link_update,
.stats_get = eth_stats_get,
.stats_reset = eth_stats_reset,
+   .get_monitor_addr = eth_get_monitor_addr
 };
 
 /** parse busy_budget argument */
-- 
2.25.1

[dpdk-dev] [PATCH v4 3/7] eal: add power monitor for multiple events

2021-06-28 Thread Anatoly Burakov

Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
what UMWAIT does, but without the limitation of having to listen for
just one event. This works because the optimized power state used by the
TPAUSE instruction will cause a wake up on RTM transaction abort, so if
we add the addresses we're interested in to the read-set, any write to
those addresses will wake us up.

Signed-off-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Fixed bugs in accessing the monitor condition
- Abort on any monitor condition not having a defined callback

v2:
- Adapt to callback mechanism

 doc/guides/rel_notes/release_21_08.rst|  2 +
 lib/eal/arm/rte_power_intrinsics.c| 11 +++
 lib/eal/include/generic/rte_cpuflags.h|  2 +
 .../include/generic/rte_power_intrinsics.h| 35 +
 lib/eal/ppc/rte_power_intrinsics.c| 11 +++
 lib/eal/version.map   |  3 +
 lib/eal/x86/rte_cpuflags.c|  2 +
 lib/eal/x86/rte_power_intrinsics.c| 73 +++
 8 files changed, 139 insertions(+)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index c84ac280f5..9d1cfac395 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -55,6 +55,8 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+* eal: added ``rte_power_monitor_multi`` to support waiting for multiple 
events.
+
 
 Removed Items
 -
diff --git a/lib/eal/arm/rte_power_intrinsics.c 
b/lib/eal/arm/rte_power_intrinsics.c
index e83f04072a..78f55b7203 100644
--- a/lib/eal/arm/rte_power_intrinsics.c
+++ b/lib/eal/arm/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp)
+{ /* stub: the arguments are only marked as used, never read */
+   RTE_SET_USED(pmc);
+   RTE_SET_USED(num);
+   RTE_SET_USED(tsc_timestamp);
+
+   return -ENOTSUP; /* always reports the operation as unsupported */
+}
diff --git a/lib/eal/include/generic/rte_cpuflags.h 
b/lib/eal/include/generic/rte_cpuflags.h
index 28a5aecde8..d35551e931 100644
--- a/lib/eal/include/generic/rte_cpuflags.h
+++ b/lib/eal/include/generic/rte_cpuflags.h
@@ -24,6 +24,8 @@ struct rte_cpu_intrinsics {
/**< indicates support for rte_power_monitor function */
uint32_t power_pause : 1;
/**< indicates support for rte_power_pause function */
+   uint32_t power_monitor_multi : 1;
+   /**< indicates support for rte_power_monitor_multi function */
 };
 
 /**
diff --git a/lib/eal/include/generic/rte_power_intrinsics.h 
b/lib/eal/include/generic/rte_power_intrinsics.h
index c9aa52a86d..04e8c2ab37 100644
--- a/lib/eal/include/generic/rte_power_intrinsics.h
+++ b/lib/eal/include/generic/rte_power_intrinsics.h
@@ -128,4 +128,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id);
 __rte_experimental
 int rte_power_pause(const uint64_t tsc_timestamp);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Monitor a set of addresses for changes. This will cause the CPU to enter an
+ * architecture-defined optimized power state until either one of the specified
+ * memory addresses is written to, a certain TSC timestamp is reached, or other
+ * reasons cause the CPU to wake up.
+ *
+ * Additionally, each monitoring condition in `pmc` carries a comparison
+ * callback. The callback is given the current value at the monitored address,
+ * and if it indicates that the sleep should not be entered, the entering of
+ * optimized power state may be aborted.
+ *
+ * @warning It is responsibility of the user to check if this function is
+ *   supported at runtime using `rte_cpu_get_intrinsics_support()` API call.
+ *   Failing to do so may result in an illegal CPU instruction error.
+ *
+ * @param pmc
+ *   An array of monitoring condition structures.
+ * @param num
+ *   Length of the `pmc` array.
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for. Note that the wait behavior is
+ *   architecture-dependent.
+ *
+ * @return
+ *   0 on success
+ *   -EINVAL on invalid parameters
+ *   -ENOTSUP if unsupported
+ */
+__rte_experimental
+int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp);
+
 #endif /* _RTE_POWER_INTRINSIC_H_ */
diff --git a/lib/eal/ppc/rte_power_intrinsics.c 
b/lib/eal/ppc/rte_power_intrinsics.c
index 7fc9586da7..f00b58ade5 100644
--- a/lib/eal/ppc/rte_power_intrinsics.c
+++ b/lib/eal/ppc/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+

[dpdk-dev] [PATCH v4 4/7] power: remove thread safety from PMD power API's

2021-06-28 Thread Anatoly Burakov

Currently, we expect that only one callback can be active at any given
moment, for a particular queue configuration, which is relatively easy
to implement in a thread-safe way. However, we're about to add support
for multiple queues per lcore, which will greatly increase the
possibility of various race conditions.

We could have used something like an RCU for this use case, but absent
a pressing need for thread safety we'll go the easy way and just
mandate that the APIs are to be called when all affected ports are
stopped, and document this limitation. This greatly simplifies the
`rte_power_monitor`-related code.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Add check for stopped queue
- Clarified doc message
- Added release notes

 doc/guides/rel_notes/release_21_08.rst |   5 +
 lib/power/meson.build  |   3 +
 lib/power/rte_power_pmd_mgmt.c | 133 ++---
 lib/power/rte_power_pmd_mgmt.h |   6 ++
 4 files changed, 67 insertions(+), 80 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index 9d1cfac395..f015c509fc 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -88,6 +88,11 @@ API Changes
 
 * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
 
+* rte_power: The experimental PMD power management API is no longer considered
+  to be thread safe; all Rx queues affected by the API will now need to be
+  stopped before making any changes to the power management scheme.
+
+
 ABI Changes
 ---
 
diff --git a/lib/power/meson.build b/lib/power/meson.build
index c1097d32f1..4f6a242364 100644
--- a/lib/power/meson.build
+++ b/lib/power/meson.build
@@ -21,4 +21,7 @@ headers = files(
 'rte_power_pmd_mgmt.h',
 'rte_power_guest_channel.h',
 )
+if cc.has_argument('-Wno-cast-qual')
+cflags += '-Wno-cast-qual'
+endif
 deps += ['timer', 'ethdev']
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index db03cbf420..9b95cf1794 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -40,8 +40,6 @@ struct pmd_queue_cfg {
/**< Callback mode for this queue */
const struct rte_eth_rxtx_callback *cur_cb;
/**< Callback instance */
-   volatile bool umwait_in_progress;
-   /**< are we currently sleeping? */
uint64_t empty_poll_stats;
/**< Number of empty polls */
 } __rte_cache_aligned;
@@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf 
**pkts __rte_unused,
struct rte_power_monitor_cond pmc;
uint16_t ret;
 
-   /*
-* we might get a cancellation request while being
-* inside the callback, in which case the wakeup
-* wouldn't work because it would've arrived too early.
-*
-* to get around this, we notify the other thread that
-* we're sleeping, so that it can spin until we're done.
-* unsolicited wakeups are perfectly safe.
-*/
-   q_conf->umwait_in_progress = true;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-   /* check if we need to cancel sleep */
-   if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
-   /* use monitoring condition to sleep */
-   ret = rte_eth_get_monitor_addr(port_id, qidx,
-   &pmc);
-   if (ret == 0)
-   rte_power_monitor(&pmc, UINT64_MAX);
-   }
-   q_conf->umwait_in_progress = false;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+   /* use monitoring condition to sleep */
+   ret = rte_eth_get_monitor_addr(port_id, qidx,
+   &pmc);
+   if (ret == 0)
+   rte_power_monitor(&pmc, UINT64_MAX);
}
} else
q_conf->empty_poll_stats = 0;
@@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx,
return nb_rx;
 }
 
+/*
+ * Look up the Rx queue info for the given port/queue pair and report whether
+ * the queue is in the stopped state.
+ *
+ * Returns 1 if the queue is stopped, 0 if it is not, and -1 if the queue info
+ * could not be retrieved.
+ */
+static int
+queue_stopped(const uint16_t port_id, const uint16_t queue_id)
+{
+	struct rte_eth_rxq_info rxq_info;
+	const int rc = rte_eth_rx_queue_info_get(port_id, queue_id, &rxq_info);
+
+	if (rc < 0)
+		return -1;
+	if (rxq_info.queue_state == RTE_ETH_QUEUE_STATE_STOPPED)
+		return 1;
+	return 0;
+}
+
 int
 rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint

[dpdk-dev] [PATCH v4 5/7] power: support callbacks for multiple Rx queues

2021-06-28 Thread Anatoly Burakov

Currently, there is a hard limitation on the PMD power management
support that only allows it to support a single queue per lcore. This is
not ideal as most DPDK use cases will poll multiple queues per core.

The PMD power management mechanism relies on ethdev Rx callbacks, so it
is very difficult to implement such support because callbacks are
effectively stateless and have no visibility into what the other ethdev
devices are doing. This places limitations on what we can do within the
framework of Rx callbacks, but the basics of this implementation are as
follows:

- Replace per-queue structures with per-lcore ones, so that any device
  polled from the same lcore can share data
- Any queue that is going to be polled from a specific lcore has to be
  added to the list of queues to poll, so that the callback is aware of
  other queues being polled by the same lcore
- Both the empty poll counter and the actual power saving mechanism are
  shared between all queues polled on a particular lcore, and are only
  activated when a special designated "power saving" queue is polled. To
  put it another way, we have no idea which queue the user will poll in
  what order, so we rely on them telling us that queue X is the last one
  in the polling loop, so any power management should happen there.
- A new API is added to mark a specific Rx queue as "power saving".
  Failing to call this API will result in no power management, however
  when having only one queue per core it is obvious which queue is the
  "power saving" one, so things will still work without this new API for
  use cases that were previously working without it.
- The limitation on UMWAIT-based polling is not removed because UMWAIT
  is incapable of monitoring more than one address.

Also, while we're at it, update and improve the docs.

Signed-off-by: Anatoly Burakov 
---

Notes:
v3:
- Move the list of supported NICs to NIC feature table

v2:
- Use a TAILQ for queues instead of a static array
- Address feedback from Konstantin
- Add additional checks for stopped queues

 doc/guides/nics/features.rst   |  10 +
 doc/guides/prog_guide/power_man.rst|  75 +++--
 doc/guides/rel_notes/release_21_08.rst |   3 +
 lib/power/rte_power_pmd_mgmt.c | 381 -
 lib/power/rte_power_pmd_mgmt.h |  34 +++
 lib/power/version.map  |   3 +
 6 files changed, 412 insertions(+), 94 deletions(-)

diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst
index 403c2b03a3..a96e12d155 100644
--- a/doc/guides/nics/features.rst
+++ b/doc/guides/nics/features.rst
@@ -912,6 +912,16 @@ Supports to get Rx/Tx packet burst mode information.
 * **[implements] eth_dev_ops**: ``rx_burst_mode_get``, ``tx_burst_mode_get``.
 * **[related] API**: ``rte_eth_rx_burst_mode_get()``, 
``rte_eth_tx_burst_mode_get()``.
 
+.. _nic_features_get_monitor_addr:
+
+PMD power management using monitor addresses
+
+
+Supports getting a monitoring condition to use together with Ethernet PMD power
+management (see :doc:`../prog_guide/power_man` for more details).
+
+* **[implements] eth_dev_ops**: ``get_monitor_addr``
+
 .. _nic_features_other:
 
 Other dev ops not represented by a Feature
diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index c70ae128ac..fac2c19516 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -198,34 +198,41 @@ Ethernet PMD Power Management API
 Abstract
 
 
-Existing power management mechanisms require developers
-to change application design or change code to make use of it.
-The PMD power management API provides a convenient alternative
-by utilizing Ethernet PMD RX callbacks,
-and triggering power saving whenever empty poll count reaches a certain number.
-
-Monitor
-   This power saving scheme will put the CPU into optimized power state
-   and use the ``rte_power_monitor()`` function
-   to monitor the Ethernet PMD RX descriptor address,
-   and wake the CPU up whenever there's new traffic.
-
-Pause
-   This power saving scheme will avoid busy polling
-   by either entering power-optimized sleep state
-   with ``rte_power_pause()`` function,
-   or, if it's not available, use ``rte_pause()``.
-
-Frequency scaling
-   This power saving scheme will use ``librte_power`` library
-   functionality to scale the core frequency up/down
-   depending on traffic volume.
-
-.. note::
-
-   Currently, this power management API is limited to mandatory mapping
-   of 1 queue to 1 core (multiple queues are supported,
-   but they must be polled from different cores).
+Existing power management mechanisms require developers to change application
+design or change code to make use of it. The PMD power management API provides 
a
+convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering
+power savin

[dpdk-dev] [PATCH v4 6/7] power: support monitoring multiple Rx queues

2021-06-28 Thread Anatoly Burakov

Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
Rx queues while entering the energy efficient power state. The multi
version will be used unconditionally if supported, and the UMWAIT one
will only be used when multi-monitor is not supported by the hardware.

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Fix possible out of bounds access
- Added missing index increment

 doc/guides/prog_guide/power_man.rst |  9 ++--
 lib/power/rte_power_pmd_mgmt.c  | 84 -
 2 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index fac2c19516..3245a5ebed 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain 
number.
 The "monitor" mode is only supported in the following configurations and 
scenarios:
 
 * If ``rte_cpu_get_intrinsics_support()`` function indicates that
+  ``rte_power_monitor_multi()`` function is supported by the platform, then
+  monitoring multiple Ethernet Rx queues for traffic will be supported.
+
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
   ``rte_power_monitor()`` is supported by the platform, then monitoring will be
   limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
   monitored from a different lcore).
 
-* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
-  ``rte_power_monitor()`` function is not supported, then monitor mode will not
-  be supported.
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of 
the
+  two monitoring functions are supported, then monitor mode will not be 
supported.
 
 * Not all Ethernet devices support monitoring, even if the underlying
   platform may support the necessary CPU instructions. Please refer to
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index 7762cd39b8..97c9f1ea36 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -155,6 +155,32 @@ queue_list_remove(struct pmd_core_cfg *cfg, const union 
queue *q)
return 0;
 }
 
+static inline int
+get_monitor_addresses(struct pmd_core_cfg *cfg,
+   struct rte_power_monitor_cond *pmc, size_t len)
+{
+   const struct queue_list_entry *qle;
+   size_t i = 0;
+   int ret;
+
+   TAILQ_FOREACH(qle, &cfg->head, next) {
+   const union queue *q = &qle->queue;
+   struct rte_power_monitor_cond *cur;
+
+   /* attempted out of bounds access */
+   if (i >= len) {
+   RTE_LOG(ERR, POWER, "Too many queues being 
monitored\n");
+   return -1;
+   }
+
+   cur = &pmc[i++];
+   ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
+   if (ret < 0)
+   return ret;
+   }
+   return 0;
+}
+
 static void
 calc_tsc(void)
 {
@@ -183,6 +209,48 @@ calc_tsc(void)
}
 }
 
+static uint16_t
+clb_multiwait(uint16_t port_id, uint16_t qidx,
+   struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+   uint16_t max_pkts __rte_unused, void *addr __rte_unused)
+{
+   const unsigned int lcore = rte_lcore_id();
+   const union queue q = {.portid = port_id, .qid = qidx};
+   const bool empty = nb_rx == 0;
+   struct pmd_core_cfg *q_conf;
+
+   q_conf = &lcore_cfg[lcore];
+
+   /* early exit */
+   if (likely(!empty)) {
+   q_conf->empty_poll_stats = 0;
+   } else {
+   /* do we care about this particular queue? */
+   if (!queue_is_power_save(q_conf, &q))
+   return nb_rx;
+
+   /*
+* we can increment unconditionally here because if there were
+* non-empty polls in other queues assigned to this core, we
+* dropped the counter to zero anyway.
+*/
+   q_conf->empty_poll_stats++;
+   if (unlikely(q_conf->empty_poll_stats > EMPTYPOLL_MAX)) {
+   struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS];
+   uint16_t ret;
+
+   /* gather all monitoring conditions */
+   ret = get_monitor_addresses(q_conf, pmc, RTE_DIM(pmc));
+
+   if (ret == 0)
+   rte_power_monitor_multi(pmc,
+   q_conf->n_queues, UINT64_MAX);
+   }
+   }
+
+   return nb_rx;
+}
+
 static uint16_t
 clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts 
__rte_unused,
uint16_t nb_rx, uint16_t max_pkts __rte_unused,
@@ -348,14 +416,19 @@ static int
 check_monitor(struct pmd_core_cfg

[dpdk-dev] [PATCH v4 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes

2021-06-28 Thread Anatoly Burakov

Currently, l3fwd-power enforces the limitation of having one queue per
lcore. This is no longer necessary, so remove the limitation, and always
mark the last queue in qconf as the power save queue.

Signed-off-by: Anatoly Burakov 
---
 examples/l3fwd-power/main.c | 39 +++--
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index f8dfed1634..3057c06936 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -2498,6 +2498,27 @@ mode_to_str(enum appmode mode)
}
 }
 
+static void
+pmd_pmgmt_set_up(unsigned int lcore, uint16_t portid, uint16_t qid, bool last)
+{
+   int ret;
+
+   ret = rte_power_ethdev_pmgmt_queue_enable(lcore, portid,
+   qid, pmgmt_type);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_power_ethdev_pmgmt_queue_enable: err=%d, 
port=%d\n",
+   ret, portid);
+
+   if (!last)
+   return;
+   ret = rte_power_ethdev_pmgmt_queue_set_power_save(lcore, portid, qid);
+   if (ret < 0)
+   rte_exit(EXIT_FAILURE,
+   "rte_power_ethdev_pmgmt_queue_set_power_save: err=%d, 
port=%d\n",
+   ret, portid);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -2723,12 +2744,6 @@ main(int argc, char **argv)
printf("\nInitializing rx queues on lcore %u ... ", lcore_id );
fflush(stdout);
 
-   /* PMD power management mode can only do 1 queue per core */
-   if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) {
-   rte_exit(EXIT_FAILURE,
-   "In PMD power management mode, only one queue 
per lcore is allowed\n");
-   }
-
/* init RX queues */
for(queue = 0; queue < qconf->n_rx_queue; ++queue) {
struct rte_eth_rxconf rxq_conf;
@@ -2767,15 +2782,9 @@ main(int argc, char **argv)
 "Fail to add ptype cb\n");
}
 
-   if (app_mode == APP_MODE_PMD_MGMT) {
-   ret = rte_power_ethdev_pmgmt_queue_enable(
-   lcore_id, portid, queueid,
-   pmgmt_type);
-   if (ret < 0)
-   rte_exit(EXIT_FAILURE,
-   
"rte_power_ethdev_pmgmt_queue_enable: err=%d, port=%d\n",
-   ret, portid);
-   }
+   if (app_mode == APP_MODE_PMD_MGMT)
+   pmd_pmgmt_set_up(lcore_id, portid, queueid,
+   queue == (qconf->n_rx_queue - 1));
}
}
 
-- 
2.25.1

[dpdk-dev] [PATCH v5 2/7] net/af_xdp: add power monitor support

2021-06-29 Thread Anatoly Burakov

Implement support for .get_monitor_addr in AF_XDP driver.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Rewrite using the callback mechanism

 drivers/net/af_xdp/rte_eth_af_xdp.c | 34 +
 1 file changed, 34 insertions(+)

diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c 
b/drivers/net/af_xdp/rte_eth_af_xdp.c
index eb5660a3dc..7830d0c23a 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "compat.h"
 
@@ -788,6 +789,38 @@ eth_dev_configure(struct rte_eth_dev *dev)
return 0;
 }
 
+#define CLB_VAL_IDX 0
+static int
+eth_monitor_callback(const uint64_t value,
+   const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
+{
+   const uint64_t v = opaque[CLB_VAL_IDX];
+   const uint64_t m = (uint32_t)~0;
+
+   /* if the value has changed, abort entering power optimized state */
+   return (value & m) == v ? 0 : -1;
+}
+
+static int
+eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
+{
+   struct pkt_rx_queue *rxq = rx_queue;
+   unsigned int *prod = rxq->rx.producer;
+   const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */
+
+   /* watch for changes in producer ring */
+   pmc->addr = (void*)prod;
+
+   /* store current value */
+   pmc->opaque[CLB_VAL_IDX] = cur_val;
+   pmc->fn = eth_monitor_callback;
+
+   /* AF_XDP producer ring index is 32-bit */
+   pmc->size = sizeof(uint32_t);
+
+   return 0;
+}
+
 static int
 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
@@ -1448,6 +1481,7 @@ static const struct eth_dev_ops ops = {
.link_update = eth_link_update,
.stats_get = eth_stats_get,
.stats_reset = eth_stats_reset,
+   .get_monitor_addr = eth_get_monitor_addr
 };
 
 /** parse busy_budget argument */
-- 
2.25.1

[dpdk-dev] [PATCH v5 0/7] Enhancements for PMD power management

2021-06-29 Thread Anatoly Burakov

This patchset introduces several changes related to PMD power management:

- Changed monitoring intrinsics to use callbacks as a comparison function, based
  on previous patchset [1] but incorporating feedback [2] - this hopefully will
  make it possible to add support for .get_monitor_addr in virtio
- Add a new intrinsic to monitor multiple addresses, based on RTM instruction
  set and the TPAUSE instruction
- Add support for PMD power management on multiple queues, as well as all
  accompanying infrastructure and example apps changes

v5:
- Removed "power save queue" API and replaced with mechanism suggested by
  Konstantin
- Addressed other feedback

v4:
- Replaced raw number with a macro
- Fixed all the bugs found by Konstantin
- Some other minor corrections

v3:
- Moved some doc updates to NIC features list

v2:
- Changed check inversion to callbacks
- Addressed feedback from Konstantin
- Added doc updates where necessary

[1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=*
[2] 
http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274

Anatoly Burakov (7):
  power_intrinsics: use callbacks for comparison
  net/af_xdp: add power monitor support
  eal: add power monitor for multiple events
  power: remove thread safety from PMD power API's
  power: support callbacks for multiple Rx queues
  power: support monitoring multiple Rx queues
  l3fwd-power: support multiqueue in PMD pmgmt modes

 doc/guides/nics/features.rst  |  10 +
 doc/guides/prog_guide/power_man.rst   |  68 +-
 doc/guides/rel_notes/release_21_08.rst|  11 +
 drivers/event/dlb2/dlb2.c |  17 +-
 drivers/net/af_xdp/rte_eth_af_xdp.c   |  34 +
 drivers/net/i40e/i40e_rxtx.c  |  20 +-
 drivers/net/iavf/iavf_rxtx.c  |  20 +-
 drivers/net/ice/ice_rxtx.c|  20 +-
 drivers/net/ixgbe/ixgbe_rxtx.c|  20 +-
 drivers/net/mlx5/mlx5_rx.c|  17 +-
 examples/l3fwd-power/main.c   |   6 -
 lib/eal/arm/rte_power_intrinsics.c|  11 +
 lib/eal/include/generic/rte_cpuflags.h|   2 +
 .../include/generic/rte_power_intrinsics.h|  68 +-
 lib/eal/ppc/rte_power_intrinsics.c|  11 +
 lib/eal/version.map   |   3 +
 lib/eal/x86/rte_cpuflags.c|   2 +
 lib/eal/x86/rte_power_intrinsics.c|  90 ++-
 lib/power/meson.build |   3 +
 lib/power/rte_power_pmd_mgmt.c| 633 +-
 lib/power/rte_power_pmd_mgmt.h|   6 +
 21 files changed, 810 insertions(+), 262 deletions(-)

-- 
2.25.1

[dpdk-dev] [PATCH v5 3/7] eal: add power monitor for multiple events

2021-06-29 Thread Anatoly Burakov

Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
what UMWAIT does, but without the limitation of having to listen for
just one event. This works because the optimized power state used by the
TPAUSE instruction will cause a wake up on RTM transaction abort, so if
we add the addresses we're interested in to the read-set, any write to
those addresses will wake us up.

Signed-off-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Fixed bugs in accessing the monitor condition
- Abort on any monitor condition not having a defined callback

v2:
- Adapt to callback mechanism

 doc/guides/rel_notes/release_21_08.rst|  2 +
 lib/eal/arm/rte_power_intrinsics.c| 11 +++
 lib/eal/include/generic/rte_cpuflags.h|  2 +
 .../include/generic/rte_power_intrinsics.h| 35 +
 lib/eal/ppc/rte_power_intrinsics.c| 11 +++
 lib/eal/version.map   |  3 +
 lib/eal/x86/rte_cpuflags.c|  2 +
 lib/eal/x86/rte_power_intrinsics.c| 73 +++
 8 files changed, 139 insertions(+)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index c84ac280f5..9d1cfac395 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -55,6 +55,8 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+* eal: added ``rte_power_monitor_multi`` to support waiting for multiple 
events.
+
 
 Removed Items
 -
diff --git a/lib/eal/arm/rte_power_intrinsics.c 
b/lib/eal/arm/rte_power_intrinsics.c
index e83f04072a..78f55b7203 100644
--- a/lib/eal/arm/rte_power_intrinsics.c
+++ b/lib/eal/arm/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp)
+{
+   RTE_SET_USED(pmc);
+   RTE_SET_USED(num);
+   RTE_SET_USED(tsc_timestamp);
+
+   return -ENOTSUP;
+}
diff --git a/lib/eal/include/generic/rte_cpuflags.h 
b/lib/eal/include/generic/rte_cpuflags.h
index 28a5aecde8..d35551e931 100644
--- a/lib/eal/include/generic/rte_cpuflags.h
+++ b/lib/eal/include/generic/rte_cpuflags.h
@@ -24,6 +24,8 @@ struct rte_cpu_intrinsics {
/**< indicates support for rte_power_monitor function */
uint32_t power_pause : 1;
/**< indicates support for rte_power_pause function */
+   uint32_t power_monitor_multi : 1;
+   /**< indicates support for rte_power_monitor_multi function */
 };
 
 /**
diff --git a/lib/eal/include/generic/rte_power_intrinsics.h 
b/lib/eal/include/generic/rte_power_intrinsics.h
index c9aa52a86d..04e8c2ab37 100644
--- a/lib/eal/include/generic/rte_power_intrinsics.h
+++ b/lib/eal/include/generic/rte_power_intrinsics.h
@@ -128,4 +128,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id);
 __rte_experimental
 int rte_power_pause(const uint64_t tsc_timestamp);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Monitor a set of addresses for changes. This will cause the CPU to enter an
+ * architecture-defined optimized power state until either one of the specified
+ * memory addresses is written to, a certain TSC timestamp is reached, or other
+ * reasons cause the CPU to wake up.
+ *
+ * Additionally, `expected` 64-bit values and 64-bit masks are provided. If
+ * mask is non-zero, the current value pointed to by the `p` pointer will be
+ * checked against the expected value, and if they do not match, the entering 
of
+ * optimized power state may be aborted.
+ *
+ * @warning It is responsibility of the user to check if this function is
+ *   supported at runtime using `rte_cpu_get_intrinsics_support()` API call.
+ *   Failing to do so may result in an illegal CPU instruction error.
+ *
+ * @param pmc
+ *   An array of monitoring condition structures.
+ * @param num
+ *   Length of the `pmc` array.
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for. Note that the wait behavior is
+ *   architecture-dependent.
+ *
+ * @return
+ *   0 on success
+ *   -EINVAL on invalid parameters
+ *   -ENOTSUP if unsupported
+ */
+__rte_experimental
+int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp);
+
 #endif /* _RTE_POWER_INTRINSIC_H_ */
diff --git a/lib/eal/ppc/rte_power_intrinsics.c 
b/lib/eal/ppc/rte_power_intrinsics.c
index 7fc9586da7..f00b58ade5 100644
--- a/lib/eal/ppc/rte_power_intrinsics.c
+++ b/lib/eal/ppc/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+

[dpdk-dev] [PATCH v5 1/7] power_intrinsics: use callbacks for comparison

2021-06-29 Thread Anatoly Burakov

Previously, the semantics of power monitor were such that we were
checking current value against the expected value, and if they matched,
then the sleep was aborted. This was somewhat inflexible, because it only
allowed us to check for a specific value in a specific way.

This commit replaces the comparison with a user callback mechanism, so
that any PMD (or other code) using `rte_power_monitor()` can define
their own comparison semantics and decision making on how to detect the
need to abort the entering of power optimized state.

Existing implementations are adjusted to follow the new semantics.

Suggested-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
Acked-by: Konstantin Ananyev 
---

Notes:
v4:
- Return error if callback is set to NULL
- Replace raw number with a macro in monitor condition opaque data

v2:
- Use callback mechanism for more flexibility
- Address feedback from Konstantin

 doc/guides/rel_notes/release_21_08.rst|  1 +
 drivers/event/dlb2/dlb2.c | 17 --
 drivers/net/i40e/i40e_rxtx.c  | 20 +++
 drivers/net/iavf/iavf_rxtx.c  | 20 +++
 drivers/net/ice/ice_rxtx.c| 20 +++
 drivers/net/ixgbe/ixgbe_rxtx.c| 20 +++
 drivers/net/mlx5/mlx5_rx.c| 17 --
 .../include/generic/rte_power_intrinsics.h| 33 +++
 lib/eal/x86/rte_power_intrinsics.c| 17 +-
 9 files changed, 121 insertions(+), 44 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index a6ecfdf3ce..c84ac280f5 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -84,6 +84,7 @@ API Changes
Also, make sure to start the actual text at the margin.
===
 
+* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
 
 ABI Changes
 ---
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index eca183753f..252bbd8d5e 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -3154,6 +3154,16 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num)
}
 }
 
+#define CLB_MASK_IDX 0
+#define CLB_VAL_IDX 1
+static int
+dlb2_monitor_callback(const uint64_t val,
+   const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
+{
+   /* abort if the value matches */
+   return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 : 0;
+}
+
 static inline int
 dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
  struct dlb2_eventdev_port *ev_port,
@@ -3194,8 +3204,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
expected_value = 0;
 
pmc.addr = monitor_addr;
-   pmc.val = expected_value;
-   pmc.mask = qe_mask.raw_qe[1];
+   /* store expected value and comparison mask in opaque data */
+   pmc.opaque[CLB_VAL_IDX] = expected_value;
+   pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1];
+   /* set up callback */
+   pmc.fn = dlb2_monitor_callback;
pmc.size = sizeof(uint64_t);
 
rte_power_monitor(&pmc, timeout + start_ticks);
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 6c58decece..081682f88b 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -81,6 +81,18 @@
 #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \
(PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK)
 
+static int
+i40e_monitor_callback(const uint64_t value,
+   const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ] __rte_unused)
+{
+   const uint64_t m = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+   /*
+* we expect the DD bit to be set to 1 if this descriptor was already
+* written to.
+*/
+   return (value & m) == m ? -1 : 0;
+}
+
 int
 i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
 {
@@ -93,12 +105,8 @@ i40e_get_monitor_addr(void *rx_queue, struct 
rte_power_monitor_cond *pmc)
/* watch for changes in status bit */
pmc->addr = &rxdp->wb.qword1.status_error_len;
 
-   /*
-* we expect the DD bit to be set to 1 if this descriptor was already
-* written to.
-*/
-   pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
-   pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+   /* comparison callback */
+   pmc->fn = i40e_monitor_callback;
 
/* registers are 64-bit */
pmc->size = sizeof(uint64_t);
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 0361af0d85..7ed196ec22 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rx

[dpdk-dev] [PATCH v5 4/7] power: remove thread safety from PMD power API's

2021-06-29 Thread Anatoly Burakov

Currently, we expect that only one callback can be active at any given
moment, for a particular queue configuration, which is relatively easy
to implement in a thread-safe way. However, we're about to add support
for multiple queues per lcore, which will greatly increase the
possibility of various race conditions.

We could have used something like RCU for this use case, but in the
absence of a pressing need for thread safety we'll go the easy way and
just mandate that the APIs are to be called when all affected ports are
stopped, and document this limitation. This greatly simplifies the
`rte_power_monitor`-related code.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Add check for stopped queue
- Clarified doc message
- Added release notes

 doc/guides/rel_notes/release_21_08.rst |   5 +
 lib/power/meson.build  |   3 +
 lib/power/rte_power_pmd_mgmt.c | 133 ++---
 lib/power/rte_power_pmd_mgmt.h |   6 ++
 4 files changed, 67 insertions(+), 80 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index 9d1cfac395..f015c509fc 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -88,6 +88,11 @@ API Changes
 
 * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
 
+* rte_power: The experimental PMD power management API is no longer considered
+  to be thread safe; all Rx queues affected by the API will now need to be
+  stopped before making any changes to the power management scheme.
+
+
 ABI Changes
 ---
 
diff --git a/lib/power/meson.build b/lib/power/meson.build
index c1097d32f1..4f6a242364 100644
--- a/lib/power/meson.build
+++ b/lib/power/meson.build
@@ -21,4 +21,7 @@ headers = files(
 'rte_power_pmd_mgmt.h',
 'rte_power_guest_channel.h',
 )
+if cc.has_argument('-Wno-cast-qual')
+cflags += '-Wno-cast-qual'
+endif
 deps += ['timer', 'ethdev']
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index db03cbf420..9b95cf1794 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -40,8 +40,6 @@ struct pmd_queue_cfg {
/**< Callback mode for this queue */
const struct rte_eth_rxtx_callback *cur_cb;
/**< Callback instance */
-   volatile bool umwait_in_progress;
-   /**< are we currently sleeping? */
uint64_t empty_poll_stats;
/**< Number of empty polls */
 } __rte_cache_aligned;
@@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf 
**pkts __rte_unused,
struct rte_power_monitor_cond pmc;
uint16_t ret;
 
-   /*
-* we might get a cancellation request while being
-* inside the callback, in which case the wakeup
-* wouldn't work because it would've arrived too early.
-*
-* to get around this, we notify the other thread that
-* we're sleeping, so that it can spin until we're done.
-* unsolicited wakeups are perfectly safe.
-*/
-   q_conf->umwait_in_progress = true;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-   /* check if we need to cancel sleep */
-   if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
-   /* use monitoring condition to sleep */
-   ret = rte_eth_get_monitor_addr(port_id, qidx,
-   &pmc);
-   if (ret == 0)
-   rte_power_monitor(&pmc, UINT64_MAX);
-   }
-   q_conf->umwait_in_progress = false;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+   /* use monitoring condition to sleep */
+   ret = rte_eth_get_monitor_addr(port_id, qidx,
+   &pmc);
+   if (ret == 0)
+   rte_power_monitor(&pmc, UINT64_MAX);
}
} else
q_conf->empty_poll_stats = 0;
@@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx,
return nb_rx;
 }
 
+static int
+queue_stopped(const uint16_t port_id, const uint16_t queue_id)
+{
+   struct rte_eth_rxq_info qinfo;
+
+   if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) < 0)
+   return -1;
+
+   return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED;
+}
+
 int
 rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint

[dpdk-dev] [PATCH v5 6/7] power: support monitoring multiple Rx queues

2021-06-29 Thread Anatoly Burakov

Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
Rx queues while entering the energy efficient power state. The multi
version will be used unconditionally if supported, and the UMWAIT one
will only be used when multi-monitor is not supported by the hardware.

Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Fix possible out of bounds access
- Added missing index increment

 doc/guides/prog_guide/power_man.rst |  9 ++--
 lib/power/rte_power_pmd_mgmt.c  | 81 -
 2 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index ec04a72108..94353ca012 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain 
number.
 The "monitor" mode is only supported in the following configurations and 
scenarios:
 
 * If ``rte_cpu_get_intrinsics_support()`` function indicates that
+  ``rte_power_monitor_multi()`` function is supported by the platform, then
+  monitoring multiple Ethernet Rx queues for traffic will be supported.
+
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
   ``rte_power_monitor()`` is supported by the platform, then monitoring will be
   limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
   monitored from a different lcore).
 
-* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
-  ``rte_power_monitor()`` function is not supported, then monitor mode will not
-  be supported.
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of 
the
+  two monitoring functions are supported, then monitor mode will not be 
supported.
 
 * Not all Ethernet drivers support monitoring, even if the underlying
   platform may support the necessary CPU instructions. Please refer to
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index fccfd236c2..2056996b9c 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -124,6 +124,32 @@ queue_list_take(struct pmd_core_cfg *cfg, const union 
queue *q)
return found;
 }
 
+static inline int
+get_monitor_addresses(struct pmd_core_cfg *cfg,
+   struct rte_power_monitor_cond *pmc, size_t len)
+{
+   const struct queue_list_entry *qle;
+   size_t i = 0;
+   int ret;
+
+   TAILQ_FOREACH(qle, &cfg->head, next) {
+   const union queue *q = &qle->queue;
+   struct rte_power_monitor_cond *cur;
+
+   /* attempted out of bounds access */
+   if (i >= len) {
+   RTE_LOG(ERR, POWER, "Too many queues being 
monitored\n");
+   return -1;
+   }
+
+   cur = &pmc[i++];
+   ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
+   if (ret < 0)
+   return ret;
+   }
+   return 0;
+}
+
 static void
 calc_tsc(void)
 {
@@ -190,6 +216,45 @@ lcore_can_sleep(struct pmd_core_cfg *cfg)
return true;
 }
 
+static uint16_t
+clb_multiwait(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
+   struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+   uint16_t max_pkts __rte_unused, void *arg)
+{
+   const unsigned int lcore = rte_lcore_id();
+   struct queue_list_entry *queue_conf = arg;
+   struct pmd_core_cfg *lcore_conf;
+   const bool empty = nb_rx == 0;
+
+   lcore_conf = &lcore_cfgs[lcore];
+
+   /* early exit */
+   if (likely(!empty))
+   /* early exit */
+   queue_reset(lcore_conf, queue_conf);
+   else {
+   struct rte_power_monitor_cond pmc[RTE_MAX_ETHPORTS];
+   int ret;
+
+   /* can this queue sleep? */
+   if (!queue_can_sleep(lcore_conf, queue_conf))
+   return nb_rx;
+
+   /* can this lcore sleep? */
+   if (!lcore_can_sleep(lcore_conf))
+   return nb_rx;
+
+   /* gather all monitoring conditions */
+   ret = get_monitor_addresses(lcore_conf, pmc, RTE_DIM(pmc));
+   if (ret < 0)
+   return nb_rx;
+
+   rte_power_monitor_multi(pmc, lcore_conf->n_queues, UINT64_MAX);
+   }
+
+   return nb_rx;
+}
+
 static uint16_t
 clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts 
__rte_unused,
uint16_t nb_rx, uint16_t max_pkts __rte_unused, void *arg)
@@ -341,14 +406,19 @@ static int
 check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
 {
struct rte_power_monitor_cond dummy;
+   bool multimonitor_supported;
 
/* check if rte_power_monitor is supported */
if (!global_data.intrinsics_support.power_monitor) {

[dpdk-dev] [PATCH v5 5/7] power: support callbacks for multiple Rx queues

2021-06-29 Thread Anatoly Burakov

Currently, there is a hard limitation on the PMD power management
support that only allows it to support a single queue per lcore. This is
not ideal as most DPDK use cases will poll multiple queues per core.

The PMD power management mechanism relies on ethdev Rx callbacks, so it
is very difficult to implement such support because callbacks are
effectively stateless and have no visibility into what the other ethdev
devices are doing. This places limitations on what we can do within the
framework of Rx callbacks, but the basics of this implementation are as
follows:

- Replace per-queue structures with per-lcore ones, so that any device
  polled from the same lcore can share data
- Any queue that is going to be polled from a specific lcore has to be
  added to the list of queues to poll, so that the callback is aware of
  other queues being polled by the same lcore
- Both the empty poll counter and the actual power saving mechanism are
  shared between all queues polled on a particular lcore, and are only
  activated when all queues in the list were polled and were determined
  to have no traffic.
- The limitation on UMWAIT-based polling is not removed because UMWAIT
  is incapable of monitoring more than one address.

Also, while we're at it, update and improve the docs.

Signed-off-by: Anatoly Burakov 
---

Notes:
v5:
- Remove the "power save queue" API and replace it with the mechanism
  suggested by Konstantin

v3:
- Move the list of supported NICs to NIC feature table

v2:
- Use a TAILQ for queues instead of a static array
- Address feedback from Konstantin
- Add additional checks for stopped queues

 doc/guides/nics/features.rst   |  10 +
 doc/guides/prog_guide/power_man.rst|  65 ++--
 doc/guides/rel_notes/release_21_08.rst |   3 +
 lib/power/rte_power_pmd_mgmt.c | 431 ++---
 4 files changed, 373 insertions(+), 136 deletions(-)

diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst
index 403c2b03a3..a96e12d155 100644
--- a/doc/guides/nics/features.rst
+++ b/doc/guides/nics/features.rst
@@ -912,6 +912,16 @@ Supports to get Rx/Tx packet burst mode information.
 * **[implements] eth_dev_ops**: ``rx_burst_mode_get``, ``tx_burst_mode_get``.
 * **[related] API**: ``rte_eth_rx_burst_mode_get()``, 
``rte_eth_tx_burst_mode_get()``.
 
+.. _nic_features_get_monitor_addr:
+
+PMD power management using monitor addresses
+--------------------------------------------
+
+Supports getting a monitoring condition to use together with Ethernet PMD power
+management (see :doc:`../prog_guide/power_man` for more details).
+
+* **[implements] eth_dev_ops**: ``get_monitor_addr``
+
 .. _nic_features_other:
 
 Other dev ops not represented by a Feature
diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index c70ae128ac..ec04a72108 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -198,34 +198,41 @@ Ethernet PMD Power Management API
 Abstract
 
 
-Existing power management mechanisms require developers
-to change application design or change code to make use of it.
-The PMD power management API provides a convenient alternative
-by utilizing Ethernet PMD RX callbacks,
-and triggering power saving whenever empty poll count reaches a certain number.
-
-Monitor
-   This power saving scheme will put the CPU into optimized power state
-   and use the ``rte_power_monitor()`` function
-   to monitor the Ethernet PMD RX descriptor address,
-   and wake the CPU up whenever there's new traffic.
-
-Pause
-   This power saving scheme will avoid busy polling
-   by either entering power-optimized sleep state
-   with ``rte_power_pause()`` function,
-   or, if it's not available, use ``rte_pause()``.
-
-Frequency scaling
-   This power saving scheme will use ``librte_power`` library
-   functionality to scale the core frequency up/down
-   depending on traffic volume.
-
-.. note::
-
-   Currently, this power management API is limited to mandatory mapping
-   of 1 queue to 1 core (multiple queues are supported,
-   but they must be polled from different cores).
+Existing power management mechanisms require developers to change application
+design or change code to make use of it. The PMD power management API provides a
+convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering
+power saving whenever empty poll count reaches a certain number.
+
+* Monitor
+   This power saving scheme will put the CPU into optimized power state and
+   monitor the Ethernet PMD RX descriptor address, waking the CPU up whenever
+   there's new traffic. Support for this scheme may not be available on all
+   platforms, and further limitations may apply (see below).
+
+* Pause
+   This power saving scheme will avoid busy polling by either entering
+   power-optimized sleep state with ``rte_power_pause()`` function, or, if

[dpdk-dev] [PATCH v5 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes

2021-06-29 Thread Anatoly Burakov

Currently, l3fwd-power enforces the limitation of having one queue per
lcore. This is no longer necessary, so remove the limitation.

Signed-off-by: Anatoly Burakov 
---
 examples/l3fwd-power/main.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index f8dfed1634..52f56dc405 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -2723,12 +2723,6 @@ main(int argc, char **argv)
printf("\nInitializing rx queues on lcore %u ... ", lcore_id );
fflush(stdout);
 
-   /* PMD power management mode can only do 1 queue per core */
-   if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) {
-   rte_exit(EXIT_FAILURE,
-   "In PMD power management mode, only one queue 
per lcore is allowed\n");
-   }
-
/* init RX queues */
for(queue = 0; queue < qconf->n_rx_queue; ++queue) {
struct rte_eth_rxconf rxq_conf;
-- 
2.25.1

[dpdk-dev] [PATCH v6 0/7] Enhancements for PMD power management

2021-07-05 Thread Anatoly Burakov

This patchset introduces several changes related to PMD power management:

- Changed monitoring intrinsics to use callbacks as a comparison function, based
  on previous patchset [1] but incorporating feedback [2] - this hopefully will
  make it possible to add support for .get_monitor_addr in virtio
- Add a new intrinsic to monitor multiple addresses, based on RTM instruction
  set and the TPAUSE instruction
- Add support for PMD power management on multiple queues, as well as all
  accompanying infrastructure and example apps changes

v6:
- Improved the algorithm for multi-queue sleep
- Fixed segfault and addressed other feedback

v5:
- Removed "power save queue" API and replaced with mechanism suggested by
  Konstantin
- Addressed other feedback

v4:
- Replaced raw number with a macro
- Fixed all the bugs found by Konstantin
- Some other minor corrections

v3:
- Moved some doc updates to NIC features list

v2:
- Changed check inversion to callbacks
- Addressed feedback from Konstantin
- Added doc updates where necessary

[1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=*
[2] 
http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274

Anatoly Burakov (7):
  power_intrinsics: use callbacks for comparison
  net/af_xdp: add power monitor support
  eal: add power monitor for multiple events
  power: remove thread safety from PMD power API's
  power: support callbacks for multiple Rx queues
  power: support monitoring multiple Rx queues
  l3fwd-power: support multiqueue in PMD pmgmt modes

 doc/guides/nics/features.rst  |  10 +
 doc/guides/prog_guide/power_man.rst   |  68 +-
 doc/guides/rel_notes/release_21_08.rst|  11 +
 drivers/event/dlb2/dlb2.c |  17 +-
 drivers/net/af_xdp/rte_eth_af_xdp.c   |  34 +
 drivers/net/i40e/i40e_rxtx.c  |  20 +-
 drivers/net/iavf/iavf_rxtx.c  |  20 +-
 drivers/net/ice/ice_rxtx.c|  20 +-
 drivers/net/ixgbe/ixgbe_rxtx.c|  20 +-
 drivers/net/mlx5/mlx5_rx.c|  17 +-
 examples/l3fwd-power/main.c   |   6 -
 lib/eal/arm/rte_power_intrinsics.c|  11 +
 lib/eal/include/generic/rte_cpuflags.h|   2 +
 .../include/generic/rte_power_intrinsics.h|  68 +-
 lib/eal/ppc/rte_power_intrinsics.c|  11 +
 lib/eal/version.map   |   3 +
 lib/eal/x86/rte_cpuflags.c|   2 +
 lib/eal/x86/rte_power_intrinsics.c|  90 ++-
 lib/power/meson.build |   3 +
 lib/power/rte_power_pmd_mgmt.c| 655 +-
 lib/power/rte_power_pmd_mgmt.h|   6 +
 21 files changed, 832 insertions(+), 262 deletions(-)

-- 
2.25.1

[dpdk-dev] [PATCH v6 1/7] power_intrinsics: use callbacks for comparison

2021-07-05 Thread Anatoly Burakov

Previously, the semantics of power monitor were such that we were
checking current value against the expected value, and if they matched,
then the sleep was aborted. This is somewhat inflexible, because it only
allowed us to check for a specific value in a specific way.

This commit replaces the comparison with a user callback mechanism, so
that any PMD (or other code) using `rte_power_monitor()` can define
their own comparison semantics and decision making on how to detect the
need to abort the entering of power optimized state.

Existing implementations are adjusted to follow the new semantics.

Suggested-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
Acked-by: Konstantin Ananyev 
---

Notes:
v4:
- Return error if callback is set to NULL
- Replace raw number with a macro in monitor condition opaque data

v2:
- Use callback mechanism for more flexibility
- Address feedback from Konstantin

 doc/guides/rel_notes/release_21_08.rst|  1 +
 drivers/event/dlb2/dlb2.c | 17 --
 drivers/net/i40e/i40e_rxtx.c  | 20 +++
 drivers/net/iavf/iavf_rxtx.c  | 20 +++
 drivers/net/ice/ice_rxtx.c| 20 +++
 drivers/net/ixgbe/ixgbe_rxtx.c| 20 +++
 drivers/net/mlx5/mlx5_rx.c| 17 --
 .../include/generic/rte_power_intrinsics.h| 33 +++
 lib/eal/x86/rte_power_intrinsics.c| 17 +-
 9 files changed, 121 insertions(+), 44 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index a6ecfdf3ce..c84ac280f5 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -84,6 +84,7 @@ API Changes
Also, make sure to start the actual text at the margin.
===
 
+* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
 
 ABI Changes
 ---
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index eca183753f..252bbd8d5e 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -3154,6 +3154,16 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num)
}
 }
 
+#define CLB_MASK_IDX 0
+#define CLB_VAL_IDX 1
+static int
+dlb2_monitor_callback(const uint64_t val,
+   const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
+{
+   /* abort if the value matches */
+   return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 : 0;
+}
+
 static inline int
 dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
  struct dlb2_eventdev_port *ev_port,
@@ -3194,8 +3204,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
expected_value = 0;
 
pmc.addr = monitor_addr;
-   pmc.val = expected_value;
-   pmc.mask = qe_mask.raw_qe[1];
+   /* store expected value and comparison mask in opaque data */
+   pmc.opaque[CLB_VAL_IDX] = expected_value;
+   pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1];
+   /* set up callback */
+   pmc.fn = dlb2_monitor_callback;
pmc.size = sizeof(uint64_t);
 
rte_power_monitor(&pmc, timeout + start_ticks);
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 6c58decece..081682f88b 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -81,6 +81,18 @@
 #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \
(PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK)
 
+static int
+i40e_monitor_callback(const uint64_t value,
+   const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ] __rte_unused)
+{
+   const uint64_t m = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+   /*
+* we expect the DD bit to be set to 1 if this descriptor was already
+* written to.
+*/
+   return (value & m) == m ? -1 : 0;
+}
+
 int
 i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
 {
@@ -93,12 +105,8 @@ i40e_get_monitor_addr(void *rx_queue, struct 
rte_power_monitor_cond *pmc)
/* watch for changes in status bit */
pmc->addr = &rxdp->wb.qword1.status_error_len;
 
-   /*
-* we expect the DD bit to be set to 1 if this descriptor was already
-* written to.
-*/
-   pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
-   pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+   /* comparison callback */
+   pmc->fn = i40e_monitor_callback;
 
/* registers are 64-bit */
pmc->size = sizeof(uint64_t);
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 0361af0d85..7ed196ec22 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rx

[dpdk-dev] [PATCH v6 2/7] net/af_xdp: add power monitor support

2021-07-05 Thread Anatoly Burakov

Implement support for .get_monitor_addr in AF_XDP driver.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Rewrite using the callback mechanism

 drivers/net/af_xdp/rte_eth_af_xdp.c | 34 +
 1 file changed, 34 insertions(+)

diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c 
b/drivers/net/af_xdp/rte_eth_af_xdp.c
index eb5660a3dc..7830d0c23a 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "compat.h"
 
@@ -788,6 +789,38 @@ eth_dev_configure(struct rte_eth_dev *dev)
return 0;
 }
 
+#define CLB_VAL_IDX 0
+static int
+eth_monitor_callback(const uint64_t value,
+   const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
+{
+   const uint64_t v = opaque[CLB_VAL_IDX];
+   const uint64_t m = (uint32_t)~0;
+
+   /* if the value has changed, abort entering power optimized state */
+   return (value & m) == v ? 0 : -1;
+}
+
+static int
+eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
+{
+   struct pkt_rx_queue *rxq = rx_queue;
+   unsigned int *prod = rxq->rx.producer;
+   const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */
+
+   /* watch for changes in producer ring */
+   pmc->addr = (void*)prod;
+
+   /* store current value */
+   pmc->opaque[CLB_VAL_IDX] = cur_val;
+   pmc->fn = eth_monitor_callback;
+
+   /* AF_XDP producer ring index is 32-bit */
+   pmc->size = sizeof(uint32_t);
+
+   return 0;
+}
+
 static int
 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
@@ -1448,6 +1481,7 @@ static const struct eth_dev_ops ops = {
.link_update = eth_link_update,
.stats_get = eth_stats_get,
.stats_reset = eth_stats_reset,
+   .get_monitor_addr = eth_get_monitor_addr
 };
 
 /** parse busy_budget argument */
-- 
2.25.1

[dpdk-dev] [PATCH v6 3/7] eal: add power monitor for multiple events

2021-07-05 Thread Anatoly Burakov

Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
what UMWAIT does, but without the limitation of having to listen for
just one event. This works because the optimized power state used by the
TPAUSE instruction will cause a wake up on RTM transaction abort, so if
we add the addresses we're interested in to the read-set, any write to
those addresses will wake us up.

Signed-off-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Fixed bugs in accessing the monitor condition
- Abort on any monitor condition not having a defined callback

v2:
- Adapt to callback mechanism

 doc/guides/rel_notes/release_21_08.rst|  2 +
 lib/eal/arm/rte_power_intrinsics.c| 11 +++
 lib/eal/include/generic/rte_cpuflags.h|  2 +
 .../include/generic/rte_power_intrinsics.h| 35 +
 lib/eal/ppc/rte_power_intrinsics.c| 11 +++
 lib/eal/version.map   |  3 +
 lib/eal/x86/rte_cpuflags.c|  2 +
 lib/eal/x86/rte_power_intrinsics.c| 73 +++
 8 files changed, 139 insertions(+)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index c84ac280f5..9d1cfac395 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -55,6 +55,8 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+* eal: added ``rte_power_monitor_multi`` to support waiting for multiple events.
+
 
 Removed Items
 -
diff --git a/lib/eal/arm/rte_power_intrinsics.c 
b/lib/eal/arm/rte_power_intrinsics.c
index e83f04072a..78f55b7203 100644
--- a/lib/eal/arm/rte_power_intrinsics.c
+++ b/lib/eal/arm/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp)
+{
+   RTE_SET_USED(pmc);
+   RTE_SET_USED(num);
+   RTE_SET_USED(tsc_timestamp);
+
+   return -ENOTSUP;
+}
diff --git a/lib/eal/include/generic/rte_cpuflags.h 
b/lib/eal/include/generic/rte_cpuflags.h
index 28a5aecde8..d35551e931 100644
--- a/lib/eal/include/generic/rte_cpuflags.h
+++ b/lib/eal/include/generic/rte_cpuflags.h
@@ -24,6 +24,8 @@ struct rte_cpu_intrinsics {
/**< indicates support for rte_power_monitor function */
uint32_t power_pause : 1;
/**< indicates support for rte_power_pause function */
+   uint32_t power_monitor_multi : 1;
+   /**< indicates support for rte_power_monitor_multi function */
 };
 
 /**
diff --git a/lib/eal/include/generic/rte_power_intrinsics.h 
b/lib/eal/include/generic/rte_power_intrinsics.h
index c9aa52a86d..04e8c2ab37 100644
--- a/lib/eal/include/generic/rte_power_intrinsics.h
+++ b/lib/eal/include/generic/rte_power_intrinsics.h
@@ -128,4 +128,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id);
 __rte_experimental
 int rte_power_pause(const uint64_t tsc_timestamp);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Monitor a set of addresses for changes. This will cause the CPU to enter an
+ * architecture-defined optimized power state until either one of the specified
+ * memory addresses is written to, a certain TSC timestamp is reached, or other
+ * reasons cause the CPU to wake up.
+ *
+ * Additionally, each monitoring condition provides a comparison callback and
+ * opaque data. The callback is given the current value at the monitored
+ * address, and if any callback requests an abort, the entering of optimized
+ * power state may be aborted.
+ *
+ * @warning It is responsibility of the user to check if this function is
+ *   supported at runtime using `rte_cpu_get_intrinsics_support()` API call.
+ *   Failing to do so may result in an illegal CPU instruction error.
+ *
+ * @param pmc
+ *   An array of monitoring condition structures.
+ * @param num
+ *   Length of the `pmc` array.
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for. Note that the wait behavior is
+ *   architecture-dependent.
+ *
+ * @return
+ *   0 on success
+ *   -EINVAL on invalid parameters
+ *   -ENOTSUP if unsupported
+ */
+__rte_experimental
+int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp);
+
 #endif /* _RTE_POWER_INTRINSIC_H_ */
diff --git a/lib/eal/ppc/rte_power_intrinsics.c 
b/lib/eal/ppc/rte_power_intrinsics.c
index 7fc9586da7..f00b58ade5 100644
--- a/lib/eal/ppc/rte_power_intrinsics.c
+++ b/lib/eal/ppc/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+

[dpdk-dev] [PATCH v6 4/7] power: remove thread safety from PMD power API's

2021-07-05 Thread Anatoly Burakov

Currently, we expect that only one callback can be active at any given
moment, for a particular queue configuration, which is relatively easy
to implement in a thread-safe way. However, we're about to add support
for multiple queues per lcore, which will greatly increase the
possibility of various race conditions.

We could have used something like an RCU for this use case, but absent
a pressing need for thread safety we'll go the easy way and just
mandate that the APIs are to be called when all affected ports are
stopped, and document this limitation. This greatly simplifies the
`rte_power_monitor`-related code.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Add check for stopped queue
- Clarified doc message
- Added release notes

 doc/guides/rel_notes/release_21_08.rst |   5 +
 lib/power/meson.build  |   3 +
 lib/power/rte_power_pmd_mgmt.c | 133 ++---
 lib/power/rte_power_pmd_mgmt.h |   6 ++
 4 files changed, 67 insertions(+), 80 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index 9d1cfac395..f015c509fc 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -88,6 +88,11 @@ API Changes
 
 * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
 
+* rte_power: The experimental PMD power management API is no longer considered
+  to be thread safe; all Rx queues affected by the API will now need to be
+  stopped before making any changes to the power management scheme.
+
+
 ABI Changes
 ---
 
diff --git a/lib/power/meson.build b/lib/power/meson.build
index c1097d32f1..4f6a242364 100644
--- a/lib/power/meson.build
+++ b/lib/power/meson.build
@@ -21,4 +21,7 @@ headers = files(
 'rte_power_pmd_mgmt.h',
 'rte_power_guest_channel.h',
 )
+if cc.has_argument('-Wno-cast-qual')
+cflags += '-Wno-cast-qual'
+endif
 deps += ['timer', 'ethdev']
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index db03cbf420..9b95cf1794 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -40,8 +40,6 @@ struct pmd_queue_cfg {
/**< Callback mode for this queue */
const struct rte_eth_rxtx_callback *cur_cb;
/**< Callback instance */
-   volatile bool umwait_in_progress;
-   /**< are we currently sleeping? */
uint64_t empty_poll_stats;
/**< Number of empty polls */
 } __rte_cache_aligned;
@@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf 
**pkts __rte_unused,
struct rte_power_monitor_cond pmc;
uint16_t ret;
 
-   /*
-* we might get a cancellation request while being
-* inside the callback, in which case the wakeup
-* wouldn't work because it would've arrived too early.
-*
-* to get around this, we notify the other thread that
-* we're sleeping, so that it can spin until we're done.
-* unsolicited wakeups are perfectly safe.
-*/
-   q_conf->umwait_in_progress = true;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-   /* check if we need to cancel sleep */
-   if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
-   /* use monitoring condition to sleep */
-   ret = rte_eth_get_monitor_addr(port_id, qidx,
-   &pmc);
-   if (ret == 0)
-   rte_power_monitor(&pmc, UINT64_MAX);
-   }
-   q_conf->umwait_in_progress = false;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+   /* use monitoring condition to sleep */
+   ret = rte_eth_get_monitor_addr(port_id, qidx,
+   &pmc);
+   if (ret == 0)
+   rte_power_monitor(&pmc, UINT64_MAX);
}
} else
q_conf->empty_poll_stats = 0;
@@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx,
return nb_rx;
 }
 
+static int
+queue_stopped(const uint16_t port_id, const uint16_t queue_id)
+{
+   struct rte_eth_rxq_info qinfo;
+
+   if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) < 0)
+   return -1;
+
+   return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED;
+}
+
 int
 rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint

[dpdk-dev] [PATCH v6 5/7] power: support callbacks for multiple Rx queues

2021-07-05 Thread Anatoly Burakov

Currently, there is a hard limitation on the PMD power management
support that only allows it to support a single queue per lcore. This is
not ideal as most DPDK use cases will poll multiple queues per core.

The PMD power management mechanism relies on ethdev Rx callbacks, so it
is very difficult to implement such support because callbacks are
effectively stateless and have no visibility into what the other ethdev
devices are doing. This places limitations on what we can do within the
framework of Rx callbacks, but the basics of this implementation are as
follows:

- Replace per-queue structures with per-lcore ones, so that any device
  polled from the same lcore can share data
- Any queue that is going to be polled from a specific lcore has to be
  added to the list of queues to poll, so that the callback is aware of
  other queues being polled by the same lcore
- Both the empty poll counter and the actual power saving mechanism are
  shared between all queues polled on a particular lcore, and are only
  activated when all queues in the list were polled and were determined
  to have no traffic.
- The limitation on UMWAIT-based polling is not removed because UMWAIT
  is incapable of monitoring more than one address.

Also, while we're at it, update and improve the docs.

Signed-off-by: Anatoly Burakov 
---

Notes:
v6:
- Track each individual queue sleep status (Konstantin)
- Fix segfault (Dave)

v5:
- Remove the "power save queue" API and replace it with the mechanism
  suggested by Konstantin

v3:
- Move the list of supported NICs to NIC feature table

v2:
- Use a TAILQ for queues instead of a static array
- Address feedback from Konstantin
- Add additional checks for stopped queues

 doc/guides/nics/features.rst   |  10 +
 doc/guides/prog_guide/power_man.rst|  65 ++--
 doc/guides/rel_notes/release_21_08.rst |   3 +
 lib/power/rte_power_pmd_mgmt.c | 452 +++--
 4 files changed, 394 insertions(+), 136 deletions(-)

diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst
index 403c2b03a3..a96e12d155 100644
--- a/doc/guides/nics/features.rst
+++ b/doc/guides/nics/features.rst
@@ -912,6 +912,16 @@ Supports to get Rx/Tx packet burst mode information.
 * **[implements] eth_dev_ops**: ``rx_burst_mode_get``, ``tx_burst_mode_get``.
 * **[related] API**: ``rte_eth_rx_burst_mode_get()``, 
``rte_eth_tx_burst_mode_get()``.
 
+.. _nic_features_get_monitor_addr:
+
+PMD power management using monitor addresses
+--------------------------------------------
+
+Supports getting a monitoring condition to use together with Ethernet PMD power
+management (see :doc:`../prog_guide/power_man` for more details).
+
+* **[implements] eth_dev_ops**: ``get_monitor_addr``
+
 .. _nic_features_other:
 
 Other dev ops not represented by a Feature
diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index c70ae128ac..ec04a72108 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -198,34 +198,41 @@ Ethernet PMD Power Management API
 Abstract
 
 
-Existing power management mechanisms require developers
-to change application design or change code to make use of it.
-The PMD power management API provides a convenient alternative
-by utilizing Ethernet PMD RX callbacks,
-and triggering power saving whenever empty poll count reaches a certain number.
-
-Monitor
-   This power saving scheme will put the CPU into optimized power state
-   and use the ``rte_power_monitor()`` function
-   to monitor the Ethernet PMD RX descriptor address,
-   and wake the CPU up whenever there's new traffic.
-
-Pause
-   This power saving scheme will avoid busy polling
-   by either entering power-optimized sleep state
-   with ``rte_power_pause()`` function,
-   or, if it's not available, use ``rte_pause()``.
-
-Frequency scaling
-   This power saving scheme will use ``librte_power`` library
-   functionality to scale the core frequency up/down
-   depending on traffic volume.
-
-.. note::
-
-   Currently, this power management API is limited to mandatory mapping
-   of 1 queue to 1 core (multiple queues are supported,
-   but they must be polled from different cores).
+Existing power management mechanisms require developers to change application
+design or change code to make use of it. The PMD power management API provides a
+convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering
+power saving whenever empty poll count reaches a certain number.
+
+* Monitor
+   This power saving scheme will put the CPU into optimized power state and
+   monitor the Ethernet PMD RX descriptor address, waking the CPU up whenever
+   there's new traffic. Support for this scheme may not be available on all
+   platforms, and further limitations may apply (see below).
+
+* Pause
+   This power saving scheme will avoid busy pollin

[dpdk-dev] [PATCH v6 6/7] power: support monitoring multiple Rx queues

2021-07-05 Thread Anatoly Burakov

Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
Rx queues while entering the energy efficient power state. The multi
version will be used unconditionally if supported, and the UMWAIT one
will only be used when multi-monitor is not supported by the hardware.

Signed-off-by: Anatoly Burakov 
---

Notes:
v6:
- Fix the missed feedback from v5

v4:
- Fix possible out of bounds access
- Added missing index increment

 doc/guides/prog_guide/power_man.rst |  9 ++--
 lib/power/rte_power_pmd_mgmt.c  | 82 -
 2 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index ec04a72108..94353ca012 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -221,13 +221,16 @@ power saving whenever empty poll count reaches a certain 
number.
 The "monitor" mode is only supported in the following configurations and 
scenarios:
 
 * If ``rte_cpu_get_intrinsics_support()`` function indicates that
+  ``rte_power_monitor_multi()`` function is supported by the platform, then
+  monitoring multiple Ethernet Rx queues for traffic will be supported.
+
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
   ``rte_power_monitor()`` is supported by the platform, then monitoring will be
   limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
   monitored from a different lcore).
 
-* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
-  ``rte_power_monitor()`` function is not supported, then monitor mode will not
-  be supported.
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of the
+  two monitoring functions is supported, then monitor mode will not be supported.
 
 * Not all Ethernet drivers support monitoring, even if the underlying
   platform may support the necessary CPU instructions. Please refer to
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index 9ffeda05ed..0c45469619 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -126,6 +126,32 @@ queue_list_take(struct pmd_core_cfg *cfg, const union 
queue *q)
return found;
 }
 
+static inline int
+get_monitor_addresses(struct pmd_core_cfg *cfg,
+   struct rte_power_monitor_cond *pmc, size_t len)
+{
+   const struct queue_list_entry *qle;
+   size_t i = 0;
+   int ret;
+
+   TAILQ_FOREACH(qle, &cfg->head, next) {
+   const union queue *q = &qle->queue;
+   struct rte_power_monitor_cond *cur;
+
+   /* attempted out of bounds access */
+   if (i >= len) {
+   RTE_LOG(ERR, POWER, "Too many queues being 
monitored\n");
+   return -1;
+   }
+
+   cur = &pmc[i++];
+   ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
+   if (ret < 0)
+   return ret;
+   }
+   return 0;
+}
+
 static void
 calc_tsc(void)
 {
@@ -211,6 +237,46 @@ lcore_can_sleep(struct pmd_core_cfg *cfg)
return true;
 }
 
+static uint16_t
+clb_multiwait(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
+   struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+   uint16_t max_pkts __rte_unused, void *arg)
+{
+   const unsigned int lcore = rte_lcore_id();
+   struct queue_list_entry *queue_conf = arg;
+   struct pmd_core_cfg *lcore_conf;
+   const bool empty = nb_rx == 0;
+
+   lcore_conf = &lcore_cfgs[lcore];
+
+   /* early exit */
+   if (likely(!empty))
+   /* early exit */
+   queue_reset(lcore_conf, queue_conf);
+   else {
+   struct rte_power_monitor_cond pmc[lcore_conf->n_queues];
+   int ret;
+
+   /* can this queue sleep? */
+   if (!queue_can_sleep(lcore_conf, queue_conf))
+   return nb_rx;
+
+   /* can this lcore sleep? */
+   if (!lcore_can_sleep(lcore_conf))
+   return nb_rx;
+
+   /* gather all monitoring conditions */
+   ret = get_monitor_addresses(lcore_conf, pmc,
+   lcore_conf->n_queues);
+   if (ret < 0)
+   return nb_rx;
+
+   rte_power_monitor_multi(pmc, lcore_conf->n_queues, UINT64_MAX);
+   }
+
+   return nb_rx;
+}
+
 static uint16_t
 clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf **pkts 
__rte_unused,
uint16_t nb_rx, uint16_t max_pkts __rte_unused, void *arg)
@@ -362,14 +428,19 @@ static int
 check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata)
 {
struct rte_power_monitor_cond dummy;
+   bool multimonitor_supported;

[dpdk-dev] [PATCH v6 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes

2021-07-05 Thread Anatoly Burakov

Currently, l3fwd-power enforces the limitation of having one queue per
lcore. This is no longer necessary, so remove the limitation.

Signed-off-by: Anatoly Burakov 
---
 examples/l3fwd-power/main.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index f8dfed1634..52f56dc405 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -2723,12 +2723,6 @@ main(int argc, char **argv)
printf("\nInitializing rx queues on lcore %u ... ", lcore_id );
fflush(stdout);
 
-   /* PMD power management mode can only do 1 queue per core */
-   if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) {
-   rte_exit(EXIT_FAILURE,
-   "In PMD power management mode, only one queue 
per lcore is allowed\n");
-   }
-
/* init RX queues */
for(queue = 0; queue < qconf->n_rx_queue; ++queue) {
struct rte_eth_rxconf rxq_conf;
-- 
2.25.1

[dpdk-dev] [PATCH v7 0/7] Enhancements for PMD power management

2021-07-07 Thread Anatoly Burakov

This patchset introduces several changes related to PMD power management:

- Changed monitoring intrinsics to use callbacks as a comparison function, based
  on previous patchset [1] but incorporating feedback [2] - this hopefully will
  make it possible to add support for .get_monitor_addr in virtio
- Add a new intrinsic to monitor multiple addresses, based on RTM instruction
  set and the TPAUSE instruction
- Add support for PMD power management on multiple queues, as well as all
  accompanying infrastructure and example apps changes

v7:
- Fixed various bugs

v6:
- Improved the algorithm for multi-queue sleep
- Fixed segfault and addressed other feedback

v5:
- Removed "power save queue" API and replaced with mechanism suggested by
  Konstantin
- Addressed other feedback

v4:
- Replaced raw number with a macro
- Fixed all the bugs found by Konstantin
- Some other minor corrections

v3:
- Moved some doc updates to NIC features list

v2:
- Changed check inversion to callbacks
- Addressed feedback from Konstantin
- Added doc updates where necessary

[1] http://patches.dpdk.org/project/dpdk/list/?series=16930&state=*
[2] 
http://patches.dpdk.org/project/dpdk/patch/819ef1ace187365a615d3383e54579e3d9fb216e.1620747068.git.anatoly.bura...@intel.com/#133274

Anatoly Burakov (7):
  power_intrinsics: use callbacks for comparison
  net/af_xdp: add power monitor support
  eal: add power monitor for multiple events
  power: remove thread safety from PMD power API's
  power: support callbacks for multiple Rx queues
  power: support monitoring multiple Rx queues
  l3fwd-power: support multiqueue in PMD pmgmt modes

 doc/guides/nics/features.rst  |  10 +
 doc/guides/prog_guide/power_man.rst   |  74 +-
 doc/guides/rel_notes/release_21_08.rst|   9 +
 drivers/event/dlb2/dlb2.c |  17 +-
 drivers/net/af_xdp/rte_eth_af_xdp.c   |  34 +
 drivers/net/i40e/i40e_rxtx.c  |  20 +-
 drivers/net/iavf/iavf_rxtx.c  |  20 +-
 drivers/net/ice/ice_rxtx.c|  20 +-
 drivers/net/ixgbe/ixgbe_rxtx.c|  20 +-
 drivers/net/mlx5/mlx5_rx.c|  17 +-
 examples/l3fwd-power/main.c   |   6 -
 lib/eal/arm/rte_power_intrinsics.c|  11 +
 lib/eal/include/generic/rte_cpuflags.h|   2 +
 .../include/generic/rte_power_intrinsics.h|  68 +-
 lib/eal/ppc/rte_power_intrinsics.c|  11 +
 lib/eal/version.map   |   3 +
 lib/eal/x86/rte_cpuflags.c|   2 +
 lib/eal/x86/rte_power_intrinsics.c|  90 ++-
 lib/power/meson.build |   3 +
 lib/power/rte_power_pmd_mgmt.c| 659 +-
 lib/power/rte_power_pmd_mgmt.h|   6 +
 21 files changed, 840 insertions(+), 262 deletions(-)

-- 
2.25.1

[dpdk-dev] [PATCH v7 1/7] power_intrinsics: use callbacks for comparison

2021-07-07 Thread Anatoly Burakov

Previously, the semantics of power monitor were such that we were
checking current value against the expected value, and if they matched,
then the sleep was aborted. This is somewhat inflexible, because it only
allowed us to check for a specific value in a specific way.

This commit replaces the comparison with a user callback mechanism, so
that any PMD (or other code) using `rte_power_monitor()` can define
their own comparison semantics and decision making on how to detect the
need to abort the entering of power optimized state.

Existing implementations are adjusted to follow the new semantics.

Suggested-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
Acked-by: Konstantin Ananyev 
---

Notes:
v4:
- Return error if callback is set to NULL
- Replace raw number with a macro in monitor condition opaque data

v2:
- Use callback mechanism for more flexibility
- Address feedback from Konstantin

 doc/guides/rel_notes/release_21_08.rst|  2 ++
 drivers/event/dlb2/dlb2.c | 17 --
 drivers/net/i40e/i40e_rxtx.c  | 20 +++
 drivers/net/iavf/iavf_rxtx.c  | 20 +++
 drivers/net/ice/ice_rxtx.c| 20 +++
 drivers/net/ixgbe/ixgbe_rxtx.c| 20 +++
 drivers/net/mlx5/mlx5_rx.c| 17 --
 .../include/generic/rte_power_intrinsics.h| 33 +++
 lib/eal/x86/rte_power_intrinsics.c| 17 +-
 9 files changed, 122 insertions(+), 44 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index cd02820e68..c1d063bb11 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -117,6 +117,8 @@ API Changes
 * eal: ``rte_strscpy`` sets ``rte_errno`` to ``E2BIG`` in case of string
   truncation.
 
+* eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
+
 
 ABI Changes
 ---
diff --git a/drivers/event/dlb2/dlb2.c b/drivers/event/dlb2/dlb2.c
index eca183753f..252bbd8d5e 100644
--- a/drivers/event/dlb2/dlb2.c
+++ b/drivers/event/dlb2/dlb2.c
@@ -3154,6 +3154,16 @@ dlb2_port_credits_inc(struct dlb2_port *qm_port, int num)
}
 }
 
+#define CLB_MASK_IDX 0
+#define CLB_VAL_IDX 1
+static int
+dlb2_monitor_callback(const uint64_t val,
+   const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
+{
+   /* abort if the value matches */
+   return (val & opaque[CLB_MASK_IDX]) == opaque[CLB_VAL_IDX] ? -1 : 0;
+}
+
 static inline int
 dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
  struct dlb2_eventdev_port *ev_port,
@@ -3194,8 +3204,11 @@ dlb2_dequeue_wait(struct dlb2_eventdev *dlb2,
expected_value = 0;
 
pmc.addr = monitor_addr;
-   pmc.val = expected_value;
-   pmc.mask = qe_mask.raw_qe[1];
+   /* store expected value and comparison mask in opaque data */
+   pmc.opaque[CLB_VAL_IDX] = expected_value;
+   pmc.opaque[CLB_MASK_IDX] = qe_mask.raw_qe[1];
+   /* set up callback */
+   pmc.fn = dlb2_monitor_callback;
pmc.size = sizeof(uint64_t);
 
rte_power_monitor(&pmc, timeout + start_ticks);
diff --git a/drivers/net/i40e/i40e_rxtx.c b/drivers/net/i40e/i40e_rxtx.c
index 8d65f287f4..65f325ede1 100644
--- a/drivers/net/i40e/i40e_rxtx.c
+++ b/drivers/net/i40e/i40e_rxtx.c
@@ -81,6 +81,18 @@
 #define I40E_TX_OFFLOAD_SIMPLE_NOTSUP_MASK \
(PKT_TX_OFFLOAD_MASK ^ I40E_TX_OFFLOAD_SIMPLE_SUP_MASK)
 
+static int
+i40e_monitor_callback(const uint64_t value,
+   const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ] __rte_unused)
+{
+   const uint64_t m = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+   /*
+* we expect the DD bit to be set to 1 if this descriptor was already
+* written to.
+*/
+   return (value & m) == m ? -1 : 0;
+}
+
 int
 i40e_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
 {
@@ -93,12 +105,8 @@ i40e_get_monitor_addr(void *rx_queue, struct 
rte_power_monitor_cond *pmc)
/* watch for changes in status bit */
pmc->addr = &rxdp->wb.qword1.status_error_len;
 
-   /*
-* we expect the DD bit to be set to 1 if this descriptor was already
-* written to.
-*/
-   pmc->val = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
-   pmc->mask = rte_cpu_to_le_64(1 << I40E_RX_DESC_STATUS_DD_SHIFT);
+   /* comparison callback */
+   pmc->fn = i40e_monitor_callback;
 
/* registers are 64-bit */
pmc->size = sizeof(uint64_t);
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index f817fbc49b..d61b32fcee 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -57,6 +57,

[dpdk-dev] [PATCH v7 2/7] net/af_xdp: add power monitor support

2021-07-07 Thread Anatoly Burakov

Implement support for .get_monitor_addr in AF_XDP driver.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Rewrite using the callback mechanism

 drivers/net/af_xdp/rte_eth_af_xdp.c | 34 +
 1 file changed, 34 insertions(+)

diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c 
b/drivers/net/af_xdp/rte_eth_af_xdp.c
index eb5660a3dc..7830d0c23a 100644
--- a/drivers/net/af_xdp/rte_eth_af_xdp.c
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "compat.h"
 
@@ -788,6 +789,38 @@ eth_dev_configure(struct rte_eth_dev *dev)
return 0;
 }
 
+#define CLB_VAL_IDX 0
+static int
+eth_monitor_callback(const uint64_t value,
+   const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])
+{
+   const uint64_t v = opaque[CLB_VAL_IDX];
+   const uint64_t m = (uint32_t)~0;
+
+   /* if the value has changed, abort entering power optimized state */
+   return (value & m) == v ? 0 : -1;
+}
+
+static int
+eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
+{
+   struct pkt_rx_queue *rxq = rx_queue;
+   unsigned int *prod = rxq->rx.producer;
+   const uint32_t cur_val = rxq->rx.cached_prod; /* use cached value */
+
+   /* watch for changes in producer ring */
+   pmc->addr = (void*)prod;
+
+   /* store current value */
+   pmc->opaque[CLB_VAL_IDX] = cur_val;
+   pmc->fn = eth_monitor_callback;
+
+   /* AF_XDP producer ring index is 32-bit */
+   pmc->size = sizeof(uint32_t);
+
+   return 0;
+}
+
 static int
 eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
@@ -1448,6 +1481,7 @@ static const struct eth_dev_ops ops = {
.link_update = eth_link_update,
.stats_get = eth_stats_get,
.stats_reset = eth_stats_reset,
+   .get_monitor_addr = eth_get_monitor_addr
 };
 
 /** parse busy_budget argument */
-- 
2.25.1

[dpdk-dev] [PATCH v7 3/7] eal: add power monitor for multiple events

2021-07-07 Thread Anatoly Burakov

Use RTM and WAITPKG instructions to perform a wait-for-writes similar to
what UMWAIT does, but without the limitation of having to listen for
just one event. This works because the optimized power state used by the
TPAUSE instruction will cause a wake up on RTM transaction abort, so if
we add the addresses we're interested in to the read-set, any write to
those addresses will wake us up.

Signed-off-by: Konstantin Ananyev 
Signed-off-by: Anatoly Burakov 
---

Notes:
v4:
- Fixed bugs in accessing the monitor condition
- Abort on any monitor condition not having a defined callback

v2:
- Adapt to callback mechanism

 lib/eal/arm/rte_power_intrinsics.c| 11 +++
 lib/eal/include/generic/rte_cpuflags.h|  2 +
 .../include/generic/rte_power_intrinsics.h| 35 +
 lib/eal/ppc/rte_power_intrinsics.c| 11 +++
 lib/eal/version.map   |  3 +
 lib/eal/x86/rte_cpuflags.c|  2 +
 lib/eal/x86/rte_power_intrinsics.c| 73 +++
 7 files changed, 137 insertions(+)

diff --git a/lib/eal/arm/rte_power_intrinsics.c 
b/lib/eal/arm/rte_power_intrinsics.c
index e83f04072a..78f55b7203 100644
--- a/lib/eal/arm/rte_power_intrinsics.c
+++ b/lib/eal/arm/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp)
+{
+   RTE_SET_USED(pmc);
+   RTE_SET_USED(num);
+   RTE_SET_USED(tsc_timestamp);
+
+   return -ENOTSUP;
+}
diff --git a/lib/eal/include/generic/rte_cpuflags.h 
b/lib/eal/include/generic/rte_cpuflags.h
index 28a5aecde8..d35551e931 100644
--- a/lib/eal/include/generic/rte_cpuflags.h
+++ b/lib/eal/include/generic/rte_cpuflags.h
@@ -24,6 +24,8 @@ struct rte_cpu_intrinsics {
/**< indicates support for rte_power_monitor function */
uint32_t power_pause : 1;
/**< indicates support for rte_power_pause function */
+   uint32_t power_monitor_multi : 1;
+   /**< indicates support for rte_power_monitor_multi function */
 };
 
 /**
diff --git a/lib/eal/include/generic/rte_power_intrinsics.h 
b/lib/eal/include/generic/rte_power_intrinsics.h
index c9aa52a86d..04e8c2ab37 100644
--- a/lib/eal/include/generic/rte_power_intrinsics.h
+++ b/lib/eal/include/generic/rte_power_intrinsics.h
@@ -128,4 +128,39 @@ int rte_power_monitor_wakeup(const unsigned int lcore_id);
 __rte_experimental
 int rte_power_pause(const uint64_t tsc_timestamp);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Monitor a set of addresses for changes. This will cause the CPU to enter an
+ * architecture-defined optimized power state until either one of the specified
+ * memory addresses is written to, a certain TSC timestamp is reached, or other
+ * reasons cause the CPU to wake up.
+ *
+ * Additionally, each monitoring condition provides a comparison callback. The
+ * callback is given the current value read from the monitored address, and if
+ * it indicates that the wait should be aborted, the entering of optimized
+ * power state may be aborted.
+ *
+ * @warning It is responsibility of the user to check if this function is
+ *   supported at runtime using `rte_cpu_get_intrinsics_support()` API call.
+ *   Failing to do so may result in an illegal CPU instruction error.
+ *
+ * @param pmc
+ *   An array of monitoring condition structures.
+ * @param num
+ *   Length of the `pmc` array.
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for. Note that the wait behavior is
+ *   architecture-dependent.
+ *
+ * @return
+ *   0 on success
+ *   -EINVAL on invalid parameters
+ *   -ENOTSUP if unsupported
+ */
+__rte_experimental
+int rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp);
+
 #endif /* _RTE_POWER_INTRINSIC_H_ */
diff --git a/lib/eal/ppc/rte_power_intrinsics.c 
b/lib/eal/ppc/rte_power_intrinsics.c
index 7fc9586da7..f00b58ade5 100644
--- a/lib/eal/ppc/rte_power_intrinsics.c
+++ b/lib/eal/ppc/rte_power_intrinsics.c
@@ -38,3 +38,14 @@ rte_power_monitor_wakeup(const unsigned int lcore_id)
 
return -ENOTSUP;
 }
+
+int
+rte_power_monitor_multi(const struct rte_power_monitor_cond pmc[],
+   const uint32_t num, const uint64_t tsc_timestamp)
+{
+   RTE_SET_USED(pmc);
+   RTE_SET_USED(num);
+   RTE_SET_USED(tsc_timestamp);
+
+   return -ENOTSUP;
+}
diff --git a/lib/eal/version.map b/lib/eal/version.map
index fe5c3dac98..4ccd5475d6 100644
--- a/lib/eal/version.map
+++ b/lib/eal/version.map
@@ -423,6 +423,9 @@ EXPERIMENTAL {
rte_version_release; # WINDOWS_NO_EXPORT
rte_version_suffix; # WINDOWS_NO_EXPORT
rte_version_year; # WINDOWS_NO_EXPORT
+
+   # added in 21.08
+   rte_power

[dpdk-dev] [PATCH v7 4/7] power: remove thread safety from PMD power API's

2021-07-07 Thread Anatoly Burakov

Currently, we expect that only one callback can be active at any given
moment, for a particular queue configuration, which is relatively easy
to implement in a thread-safe way. However, we're about to add support
for multiple queues per lcore, which will greatly increase the
possibility of various race conditions.

We could have used something like RCU for this use case, but absent a
pressing need for thread safety we'll go the easy way and just mandate
that the APIs are to be called when all affected ports are stopped, and
document this limitation. This greatly simplifies the
`rte_power_monitor`-related code.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2:
- Add check for stopped queue
- Clarified doc message
- Added release notes

 doc/guides/rel_notes/release_21_08.rst |   4 +
 lib/power/meson.build  |   3 +
 lib/power/rte_power_pmd_mgmt.c | 133 ++---
 lib/power/rte_power_pmd_mgmt.h |   6 ++
 4 files changed, 66 insertions(+), 80 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index c1d063bb11..4b84c89c0b 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -119,6 +119,10 @@ API Changes
 
 * eal: the ``rte_power_intrinsics`` API changed to use a callback mechanism.
 
+* rte_power: The experimental PMD power management API is no longer considered
+  to be thread safe; all Rx queues affected by the API will now need to be
+  stopped before making any changes to the power management scheme.
+
 
 ABI Changes
 ---
diff --git a/lib/power/meson.build b/lib/power/meson.build
index c1097d32f1..4f6a242364 100644
--- a/lib/power/meson.build
+++ b/lib/power/meson.build
@@ -21,4 +21,7 @@ headers = files(
 'rte_power_pmd_mgmt.h',
 'rte_power_guest_channel.h',
 )
+if cc.has_argument('-Wno-cast-qual')
+cflags += '-Wno-cast-qual'
+endif
 deps += ['timer', 'ethdev']
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index db03cbf420..9b95cf1794 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -40,8 +40,6 @@ struct pmd_queue_cfg {
/**< Callback mode for this queue */
const struct rte_eth_rxtx_callback *cur_cb;
/**< Callback instance */
-   volatile bool umwait_in_progress;
-   /**< are we currently sleeping? */
uint64_t empty_poll_stats;
/**< Number of empty polls */
 } __rte_cache_aligned;
@@ -92,30 +90,11 @@ clb_umwait(uint16_t port_id, uint16_t qidx, struct rte_mbuf 
**pkts __rte_unused,
struct rte_power_monitor_cond pmc;
uint16_t ret;
 
-   /*
-* we might get a cancellation request while being
-* inside the callback, in which case the wakeup
-* wouldn't work because it would've arrived too early.
-*
-* to get around this, we notify the other thread that
-* we're sleeping, so that it can spin until we're done.
-* unsolicited wakeups are perfectly safe.
-*/
-   q_conf->umwait_in_progress = true;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
-
-   /* check if we need to cancel sleep */
-   if (q_conf->pwr_mgmt_state == PMD_MGMT_ENABLED) {
-   /* use monitoring condition to sleep */
-   ret = rte_eth_get_monitor_addr(port_id, qidx,
-   &pmc);
-   if (ret == 0)
-   rte_power_monitor(&pmc, UINT64_MAX);
-   }
-   q_conf->umwait_in_progress = false;
-
-   rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
+   /* use monitoring condition to sleep */
+   ret = rte_eth_get_monitor_addr(port_id, qidx,
+   &pmc);
+   if (ret == 0)
+   rte_power_monitor(&pmc, UINT64_MAX);
}
} else
q_conf->empty_poll_stats = 0;
@@ -177,12 +156,24 @@ clb_scale_freq(uint16_t port_id, uint16_t qidx,
return nb_rx;
 }
 
+static int
+queue_stopped(const uint16_t port_id, const uint16_t queue_id)
+{
+   struct rte_eth_rxq_info qinfo;
+
+   if (rte_eth_rx_queue_info_get(port_id, queue_id, &qinfo) < 0)
+   return -1;
+
+   return qinfo.queue_state == RTE_ETH_QUEUE_STATE_STOPPED;
+}
+
 int
 rte_power_ethdev_pmgmt_queue_enable(unsigned int lcore_id, uint

[dpdk-dev] [PATCH v7 5/7] power: support callbacks for multiple Rx queues

2021-07-07 Thread Anatoly Burakov

Currently, there is a hard limitation on the PMD power management
support that only allows it to support a single queue per lcore. This is
not ideal as most DPDK use cases will poll multiple queues per core.

The PMD power management mechanism relies on ethdev Rx callbacks, so it
is very difficult to implement such support because callbacks are
effectively stateless and have no visibility into what the other ethdev
devices are doing. This places limitations on what we can do within the
framework of Rx callbacks, but the basics of this implementation are as
follows:

- Replace per-queue structures with per-lcore ones, so that any device
  polled from the same lcore can share data
- Any queue that is going to be polled from a specific lcore has to be
  added to the list of queues to poll, so that the callback is aware of
  other queues being polled by the same lcore
- Both the empty poll counter and the actual power saving mechanism are
  shared between all queues polled on a particular lcore, and are only
  activated when all queues in the list were polled and were determined
  to have no traffic.
- The limitation on UMWAIT-based polling is not removed because UMWAIT
  is incapable of monitoring more than one address.

Also, while we're at it, update and improve the docs.

Signed-off-by: Anatoly Burakov 
---

Notes:
v7:
- Fix bug where initial sleep target was always set to zero
- Fix logic in handling of n_queues_ready_to_sleep counter
- Update documentation on hardware requirements

v6:
- Track each individual queue sleep status (Konstantin)
- Fix segfault (Dave)

v5:
- Remove the "power save queue" API and replace it with mechanism suggested 
by
  Konstantin

v3:
- Move the list of supported NICs to NIC feature table

v2:
- Use a TAILQ for queues instead of a static array
- Address feedback from Konstantin
- Add additional checks for stopped queues

 doc/guides/nics/features.rst   |  10 +
 doc/guides/prog_guide/power_man.rst|  69 ++--
 doc/guides/rel_notes/release_21_08.rst |   3 +
 lib/power/rte_power_pmd_mgmt.c | 456 +++--
 4 files changed, 402 insertions(+), 136 deletions(-)

diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst
index 403c2b03a3..a96e12d155 100644
--- a/doc/guides/nics/features.rst
+++ b/doc/guides/nics/features.rst
@@ -912,6 +912,16 @@ Supports to get Rx/Tx packet burst mode information.
 * **[implements] eth_dev_ops**: ``rx_burst_mode_get``, ``tx_burst_mode_get``.
 * **[related] API**: ``rte_eth_rx_burst_mode_get()``, 
``rte_eth_tx_burst_mode_get()``.
 
+.. _nic_features_get_monitor_addr:
+
+PMD power management using monitor addresses
+
+
+Supports getting a monitoring condition to use together with Ethernet PMD power
+management (see :doc:`../prog_guide/power_man` for more details).
+
+* **[implements] eth_dev_ops**: ``get_monitor_addr``
+
 .. _nic_features_other:
 
 Other dev ops not represented by a Feature
diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index c70ae128ac..0e66878892 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -198,34 +198,45 @@ Ethernet PMD Power Management API
 Abstract
 
 
-Existing power management mechanisms require developers
-to change application design or change code to make use of it.
-The PMD power management API provides a convenient alternative
-by utilizing Ethernet PMD RX callbacks,
-and triggering power saving whenever empty poll count reaches a certain number.
-
-Monitor
-   This power saving scheme will put the CPU into optimized power state
-   and use the ``rte_power_monitor()`` function
-   to monitor the Ethernet PMD RX descriptor address,
-   and wake the CPU up whenever there's new traffic.
-
-Pause
-   This power saving scheme will avoid busy polling
-   by either entering power-optimized sleep state
-   with ``rte_power_pause()`` function,
-   or, if it's not available, use ``rte_pause()``.
-
-Frequency scaling
-   This power saving scheme will use ``librte_power`` library
-   functionality to scale the core frequency up/down
-   depending on traffic volume.
-
-.. note::
-
-   Currently, this power management API is limited to mandatory mapping
-   of 1 queue to 1 core (multiple queues are supported,
-   but they must be polled from different cores).
+Existing power management mechanisms require developers to change application
+design or change code to make use of it. The PMD power management API provides 
a
+convenient alternative by utilizing Ethernet PMD RX callbacks, and triggering
+power saving whenever empty poll count reaches a certain number.
+
+* Monitor
+   This power saving scheme will put the CPU into optimized power state and
+   monitor the Ethernet PMD RX descriptor address, waking the CPU up whenever
+   there's

[dpdk-dev] [PATCH v7 6/7] power: support monitoring multiple Rx queues

2021-07-07 Thread Anatoly Burakov

Use the new multi-monitor intrinsic to allow monitoring multiple ethdev
Rx queues while entering the energy efficient power state. The multi
version will be used unconditionally if supported, and the UMWAIT one
will only be used when multi-monitor is not supported by the hardware.

Signed-off-by: Anatoly Burakov 
---

Notes:
v6:
- Fix the missed feedback from v5

v4:
- Fix possible out of bounds access
- Added missing index increment

 doc/guides/prog_guide/power_man.rst | 15 --
 lib/power/rte_power_pmd_mgmt.c  | 82 -
 2 files changed, 90 insertions(+), 7 deletions(-)

diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index 0e66878892..e387d7811e 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -221,17 +221,22 @@ power saving whenever empty poll count reaches a certain 
number.
 The "monitor" mode is only supported in the following configurations and 
scenarios:
 
 * On Linux* x86_64, `rte_power_monitor()` requires WAITPKG instruction set 
being
-  supported by the CPU. Please refer to your platform documentation for further
-  information.
+  supported by the CPU, while `rte_power_monitor_multi()` requires WAITPKG and
+  RTM instruction sets being supported by the CPU. RTM instruction set may also
+  require booting the Linux with `tsx=on` command line parameter. Please refer
+  to your platform documentation for further information.
 
 * If ``rte_cpu_get_intrinsics_support()`` function indicates that
+  ``rte_power_monitor_multi()`` function is supported by the platform, then
+  monitoring multiple Ethernet Rx queues for traffic will be supported.
+
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that only
   ``rte_power_monitor()`` is supported by the platform, then monitoring will be
   limited to a mapping of 1 core 1 queue (thus, each Rx queue will have to be
   monitored from a different lcore).
 
-* If ``rte_cpu_get_intrinsics_support()`` function indicates that the
-  ``rte_power_monitor()`` function is not supported, then monitor mode will not
-  be supported.
+* If ``rte_cpu_get_intrinsics_support()`` function indicates that neither of 
the
+  two monitoring functions are supported, then monitor mode will not be 
supported.
 
 * Not all Ethernet drivers support monitoring, even if the underlying
   platform may support the necessary CPU instructions. Please refer to
diff --git a/lib/power/rte_power_pmd_mgmt.c b/lib/power/rte_power_pmd_mgmt.c
index ceaf386d2b..ba5971f827 100644
--- a/lib/power/rte_power_pmd_mgmt.c
+++ b/lib/power/rte_power_pmd_mgmt.c
@@ -126,6 +126,32 @@ queue_list_take(struct pmd_core_cfg *cfg, const union 
queue *q)
return found;
 }
 
+static inline int
+get_monitor_addresses(struct pmd_core_cfg *cfg,
+   struct rte_power_monitor_cond *pmc, size_t len)
+{
+   const struct queue_list_entry *qle;
+   size_t i = 0;
+   int ret;
+
+   TAILQ_FOREACH(qle, &cfg->head, next) {
+   const union queue *q = &qle->queue;
+   struct rte_power_monitor_cond *cur;
+
+   /* attempted out of bounds access */
+   if (i >= len) {
+   RTE_LOG(ERR, POWER, "Too many queues being 
monitored\n");
+   return -1;
+   }
+
+   cur = &pmc[i++];
+   ret = rte_eth_get_monitor_addr(q->portid, q->qid, cur);
+   if (ret < 0)
+   return ret;
+   }
+   return 0;
+}
+
 static void
 calc_tsc(void)
 {
@@ -211,6 +237,46 @@ lcore_can_sleep(struct pmd_core_cfg *cfg)
return true;
 }
 
+static uint16_t
+clb_multiwait(uint16_t port_id __rte_unused, uint16_t qidx __rte_unused,
+   struct rte_mbuf **pkts __rte_unused, uint16_t nb_rx,
+   uint16_t max_pkts __rte_unused, void *arg)
+{
+   const unsigned int lcore = rte_lcore_id();
+   struct queue_list_entry *queue_conf = arg;
+   struct pmd_core_cfg *lcore_conf;
+   const bool empty = nb_rx == 0;
+
+   lcore_conf = &lcore_cfgs[lcore];
+
+   /* early exit */
+   if (likely(!empty))
+   /* early exit */
+   queue_reset(lcore_conf, queue_conf);
+   else {
+   struct rte_power_monitor_cond pmc[lcore_conf->n_queues];
+   int ret;
+
+   /* can this queue sleep? */
+   if (!queue_can_sleep(lcore_conf, queue_conf))
+   return nb_rx;
+
+   /* can this lcore sleep? */
+   if (!lcore_can_sleep(lcore_conf))
+   return nb_rx;
+
+   /* gather all monitoring conditions */
+   ret = get_monitor_addresses(lcore_conf, pmc,
+   lcore_conf->n_queues);
+   if (ret < 0)
+   return nb_rx;
+
+

[dpdk-dev] [PATCH v7 7/7] l3fwd-power: support multiqueue in PMD pmgmt modes

2021-07-07 Thread Anatoly Burakov

Currently, l3fwd-power enforces the limitation of having one queue per
lcore. This is no longer necessary, so remove the limitation.

Signed-off-by: Anatoly Burakov 
---
 examples/l3fwd-power/main.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/examples/l3fwd-power/main.c b/examples/l3fwd-power/main.c
index f8dfed1634..52f56dc405 100644
--- a/examples/l3fwd-power/main.c
+++ b/examples/l3fwd-power/main.c
@@ -2723,12 +2723,6 @@ main(int argc, char **argv)
printf("\nInitializing rx queues on lcore %u ... ", lcore_id );
fflush(stdout);
 
-   /* PMD power management mode can only do 1 queue per core */
-   if (app_mode == APP_MODE_PMD_MGMT && qconf->n_rx_queue > 1) {
-   rte_exit(EXIT_FAILURE,
-   "In PMD power management mode, only one queue 
per lcore is allowed\n");
-   }
-
/* init RX queues */
for(queue = 0; queue < qconf->n_rx_queue; ++queue) {
struct rte_eth_rxconf rxq_conf;
-- 
2.25.1

[PATCH v1 1/1] malloc/mp: fix wait condition handling

2024-07-12 Thread Anatoly Burakov

From Coverity's point of view, it is theoretically possible to have an
infinite wait on a wait condition because while we do check for timeout,
we do not check whether the event we are waiting for has already
occurred by the time we get to the first cond_wait call (in this case,
the event is the memory request list entry's state being set to COMPLETE).

This can't really happen as the only time a wait condition is triggered
is when we are receiving a memory event (so the entry we are waiting on
cannot change before wait condition is triggered because it's protected
by a mutex), so either we receive an event and modify entry state, or we
exit wait on a timeout and do not care about request state. However, it's
better to keep coverity happy.

Coverity issue: 425709
Fixes: 07dcbfe0101f ("malloc: support multiprocess memory hotplug")
Cc: sta...@dpdk.org

Signed-off-by: Anatoly Burakov 
---
 lib/eal/common/malloc_mp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/eal/common/malloc_mp.c b/lib/eal/common/malloc_mp.c
index 2d39b0716f..9765277f5d 100644
--- a/lib/eal/common/malloc_mp.c
+++ b/lib/eal/common/malloc_mp.c
@@ -756,7 +756,8 @@ request_to_primary(struct malloc_mp_req *user_req)
do {
ret = pthread_cond_timedwait(&entry->cond,
&mp_request_list.lock, &ts);
-   } while (ret != 0 && ret != ETIMEDOUT);
+   } while ((ret != 0 && ret != ETIMEDOUT) &&
+   entry->state == REQ_STATE_ACTIVE);
 
if (entry->state != REQ_STATE_COMPLETE) {
EAL_LOG(ERR, "Request timed out");
-- 
2.43.0

[PATCH v1 1/1] net/ice: fix E830 PTP phy model

2024-07-23 Thread Anatoly Burakov

Currently, we manually set the PHY model in `ice_dev_init`, however we missed
adding a case for E830, so for E830 the initialization ends up calling E822
code instead. This results in an incorrect PHY model being set, which has
several downstream consequences for E830, ranging from a stray error
message when attempting to start the PHY timer to the inability to enable
timesync on E830 devices.

We could've fixed it by adding a case for E830, however there are several
other missing bits of initialization (such as `phy_ports` field). All of
this can be fixed by replacing manual setting of `phy_model` with a call
to `ice_ptp_init_phy_model()`, which calls into base code and initializes
the fields appropriately for all device types, including another option
that is missing from current implementation - ETH56G.

Fixes: c3bedb7114f2 ("net/ice/base: add E830 PTP initialization")

Signed-off-by: Anatoly Burakov 
---
 drivers/net/ice/ice_ethdev.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c
index 29509b4329..304f959b7e 100644
--- a/drivers/net/ice/ice_ethdev.c
+++ b/drivers/net/ice/ice_ethdev.c
@@ -2496,10 +2496,8 @@ ice_dev_init(struct rte_eth_dev *dev)
/* Initialize TM configuration */
ice_tm_conf_init(dev);
 
-   if (ice_is_e810(hw))
-   hw->phy_model = ICE_PHY_E810;
-   else
-   hw->phy_model = ICE_PHY_E822;
+   /* Initialize PHY model */
+   ice_ptp_init_phy_model(hw);
 
if (hw->phy_model == ICE_PHY_E822) {
ret = ice_start_phy_timer_e822(hw, hw->pf_id);
-- 
2.43.5

[RFC PATCH v1 0/1] Add Visual Studio Code configuration script

2024-07-26 Thread Anatoly Burakov

Lots of developers (myself included) use Visual Studio Code as their primary
IDE for DPDK development. I have been successfully using various incarnations
of this script internally to quickly set up my development trees whenever I
need a new configuration, so this script is being shared in hopes that it will
be useful both to new developers starting with DPDK, and to seasoned DPDK
developers who are already using Visual Studio Code. It makes starting working
on DPDK in Visual Studio Code so much easier!

Philosophy behind this script is as follows:

- The assumption is made that a developer will not be using wildly different
  configurations from build to build - usually, they build the same things,
  work with the same set of apps/drivers for a while, then switch to something
  else, at which point a new configuration is needed
- Some configurations I consider to be "common" are included: debug build, debug
  optimized build, release build with docs, and ASan build
  (feel free to make suggestions here!)
- By default, the script will suggest enabling test, testpmd, and helloworld
  example
- No drivers are being enabled by default - user needs to explicitly enable them
  (another option could be to leave things as default and build everything, but
  I rather prefer minimalistic builds as they're faster to compile, and it would
  be semantically weird to not have any drivers selected yet all of them being
  built)
- All parameters that can be adjusted by TUI are also available as command line
  arguments, so while user interaction is the default (using whiptail), it's
  actually not required and can be bypassed.
- I usually work as a local user not as root, so by default the script will
  attempt to use "gdbsudo" (a "sudo gdb $@" script in /usr/local/bin) for launch
  tasks, and stop if it is not available.

Currently, it is only possible to define custom per-build configurations, while
any "global" meson settings would have to involve editing the settings.json file.
This can be changed easily if required, but I've never needed this functionality.

Please feel free to make any suggestions!

Anatoly Burakov (1):
  devtools: add vscode configuration generator

 devtools/gen-vscode-config.py | 640 ++
 1 file changed, 640 insertions(+)
 create mode 100755 devtools/gen-vscode-config.py

-- 
2.43.5

[RFC PATCH v1 1/1] devtools: add vscode configuration generator

2024-07-26 Thread Anatoly Burakov

A lot of developers use Visual Studio Code as their primary IDE. This
script generates a configuration file for VSCode that sets up basic build
tasks, launch tasks, as well as C/C++ code analysis settings that will
take into account compile_commands.json that is automatically generated
by meson.

Files generated by script:
 - .vscode/settings.json: stores variables needed by other files
 - .vscode/tasks.json: defines build tasks
 - .vscode/launch.json: defines launch tasks
 - .vscode/c_cpp_properties.json: defines code analysis settings

The script uses a combination of globbing and meson file parsing to
discover available apps, examples, and drivers, and generates a
project-wide settings file, so that the user can later switch between
debug/release/etc. configurations while keeping their desired apps,
examples, and drivers, built by meson, and ensuring launch configurations
still work correctly whatever the configuration selected.

This script uses whiptail as TUI, which is expected to be universally
available as it is shipped by default on most major distributions.
However, the script is also designed to be scriptable and can be run
without user interaction, and have its configuration supplied from
command-line arguments.

Signed-off-by: Anatoly Burakov 
---
 devtools/gen-vscode-config.py | 640 ++
 1 file changed, 640 insertions(+)
 create mode 100755 devtools/gen-vscode-config.py

diff --git a/devtools/gen-vscode-config.py b/devtools/gen-vscode-config.py
new file mode 100755
index 00..0d291b6c17
--- /dev/null
+++ b/devtools/gen-vscode-config.py
@@ -0,0 +1,640 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2024 Intel Corporation
+#
+
+"""Visual Studio Code configuration generator script."""
+
+import os
+import json
+import argparse
+import fnmatch
+import shutil
+from typing import List, Dict, Tuple, Any
+from sys import exit as _exit, stderr
+from subprocess import run, CalledProcessError, PIPE
+from mesonbuild import mparser
+from mesonbuild.mesonlib import MesonException
+
+
+class DPDKBuildTask:
+"""A build task for DPDK"""
+
+def __init__(self, label: str, description: str, param: str):
+# label as it appears in build configuration
+self.label = label
+# description to be given in menu
+self.description = description
+# task-specific configuration parameters
+self.param = param
+
+def to_json_dict(self) -> Dict[str, Any]:
+"""Generate JSON dictionary for this task"""
+return {
+"label": f"Configure {self.label}",
+"detail": self.description,
+"type": "shell",
+"dependsOn": "Remove builddir",
+"command": f"meson setup ${{config:BUILDCONFIG}} {self.param} 
${{config:BUILDDIR}}",
+"problemMatcher": [],
+"group": "build"
+}
+
+
+class CmdlineCtx:
+"""POD class to set up command line parameters"""
+
+def __init__(self):
+self.use_ui = False
+self.use_gdbsudo = False
+self.build_dir: str = ""
+self.dpdk_dir: str = ""
+self.gdb_path: str = ""
+
+self.avail_configs: List[Tuple[str, str, str]] = []
+self.avail_apps: List[str] = []
+self.avail_examples: List[str] = []
+self.avail_drivers: List[str] = []
+
+self.enabled_configs: List[Tuple[str, str, str]] = []
+self.enabled_apps: List[str] = []
+self.enabled_examples: List[str] = []
+self.enabled_drivers: List[str] = []
+
+self.driver_dep_map: Dict[str, List[str]] = {}
+
+
+class DPDKLaunchTask:
+"""A launch task for DPDK"""
+
+def __init__(self, label: str, exe: str, gdb_path: str):
+# label as it appears in launch configuration
+self.label = label
+# path to executable
+self.exe = exe
+self.gdb_path = gdb_path
+
+def to_json_dict(self) -> Dict[str, Any]:
+"""Generate JSON dictionary for this task"""
+return {
+"name": f"Run {self.label}",
+"type": "cppdbg",
+"request": "launch",
+"program": f"${{config:BUILDDIR}}/{self.exe}",
+"args": [],
+"stopAtEntry": False,
+"cwd": "${workspaceFolder}",
+"externalConsole": False,
+"preLaunchTask": "Build",
+"MIMode": "gdb",
+"miDebuggerPath": self.gdb_p

[RFC PATCH v2 0/1] Add Visual Studio Code configuration script

2024-07-29 Thread Anatoly Burakov

 against this option.

The third option is what I went with, with "smarter" being defined as follows:

* User is allowed to use dialogs to edit configuration that is generated from
  parsing wildcard: if user changed something, we cannot keep wildcard any more
  and we assume user knows what they're doing and is OK with explicitly
  requesting compilation for drivers they selected. So, if user didn't change
  anything in the dialog, we keep the wildcard, otherwise we expand it.

* If, by the time we get to resolving driver dependencies, we have wildcards in
  our driver param string, we see which drivers match this wildcard, and add
  wildcards for their dependencies. For example, if "net/ice" requires
  "common/iavf", and we have a "net/*" wildcard, one of the dependencies that we
  will add is "common/*". This behavior is, IMO, far better than the default one
  from our build system, where if a driver matches wildcard but cannot be built
  due to another internal dependency not being enabled (e.g. if "net/ice" is
  requested but "common/iavf" isn't requested), the build will fail to configure
  even though it would've been possible to build them otherwise

So, explicitly enabled drivers get explicit dependencies, implicitly enabled
drivers get implicit dependencies. The resulting build will be bigger than when
using meson command line directly, but if the user is worried about build size,
they can customize it via common meson parameters as well as being more granular
about requested apps/examples/drivers. Thus, we address the "simple" usecase of
"let's build everything by default", we handle some common use cases smarter
than we otherwise would have, and we allow user to be as in-depth as they want
by allowing to specify explicit meson command strings. I feel like this is a
good compromise between usability and robustness.

Please feel free to make any suggestions!

[1] https://code.visualstudio.com/docs/remote/ssh

Anatoly Burakov (1):
  devtools: add vscode configuration generator

 devtools/gen-vscode-config.py | 871 ++
 1 file changed, 871 insertions(+)
 create mode 100755 devtools/gen-vscode-config.py

-- 
2.43.5

[RFC PATCH v2 1/1] devtools: add vscode configuration generator

2024-07-29 Thread Anatoly Burakov

A lot of developers use Visual Studio Code as their primary IDE. This
script generates a configuration file for VSCode that sets up basic build
tasks, launch tasks, as well as C/C++ code analysis settings that will
take into account compile_commands.json that is automatically generated
by meson.

Files generated by script:
 - .vscode/settings.json: stores variables needed by other files
 - .vscode/tasks.json: defines build tasks
 - .vscode/launch.json: defines launch tasks
 - .vscode/c_cpp_properties.json: defines code analysis settings

The script uses a combination of globbing and meson file parsing to
discover available apps, examples, and drivers, and generates a
project-wide settings file, so that the user can later switch between
debug/release/etc. configurations while keeping their desired apps,
examples, and drivers, built by meson, and ensuring launch configurations
still work correctly whatever the configuration selected.

This script uses whiptail as TUI, which is expected to be universally
available as it is shipped by default on most major distributions.
However, the script is also designed to be scriptable and can be run
without user interaction, and have its configuration supplied from
command-line arguments.

Signed-off-by: Anatoly Burakov 
---

Notes:
RFCv1 -> RFCv2:

- No longer disable apps and drivers if nothing was specified via command line
  or TUI, and warn user about things being built by default
- Generate app launch configuration by default for when no apps are selected
- Added parameters:
  - --force to avoid overwriting existing config
  - --common-conf to specify global meson flags applicable to all configs
  - --gdbsudo/--no-gdbsudo to specify gdbsudo behavior
- Autodetect gdbsudo/gdb from UID
- Updated comments, error messages, fixed issues with user interaction
- Improved handling of wildcards and driver dependencies
- Fixed a few bugs in dependency detection due to incorrect parsing
- [Stephen] flake8 is happy

 devtools/gen-vscode-config.py | 871 ++
 1 file changed, 871 insertions(+)
 create mode 100755 devtools/gen-vscode-config.py

diff --git a/devtools/gen-vscode-config.py b/devtools/gen-vscode-config.py
new file mode 100755
index 00..f0d6044c1b
--- /dev/null
+++ b/devtools/gen-vscode-config.py
@@ -0,0 +1,871 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2024 Intel Corporation
+#
+
+"""Visual Studio Code configuration generator script."""
+
+import os
+import json
+import argparse
+import fnmatch
+import shutil
+from typing import List, Dict, Tuple, Any
+from sys import exit as _exit, stderr
+from subprocess import run, CalledProcessError, PIPE
+from mesonbuild import mparser
+from mesonbuild.mesonlib import MesonException
+
+
+class DPDKBuildTask:
+"""A build task for DPDK"""
+
+def __init__(self, label: str, description: str, param: str):
+# label as it appears in build configuration
+self.label = label
+# description to be given in menu
+self.description = description
+# task-specific configuration parameters
+self.param = param
+
+def to_json_dict(self) -> Dict[str, Any]:
+"""Generate JSON dictionary for this task"""
+return {
+"label": f"Configure {self.label}",
+"detail": self.description,
+"type": "shell",
+"dependsOn": "Remove builddir",
+# take configuration from settings.json using config: namespace
+"command": f"meson setup ${{config:BUILDCONFIG}} " \
+   f"{self.param} ${{config:BUILDDIR}}",
+"problemMatcher": [],
+"group": "build"
+}
+
+
+class DPDKLaunchTask:
+"""A launch task for DPDK"""
+
+def __init__(self, label: str, exe: str, gdb_path: str):
+# label as it appears in launch configuration
+self.label = label
+# path to executable
+self.exe = exe
+self.gdb_path = gdb_path
+
+def to_json_dict(self) -> Dict[str, Any]:
+"""Generate JSON dictionary for this task"""
+return {
+"name": f"Run {self.label}",
+"type": "cppdbg",
+"request": "launch",
+# take configuration from settings.json using config: namespace
+"program": f"${{config:BUILDDIR}}/{self.exe}",
+"args": [],
+"stopAtEntry": False,
+"cwd": "${workspaceFolder}",
+"externalConsole&

[RFC PATCH v3 0/1] Add Visual Studio Code configuration script

2024-07-31 Thread Anatoly Burakov

Lots of developers (myself included) use Visual Studio Code as their primary
IDE for DPDK development. I have been successfully using various incarnations of
this script internally to quickly set up my development trees whenever I need a
new configuration, so this script is being shared in hopes that it will be
useful both to new developers starting with DPDK, and to seasoned DPDK
developers who are already using Visual Studio Code. It makes starting working
on DPDK in Visual Studio Code so much easier!

** NOTE: Currently, only x86 configuration is generated as I have no way to test
   the code analysis configuration on any other platforms.

** NOTE 2: this is not for *Visual Studio* the Windows IDE, this is for *Visual
   Studio Code* the cross-platform code editor. Specifically, main target
   audience for this script is people who either run DPDK directly on their
   Linux machine, or who use Remote SSH functionality to connect to a remote
   Linux machine and set up VSCode build there. No other OS's are currently
   supported by the script.

(if you're unaware of what Remote SSH is, I highly suggest checking it out [1])

Philosophy behind this script is as follows:

- Any build directory created will automatically add itself to VSCode
  configuration (ignore mechanism for e.g. test-meson-build.sh is WIP)

- Launch configuration is created using `which gdb`, so by default non-root
  users will have to do additional system configuration for things to work

- All of the interactive stuff has now been taken out and is planned to be
  included in a separate set of scripts, so this script now concerns itself only
  with adding build/launch targets to user's configuration and not much else

Please feel free to make any suggestions!

[1] https://code.visualstudio.com/docs/remote/ssh

Anatoly Burakov (1):
  buildtools: add vscode configuration generator

 app/meson.build   |  12 +-
 buildtools/gen-vscode-conf.py | 442 ++
 buildtools/meson.build|   5 +
 examples/meson.build  |  13 +-
 meson.build   |  11 +
 5 files changed, 481 insertions(+), 2 deletions(-)
 create mode 100755 buildtools/gen-vscode-conf.py

-- 
2.43.5

[RFC PATCH v3 1/1] buildtools: add vscode configuration generator

2024-07-31 Thread Anatoly Burakov

A lot of developers use Visual Studio Code as their primary IDE. This
script will be called from within meson build process, and will generate
a configuration file for VSCode that sets up basic build tasks, launch
tasks, as well as C/C++ code analysis settings that will take into
account compile_commands.json that is automatically generated by meson.

Files generated by script:
 - .vscode/settings.json: stores variables needed by other files
 - .vscode/tasks.json: defines build tasks
 - .vscode/launch.json: defines launch tasks
 - .vscode/c_cpp_properties.json: defines code analysis settings

Multiple, as well as out-of-source-tree, build directories are supported,
and the script will generate separate configuration items for each build
directory created by user, tagging them for convenience.

Signed-off-by: Anatoly Burakov 
---

Notes:
RFCv2 -> RFCv3:
- Following feedback from Bruce, reworked to be minimal script run from meson
- Moved to buildtools
- Support for multiple build directories is now the default
- All targets are automatically added to all configuration files

RFCv1 -> RFCv2:

- No longer disable apps and drivers if nothing was specified via command line
  or TUI, and warn user about things being built by default
- Generate app launch configuration by default for when no apps are selected
- Added parameters:
  - --force to avoid overwriting existing config
  - --common-conf to specify global meson flags applicable to all configs
  - --gdbsudo/--no-gdbsudo to specify gdbsudo behavior
- Autodetect gdbsudo/gdb from UID
- Updated comments, error messages, fixed issues with user interaction
- Improved handling of wildcards and driver dependencies
- Fixed a few bugs in dependency detection due to incorrect parsing
- [Stephen] flake8 is happy

 app/meson.build   |  12 +-
 buildtools/gen-vscode-conf.py | 442 ++
 buildtools/meson.build|   5 +
 examples/meson.build  |  13 +-
 meson.build   |  11 +
 5 files changed, 481 insertions(+), 2 deletions(-)
 create mode 100755 buildtools/gen-vscode-conf.py

diff --git a/app/meson.build b/app/meson.build
index 5b2c80c7a1..cf0eda3d5f 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -114,7 +114,17 @@ foreach app:apps
 link_libs = dpdk_static_libraries + dpdk_drivers
 endif
 
-exec = executable('dpdk-' + name,
+# add to Visual Studio Code launch configuration
+exe_name = 'dpdk-' + name
+launch_path = join_paths(meson.current_build_dir(), exe_name)
+# we don't want to block the build if this command fails
+result = run_command(vscode_conf_gen_cmd + ['--launch', launch_path], 
check: false)
+if result.returncode() != 0
+warning('Failed to generate Visual Studio Code launch configuration 
for "' + name + '"')
+message(result.stderr())
+endif
+
+exec = executable(exe_name,
 sources,
 c_args: cflags,
 link_args: ldflags,
diff --git a/buildtools/gen-vscode-conf.py b/buildtools/gen-vscode-conf.py
new file mode 100755
index 00..fcc6469065
--- /dev/null
+++ b/buildtools/gen-vscode-conf.py
@@ -0,0 +1,442 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2024 Intel Corporation
+#
+
+"""Visual Studio Code configuration generator script."""
+
+# This script is meant to be run by meson build system to generate build and
+# launch commands for a specific build directory for Visual Studio Code IDE.
+#
+# Even though this script will generate settings/tasks/launch/code analysis
+# configuration for VSCode, we can't actually just regenerate the files,
+# because we want to support multiple build directories, as well as not
+# destroy any configuration user has created between runs of this script.
+# Therefore, we need some config file handling infrastructure. Luckily, VSCode
+# configs are all JSON, so we can just use json module to handle them. Of
+# course, we will lose any user comments in the files, but that's a small price
+# to pay for this sort of automation.
+#
+# Since this script will be run by meson, we can forego any parsing or anything
+# to do with the build system, and just rely on the fact that we get all of our
+# configuration from command-line.
+
+import argparse
+import ast
+import json
+import os
+import shutil
+from collections import OrderedDict
+from sys import stderr, exit as _exit
+from typing import List, Dict, Any
+
+
+class ConfigCtx:
+"""POD class to keep data associated with config."""
+def __init__(self, build_dir: str, source_dir: str, launch: List[str]):
+self.build_dir = build_dir
+self.source_dir = source_dir
+self.config_dir = os.path.join(source_dir, '.vscode')
+

[PATCH v1 1/2] usertools/cpu_layout: update coding style

2024-08-14 Thread Anatoly Burakov

Update coding style:

- make it PEP-484 compliant
- address all flake8, mypy etc. warnings
- use f-strings in place of old-style string interpolation
- refactor printing to make the code more readable

Signed-off-by: Anatoly Burakov 
---
 usertools/cpu_layout.py | 162 ++--
 1 file changed, 104 insertions(+), 58 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index 891b9238fa..843b29a134 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -3,62 +3,108 @@
 # Copyright(c) 2010-2014 Intel Corporation
 # Copyright(c) 2017 Cavium, Inc. All rights reserved.
 
-sockets = []
-cores = []
-core_map = {}
-base_path = "/sys/devices/system/cpu"
-fd = open("{}/kernel_max".format(base_path))
-max_cpus = int(fd.read())
-fd.close()
-for cpu in range(max_cpus + 1):
-try:
-fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu))
-except IOError:
-continue
-core = int(fd.read())
-fd.close()
-fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu))
-socket = int(fd.read())
-fd.close()
-if core not in cores:
-cores.append(core)
-if socket not in sockets:
-sockets.append(socket)
-key = (socket, core)
-if key not in core_map:
-core_map[key] = []
-core_map[key].append(cpu)
-
-print(format("=" * (47 + len(base_path
-print("Core and Socket Information (as reported by '{}')".format(base_path))
-print("{}\n".format("=" * (47 + len(base_path
-print("cores = ", cores)
-print("sockets = ", sockets)
-print("")
-
-max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1))
-max_thread_count = len(list(core_map.values())[0])
-max_core_map_len = (max_processor_len * max_thread_count)  \
-  + len(", ") * (max_thread_count - 1) \
-  + len('[]') + len('Socket ')
-max_core_id_len = len(str(max(cores)))
-
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket '))
-print(output)
-
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " ".ljust(max_core_map_len)
-output += " "
-print(output)
-
-for c in cores:
-output = "Core %s" % str(c).ljust(max_core_id_len)
-for s in sockets:
-if (s, c) in core_map:
-output += " " + str(core_map[(s, c)]).ljust(max_core_map_len)
+from typing import List, Set, Dict, Tuple
+
+
+def _range_expand(rstr: str) -> List[int]:
+"""Expand a range string into a list of integers."""
+# 0,1-3 => [0, 1-3]
+ranges = rstr.split(",")
+valset: List[int] = []
+for r in ranges:
+# 1-3 => [1, 2, 3]
+if "-" in r:
+start, end = r.split("-")
+valset.extend(range(int(start), int(end) + 1))
 else:
-output += " " * (max_core_map_len + 1)
-print(output)
+valset.append(int(r))
+return valset
+
+
+def _read_sysfs(path: str) -> str:
+with open(path) as fd:
+return fd.read().strip()
+
+
+def _print_row(row: Tuple[str, ...], col_widths: List[int]) -> None:
+first, *rest = row
+w_first, *w_rest = col_widths
+first_end = " " * 4
+rest_end = " " * 10
+
+print(first.ljust(w_first), end=first_end)
+for cell, width in zip(rest, w_rest):
+print(cell.rjust(width), end=rest_end)
+print()
+
+
+def _print_section(heading: str) -> None:
+sep = "=" * len(heading)
+print(sep)
+print(heading)
+print(sep)
+print()
+
+
+def _main() -> None:
+sockets_s: Set[int] = set()
+cores_s: Set[int] = set()
+core_map: Dict[Tuple[int, int], List[int]] = {}
+base_path = "/sys/devices/system/cpu"
+
+cpus = _range_expand(_read_sysfs(f"{base_path}/online"))
+
+for cpu in cpus:
+lcore_base = f"{base_path}/cpu{cpu}"
+core = int(_read_sysfs(f"{lcore_base}/topology/core_id"))
+socket = int(_read_sysfs(f"{lcore_base}/topology/physical_package_id"))
+
+cores_s.add(core)
+sockets_s.add(socket)
+key = (socket, core)
+core_map.setdefault(key, [])
+core_map[key].append(cpu)
+
+cores = sorted(cores_s)
+sockets = sorted(sockets_s)
+
+_print_section("Core and Socket Information "
+   f"(as reported by '{base_path}')")
+
+print("cores = ", cores)
+print("sockets = ", sockets)
+print()
+
+# Core, [Socket, Socket, ...]
+heading_strs =

[PATCH v1 2/2] usertools/cpu_layout: print out NUMA nodes

2024-08-14 Thread Anatoly Burakov

In traditional NUMA case, NUMA nodes and physical sockets were used
interchangeably, but there are cases where there can be multiple NUMA
nodes per socket, as well as all CPUs being assigned NUMA node 0 even in
cases of multiple sockets. Use sysfs to print out NUMA information.

Signed-off-by: Anatoly Burakov 
---
 usertools/cpu_layout.py | 35 ++-
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index 843b29a134..be89909464 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -4,6 +4,7 @@
 # Copyright(c) 2017 Cavium, Inc. All rights reserved.
 
 from typing import List, Set, Dict, Tuple
+import glob
 
 
 def _range_expand(rstr: str) -> List[int]:
@@ -26,11 +27,19 @@ def _read_sysfs(path: str) -> str:
 return fd.read().strip()
 
 
+def _read_numa_node(base: str) -> int:
+node_glob = f"{base}/node*"
+node_dirs = glob.glob(node_glob)
+if not node_dirs:
+return 0  # default to node 0
+return int(node_dirs[0].split("node")[1])
+
+
 def _print_row(row: Tuple[str, ...], col_widths: List[int]) -> None:
 first, *rest = row
 w_first, *w_rest = col_widths
 first_end = " " * 4
-rest_end = " " * 10
+rest_end = " " * 4
 
 print(first.ljust(w_first), end=first_end)
 for cell, width in zip(rest, w_rest):
@@ -50,6 +59,7 @@ def _main() -> None:
 sockets_s: Set[int] = set()
 cores_s: Set[int] = set()
 core_map: Dict[Tuple[int, int], List[int]] = {}
+numa_map: Dict[int, int] = {}
 base_path = "/sys/devices/system/cpu"
 
 cpus = _range_expand(_read_sysfs(f"{base_path}/online"))
@@ -58,12 +68,14 @@ def _main() -> None:
 lcore_base = f"{base_path}/cpu{cpu}"
 core = int(_read_sysfs(f"{lcore_base}/topology/core_id"))
 socket = int(_read_sysfs(f"{lcore_base}/topology/physical_package_id"))
+node = _read_numa_node(lcore_base)
 
 cores_s.add(core)
 sockets_s.add(socket)
 key = (socket, core)
 core_map.setdefault(key, [])
 core_map[key].append(cpu)
+numa_map[cpu] = node
 
 cores = sorted(cores_s)
 sockets = sorted(sockets_s)
@@ -73,24 +85,37 @@ def _main() -> None:
 
 print("cores = ", cores)
 print("sockets = ", sockets)
+print("numa = ", sorted(set(numa_map.values(
 print()
 
-# Core, [Socket, Socket, ...]
-heading_strs = "", *[f"Socket {s}" for s in sockets]
+# Core, [NUMA, Socket, NUMA, Socket, ...]
+heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")]
 sep_strs = tuple("-" * len(hstr) for hstr in heading_strs)
 rows: List[Tuple[str, ...]] = []
 
+prev_numa = None
 for c in cores:
 # Core,
 row: Tuple[str, ...] = (f"Core {c}",)
 
-# [lcores, lcores, ...]
+# assume NUMA changes symmetrically
+first_lcore = core_map[(0, c)][0]
+cur_numa = numa_map[first_lcore]
+numa_changed = prev_numa != cur_numa
+prev_numa = cur_numa
+
+# [NUMA, lcores, NUMA, lcores, ...]
 for s in sockets:
 try:
 lcores = core_map[(s, c)]
+numa = numa_map[lcores[0]]
+if numa_changed:
+row += (f"NUMA {numa}",)
+else:
+row += ("",)
 row += (f"{lcores}",)
 except KeyError:
-row += ("",)
+row += ("", "")
 rows += [row]
 
 # find max widths for each column, including header and rows
-- 
2.43.5

[PATCH v2 1/4] usertools/cpu_layout: update coding style

2024-08-16 Thread Anatoly Burakov

Update coding style:

- make it PEP-484 compliant
- address all flake8, mypy etc. warnings
- use f-strings in place of old-style string interpolation
- refactor printing to make the code more readable

Signed-off-by: Anatoly Burakov 
---
 usertools/cpu_layout.py | 162 ++--
 1 file changed, 104 insertions(+), 58 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index 891b9238fa..be86f06938 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -3,62 +3,108 @@
 # Copyright(c) 2010-2014 Intel Corporation
 # Copyright(c) 2017 Cavium, Inc. All rights reserved.
 
-sockets = []
-cores = []
-core_map = {}
-base_path = "/sys/devices/system/cpu"
-fd = open("{}/kernel_max".format(base_path))
-max_cpus = int(fd.read())
-fd.close()
-for cpu in range(max_cpus + 1):
-try:
-fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu))
-except IOError:
-continue
-core = int(fd.read())
-fd.close()
-fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu))
-socket = int(fd.read())
-fd.close()
-if core not in cores:
-cores.append(core)
-if socket not in sockets:
-sockets.append(socket)
-key = (socket, core)
-if key not in core_map:
-core_map[key] = []
-core_map[key].append(cpu)
-
-print(format("=" * (47 + len(base_path
-print("Core and Socket Information (as reported by '{}')".format(base_path))
-print("{}\n".format("=" * (47 + len(base_path
-print("cores = ", cores)
-print("sockets = ", sockets)
-print("")
-
-max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1))
-max_thread_count = len(list(core_map.values())[0])
-max_core_map_len = (max_processor_len * max_thread_count)  \
-  + len(", ") * (max_thread_count - 1) \
-  + len('[]') + len('Socket ')
-max_core_id_len = len(str(max(cores)))
-
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket '))
-print(output)
-
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " ".ljust(max_core_map_len)
-output += " "
-print(output)
-
-for c in cores:
-output = "Core %s" % str(c).ljust(max_core_id_len)
-for s in sockets:
-if (s, c) in core_map:
-output += " " + str(core_map[(s, c)]).ljust(max_core_map_len)
+from typing import List, Set, Dict, Tuple
+
+
+def _range_expand(rstr: str) -> List[int]:
+"""Expand a range string into a list of integers."""
+# 0,1-3 => [0, 1-3]
+ranges = rstr.split(",")
+valset: List[int] = []
+for r in ranges:
+# 1-3 => [1, 2, 3]
+if "-" in r:
+start, end = r.split("-")
+valset.extend(range(int(start), int(end) + 1))
 else:
-output += " " * (max_core_map_len + 1)
-print(output)
+valset.append(int(r))
+return valset
+
+
+def _read_sysfs(path: str) -> str:
+with open(path, encoding="utf-8") as fd:
+return fd.read().strip()
+
+
+def _print_row(row: Tuple[str, ...], col_widths: List[int]) -> None:
+first, *rest = row
+w_first, *w_rest = col_widths
+first_end = " " * 4
+rest_end = " " * 10
+
+print(first.ljust(w_first), end=first_end)
+for cell, width in zip(rest, w_rest):
+print(cell.rjust(width), end=rest_end)
+print()
+
+
+def _print_section(heading: str) -> None:
+sep = "=" * len(heading)
+print(sep)
+print(heading)
+print(sep)
+print()
+
+
+def _main() -> None:
+sockets_s: Set[int] = set()
+cores_s: Set[int] = set()
+core_map: Dict[Tuple[int, int], List[int]] = {}
+base_path = "/sys/devices/system/cpu"
+
+cpus = _range_expand(_read_sysfs(f"{base_path}/online"))
+
+for cpu in cpus:
+lcore_base = f"{base_path}/cpu{cpu}"
+core = int(_read_sysfs(f"{lcore_base}/topology/core_id"))
+socket = int(_read_sysfs(f"{lcore_base}/topology/physical_package_id"))
+
+cores_s.add(core)
+sockets_s.add(socket)
+key = (socket, core)
+core_map.setdefault(key, [])
+core_map[key].append(cpu)
+
+cores = sorted(cores_s)
+sockets = sorted(sockets_s)
+
+_print_section("Core and Socket Information "
+   f"(as reported by '{base_path}')")
+
+print("cores = ", cores)
+print("sockets = ", sockets)
+print()
+
+# Core,

[PATCH v2 2/4] usertools/cpu_layout: print out NUMA nodes

2024-08-16 Thread Anatoly Burakov

In traditional NUMA case, NUMA nodes and physical sockets were used
interchangeably, but there are cases where there can be multiple NUMA
nodes per socket, as well as all CPUs being assigned NUMA node 0 even in
cases of multiple sockets. Use sysfs to print out NUMA information.

Signed-off-by: Anatoly Burakov 
---
 usertools/cpu_layout.py | 35 ++-
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index be86f06938..e43bdbf343 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -4,6 +4,7 @@
 # Copyright(c) 2017 Cavium, Inc. All rights reserved.
 
 from typing import List, Set, Dict, Tuple
+import glob
 
 
 def _range_expand(rstr: str) -> List[int]:
@@ -26,11 +27,19 @@ def _read_sysfs(path: str) -> str:
 return fd.read().strip()
 
 
+def _read_numa_node(base: str) -> int:
+node_glob = f"{base}/node*"
+node_dirs = glob.glob(node_glob)
+if not node_dirs:
+return 0  # default to node 0
+return int(node_dirs[0].split("node")[1])
+
+
 def _print_row(row: Tuple[str, ...], col_widths: List[int]) -> None:
 first, *rest = row
 w_first, *w_rest = col_widths
 first_end = " " * 4
-rest_end = " " * 10
+rest_end = " " * 4
 
 print(first.ljust(w_first), end=first_end)
 for cell, width in zip(rest, w_rest):
@@ -50,6 +59,7 @@ def _main() -> None:
 sockets_s: Set[int] = set()
 cores_s: Set[int] = set()
 core_map: Dict[Tuple[int, int], List[int]] = {}
+numa_map: Dict[int, int] = {}
 base_path = "/sys/devices/system/cpu"
 
 cpus = _range_expand(_read_sysfs(f"{base_path}/online"))
@@ -58,12 +68,14 @@ def _main() -> None:
 lcore_base = f"{base_path}/cpu{cpu}"
 core = int(_read_sysfs(f"{lcore_base}/topology/core_id"))
 socket = int(_read_sysfs(f"{lcore_base}/topology/physical_package_id"))
+node = _read_numa_node(lcore_base)
 
 cores_s.add(core)
 sockets_s.add(socket)
 key = (socket, core)
 core_map.setdefault(key, [])
 core_map[key].append(cpu)
+numa_map[cpu] = node
 
 cores = sorted(cores_s)
 sockets = sorted(sockets_s)
@@ -73,24 +85,37 @@ def _main() -> None:
 
 print("cores = ", cores)
 print("sockets = ", sockets)
+print("numa = ", sorted(set(numa_map.values())))
 print()
 
-# Core, [Socket, Socket, ...]
-heading_strs = "", *[f"Socket {s}" for s in sockets]
+# Core, [NUMA, Socket, NUMA, Socket, ...]
+heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")]
 sep_strs = tuple("-" * len(hstr) for hstr in heading_strs)
 rows: List[Tuple[str, ...]] = []
 
+prev_numa = None
 for c in cores:
 # Core,
 row: Tuple[str, ...] = (f"Core {c}",)
 
-# [lcores, lcores, ...]
+# assume NUMA changes symmetrically
+first_lcore = core_map[(0, c)][0]
+cur_numa = numa_map[first_lcore]
+numa_changed = prev_numa != cur_numa
+prev_numa = cur_numa
+
+# [NUMA, lcores, NUMA, lcores, ...]
 for s in sockets:
 try:
 lcores = core_map[(s, c)]
+numa = numa_map[lcores[0]]
+if numa_changed:
+row += (f"NUMA {numa}",)
+else:
+row += ("",)
 row += (str(lcores),)
 except KeyError:
-row += ("",)
+row += ("", "")
 rows += [row]
 
 # find max widths for each column, including header and rows
-- 
2.43.5

[PATCH v2 3/4] usertools/dpdk-hugepages.py: sort by NUMA node

2024-08-16 Thread Anatoly Burakov

Currently, the list of per-NUMA-node hugepages is displayed in glob order,
which can be arbitrary. Fix this by sorting the glob results by NUMA node
number.

Signed-off-by: Anatoly Burakov 
---
 usertools/dpdk-hugepages.py | 40 ++---
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/usertools/dpdk-hugepages.py b/usertools/dpdk-hugepages.py
index bf2575ba36..54232ddf22 100755
--- a/usertools/dpdk-hugepages.py
+++ b/usertools/dpdk-hugepages.py
@@ -74,21 +74,37 @@ def set_hugepages(path, reqpages):
  gotpages, reqpages, filename))
 
 
+def get_numa_pages_node(node):
+'''Read list of hugepage reservations on specific NUMA node'''
+hp_path = f'/sys/devices/system/node/node{node}/hugepages'
+if not os.path.exists(hp_path):
+return
+res = []
+for pg_sz_dir in os.listdir(hp_path):
+pg_sz = int(pg_sz_dir[10:-2])
+nr_pages = get_hugepages(f'{hp_path}/{pg_sz_dir}')
+if nr_pages > 0:
+pg_sz_str = fmt_memsize(pg_sz)
+total_sz_str = fmt_memsize(nr_pages * pg_sz)
+res += [(nr_pages, pg_sz_str, total_sz_str)]
+else:
+res += [(0, None, None)]
+return res
+
+
 def show_numa_pages():
 '''Show huge page reservations on Numa system'''
+# get list of NUMA nodes and sort them by integer order
 print('Node Pages Size Total')
-for numa_path in glob.glob('/sys/devices/system/node/node*'):
-node = numa_path[29:]  # slice after /sys/devices/system/node/node
-path = numa_path + '/hugepages'
-if not os.path.exists(path):
-continue
-for hdir in os.listdir(path):
-pages = get_hugepages(path + '/' + hdir)
-if pages > 0:
-kb = int(hdir[10:-2])  # slice out of hugepages-NNNkB
-print('{:<4} {:<5} {:<6} {}'.format(node, pages,
-fmt_memsize(kb),
-fmt_memsize(pages * kb)))
+nodes = sorted(int(node[29:])
+   for node in glob.glob('/sys/devices/system/node/node*'))
+for node in nodes:
+pg_sz_data = get_numa_pages_node(node)
+for nr_pages, pg_sz, total_sz in pg_sz_data:
+if not nr_pages:
+continue
+print('{:<4} {:<5} {:<6} {}'
+  .format(node, nr_pages, pg_sz, total_sz))
 
 
 def show_non_numa_pages():
-- 
2.43.5

[PATCH v2 4/4] usertools/dpdk-devbind: print NUMA node

2024-08-16 Thread Anatoly Burakov

Currently, devbind does not print out any NUMA information, which makes it
non-trivial to figure out which NUMA node a device belongs to. Add printouts
for NUMA information if NUMA support is enabled on the system.

Signed-off-by: Anatoly Burakov 
---
 usertools/dpdk-devbind.py | 27 +++
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py
index b276e8efc8..c0611a501d 100755
--- a/usertools/dpdk-devbind.py
+++ b/usertools/dpdk-devbind.py
@@ -110,6 +110,11 @@
 args = []
 
 
+# check if this system has NUMA support
+def is_numa():
+return os.path.exists('/sys/devices/system/node')
+
+
 # check if a specific kernel module is loaded
 def module_is_loaded(module):
 global loaded_modules
@@ -579,18 +584,24 @@ def show_device_status(devices_type, device_name, if_field=False):
 
 # print each category separately, so we can clearly see what's used by DPDK
 if dpdk_drv:
+extra_param = "drv=%(Driver_str)s unused=%(Module_str)s"
+if is_numa():
+extra_param = "numa_node=%(NUMANode)s " + extra_param
 display_devices("%s devices using DPDK-compatible driver" % device_name,
-dpdk_drv, "drv=%(Driver_str)s unused=%(Module_str)s")
+dpdk_drv, extra_param)
 if kernel_drv:
-if_text = ""
+extra_param = "drv=%(Driver_str)s unused=%(Module_str)s"
 if if_field:
-if_text = "if=%(Interface)s "
-display_devices("%s devices using kernel driver" % device_name, kernel_drv,
-if_text + "drv=%(Driver_str)s "
-"unused=%(Module_str)s %(Active)s")
+extra_param = "if=%(Interface)s " + extra_param
+if is_numa():
+extra_param = "numa_node=%(NUMANode)s " + extra_param
+display_devices("%s devices using kernel driver" % device_name,
+kernel_drv, extra_param)
 if no_drv:
-display_devices("Other %s devices" % device_name, no_drv,
-"unused=%(Module_str)s")
+extra_param = "unused=%(Module_str)s"
+if is_numa():
+extra_param = "numa_node=%(NUMANode)s " + extra_param
+display_devices("Other %s devices" % device_name, no_drv, extra_param)
 
 
 def show_status():
-- 
2.43.5

[PATCH v3 1/4] usertools/cpu_layout: update coding style

2024-08-20 Thread Anatoly Burakov

Update coding style:

- make it PEP-484 compliant
- address all flake8, mypy etc. warnings
- use f-strings in place of old-style string interpolation
- refactor printing to make the code more readable
- read valid CPU IDs from "online" sysfs node

Signed-off-by: Anatoly Burakov 
---

Notes:
v1,v2 -> v3:
- Import typing as T instead of individual types

 usertools/cpu_layout.py | 162 ++--
 1 file changed, 107 insertions(+), 55 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index 891b9238fa..1c255ff1a1 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -3,62 +3,114 @@
 # Copyright(c) 2010-2014 Intel Corporation
 # Copyright(c) 2017 Cavium, Inc. All rights reserved.
 
-sockets = []
-cores = []
-core_map = {}
-base_path = "/sys/devices/system/cpu"
-fd = open("{}/kernel_max".format(base_path))
-max_cpus = int(fd.read())
-fd.close()
-for cpu in range(max_cpus + 1):
-try:
-fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu))
-except IOError:
-continue
-core = int(fd.read())
-fd.close()
-fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu))
-socket = int(fd.read())
-fd.close()
-if core not in cores:
-cores.append(core)
-if socket not in sockets:
-sockets.append(socket)
-key = (socket, core)
-if key not in core_map:
-core_map[key] = []
-core_map[key].append(cpu)
+"""Display CPU topology information."""
 
-print(format("=" * (47 + len(base_path))))
-print("Core and Socket Information (as reported by '{}')".format(base_path))
-print("{}\n".format("=" * (47 + len(base_path))))
-print("cores = ", cores)
-print("sockets = ", sockets)
-print("")
+import typing as T
 
-max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1))
-max_thread_count = len(list(core_map.values())[0])
-max_core_map_len = (max_processor_len * max_thread_count)  \
-  + len(", ") * (max_thread_count - 1) \
-  + len('[]') + len('Socket ')
-max_core_id_len = len(str(max(cores)))
 
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket '))
-print(output)
-
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " ".ljust(max_core_map_len)
-output += " "
-print(output)
-
-for c in cores:
-output = "Core %s" % str(c).ljust(max_core_id_len)
-for s in sockets:
-if (s, c) in core_map:
-output += " " + str(core_map[(s, c)]).ljust(max_core_map_len)
+def range_expand(rstr: str) -> T.List[int]:
+"""Expand a range string into a list of integers."""
+# 0,1-3 => [0, 1-3]
+ranges = rstr.split(",")
+valset: T.List[int] = []
+for r in ranges:
+# 1-3 => [1, 2, 3]
+if "-" in r:
+start, end = r.split("-")
+valset.extend(range(int(start), int(end) + 1))
 else:
-output += " " * (max_core_map_len + 1)
-print(output)
+valset.append(int(r))
+return valset
+
+
+def read_sysfs(path: str) -> str:
+"""Read a sysfs file and return its contents."""
+with open(path, encoding="utf-8") as fd:
+return fd.read().strip()
+
+
+def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None:
+"""Print a row of a table with the given column widths."""
+first, *rest = row
+w_first, *w_rest = col_widths
+first_end = " " * 4
+rest_end = " " * 10
+
+print(first.ljust(w_first), end=first_end)
+for cell, width in zip(rest, w_rest):
+print(cell.rjust(width), end=rest_end)
+print()
+
+
+def print_section(heading: str) -> None:
+"""Print a section heading."""
+sep = "=" * len(heading)
+print(sep)
+print(heading)
+print(sep)
+print()
+
+
+def main() -> None:
+"""Print CPU topology information."""
+sockets_s: T.Set[int] = set()
+cores_s: T.Set[int] = set()
+core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {}
+base_path = "/sys/devices/system/cpu"
+
+cpus = range_expand(read_sysfs(f"{base_path}/online"))
+
+for cpu in cpus:
+lcore_base = f"{base_path}/cpu{cpu}"
+core = int(read_sysfs(f"{lcore_base}/topology/core_id"))
+socket = int(read_sysfs(f"{lcore_base}/topolog

[PATCH v3 2/4] usertools/cpu_layout: print out NUMA nodes

2024-08-20 Thread Anatoly Burakov

In the traditional NUMA case, NUMA nodes and physical sockets were used
interchangeably, but there are cases where there can be multiple NUMA
nodes per socket, as well as cases where all CPUs are assigned to NUMA
node 0 even with multiple sockets. Use sysfs to print out NUMA information.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2 -> v3:
- Sort imports alphabetically

 usertools/cpu_layout.py | 36 +++-
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index 1c255ff1a1..78b119d729 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -5,6 +5,7 @@
 
 """Display CPU topology information."""
 
+import glob
 import typing as T
 
 
@@ -29,12 +30,21 @@ def read_sysfs(path: str) -> str:
 return fd.read().strip()
 
 
+def read_numa_node(base: str) -> int:
+"""Read the NUMA node of a CPU."""
+node_glob = f"{base}/node*"
+node_dirs = glob.glob(node_glob)
+if not node_dirs:
+return 0  # default to node 0
+return int(node_dirs[0].split("node")[1])
+
+
 def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None:
 """Print a row of a table with the given column widths."""
 first, *rest = row
 w_first, *w_rest = col_widths
 first_end = " " * 4
-rest_end = " " * 10
+rest_end = " " * 4
 
 print(first.ljust(w_first), end=first_end)
 for cell, width in zip(rest, w_rest):
@@ -56,6 +66,7 @@ def main() -> None:
 sockets_s: T.Set[int] = set()
 cores_s: T.Set[int] = set()
 core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {}
+numa_map: T.Dict[int, int] = {}
 base_path = "/sys/devices/system/cpu"
 
 cpus = range_expand(read_sysfs(f"{base_path}/online"))
@@ -64,12 +75,14 @@ def main() -> None:
 lcore_base = f"{base_path}/cpu{cpu}"
 core = int(read_sysfs(f"{lcore_base}/topology/core_id"))
 socket = int(read_sysfs(f"{lcore_base}/topology/physical_package_id"))
+node = read_numa_node(lcore_base)
 
 cores_s.add(core)
 sockets_s.add(socket)
 key = (socket, core)
 core_map.setdefault(key, [])
 core_map[key].append(cpu)
+numa_map[cpu] = node
 
 cores = sorted(cores_s)
 sockets = sorted(sockets_s)
@@ -79,24 +92,37 @@ def main() -> None:
 
 print("cores = ", cores)
 print("sockets = ", sockets)
+print("numa = ", sorted(set(numa_map.values())))
 print()
 
-# Core, [Socket, Socket, ...]
-heading_strs = "", *[f"Socket {s}" for s in sockets]
+# Core, [NUMA, Socket, NUMA, Socket, ...]
+heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")]
 sep_strs = tuple("-" * len(hstr) for hstr in heading_strs)
 rows: T.List[T.Tuple[str, ...]] = []
 
+prev_numa = None
 for c in cores:
 # Core,
 row: T.Tuple[str, ...] = (f"Core {c}",)
 
-# [lcores, lcores, ...]
+# assume NUMA changes symmetrically
+first_lcore = core_map[(0, c)][0]
+cur_numa = numa_map[first_lcore]
+numa_changed = prev_numa != cur_numa
+prev_numa = cur_numa
+
+# [NUMA, lcores, NUMA, lcores, ...]
 for s in sockets:
 try:
 lcores = core_map[(s, c)]
+numa = numa_map[lcores[0]]
+if numa_changed:
+row += (f"NUMA {numa}",)
+else:
+row += ("",)
 row += (str(lcores),)
 except KeyError:
-row += ("",)
+row += ("", "")
 rows += [row]
 
 # find max widths for each column, including header and rows
-- 
2.43.5

[PATCH v3 3/4] usertools/dpdk-hugepages.py: update coding style

2024-08-20 Thread Anatoly Burakov

Update coding style:

- Make the code PEP-484 compliant
- Add more comments, improve readability, use f-strings everywhere
- Use quotes consistently
- Address all Python static analysis (e.g. mypy, pylint) warnings
- Improve error handling
- Refactor printing and sysfs/procfs access functions
- Sort output by NUMA node

Signed-off-by: Anatoly Burakov 
---

Notes:
v1 -> v2:
  - Added commit that sorted output by NUMA node
v2 -> v3:
  - Rewrite of the script as suggested by reviewers

 usertools/dpdk-hugepages.py | 456 +---
 1 file changed, 273 insertions(+), 183 deletions(-)

diff --git a/usertools/dpdk-hugepages.py b/usertools/dpdk-hugepages.py
index bf2575ba36..510822af60 100755
--- a/usertools/dpdk-hugepages.py
+++ b/usertools/dpdk-hugepages.py
@@ -1,167 +1,136 @@
 #! /usr/bin/env python3
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2020 Microsoft Corporation
-"""Script to query and setup huge pages for DPDK applications."""
+
+'''Script to query and setup huge pages for DPDK applications.'''
 
 import argparse
-import glob
 import os
 import re
+import subprocess
 import sys
+import typing as T
 from math import log2
 
 # Standard binary prefix
-BINARY_PREFIX = "KMG"
+BINARY_PREFIX = 'KMG'
 
 # systemd mount point for huge pages
-HUGE_MOUNT = "/dev/hugepages"
+HUGE_MOUNT = '/dev/hugepages'
+# default directory for non-NUMA huge pages
+NO_NUMA_HUGE_DIR = '/sys/kernel/mm/hugepages'
+# default base directory for NUMA nodes
+NUMA_NODE_BASE_DIR = '/sys/devices/system/node'
+# procfs paths
+MEMINFO_PATH = '/proc/meminfo'
+MOUNTS_PATH = '/proc/mounts'
 
 
-def fmt_memsize(kb):
+class HugepageMount:
+'''Mount operations for huge page filesystem.'''
+
+def __init__(self, path: str, mounted: bool):
+self.path = path
+# current mount status
+self.mounted = mounted
+
+def mount(self, pagesize_kb: int,
+  user: T.Optional[str], group: T.Optional[str]) -> None:
+'''Mount the huge TLB file system'''
+if self.mounted:
+return
+cmd = ['mount', '-t', 'hugetlbfs']
+cmd += ['-o', f'pagesize={pagesize_kb * 1024}']
+if user is not None:
+cmd += ['-o', f'uid={user}']
+if group is not None:
+cmd += ['-o', f'gid={group}']
+cmd += ['nodev', self.path]
+
+subprocess.run(cmd, check=True)
+self.mounted = True
+
+def unmount(self) -> None:
+'''Unmount the huge TLB file system (if mounted)'''
+if self.mounted:
+subprocess.run(['umount', self.path], check=True)
+self.mounted = False
+
+
+class HugepageRes:
+'''Huge page reserve operations. Can be NUMA-node-specific.'''
+
+def __init__(self, path: str, node: T.Optional[int] = None):
+self.path = path
+# if this is a per-NUMA node huge page dir, store the node number
+self.node = node
+self.valid_page_sizes = self._get_valid_page_sizes()
+
+def _get_valid_page_sizes(self) -> T.List[int]:
+'''Extract valid huge page sizes'''
+return [get_memsize(d.split('-')[1])
+for d in os.listdir(self.path)]
+
+def _nr_pages_path(self, sz: int) -> str:
+if sz not in self.valid_page_sizes:
+raise ValueError(f'Invalid page size {sz}. '
+ f'Valid sizes: {self.valid_page_sizes}')
+return os.path.join(self.path, f'hugepages-{sz}kB', 'nr_hugepages')
+
+def __getitem__(self, sz: int) -> int:
+'''Get current number of reserved pages of specified size'''
+with open(self._nr_pages_path(sz), encoding='utf-8') as f:
+return int(f.read())
+
+def __setitem__(self, sz: int, nr_pages: int) -> None:
+'''Set number of reserved pages of specified size'''
+with open(self._nr_pages_path(sz), 'w', encoding='utf-8') as f:
+f.write(f'{nr_pages}\n')
+
+
+def fmt_memsize(kb: int) -> str:
 '''Format memory size in kB into conventional format'''
 logk = int(log2(kb) / 10)
 suffix = BINARY_PREFIX[logk]
 unit = 2**(logk * 10)
-return '{}{}b'.format(int(kb / unit), suffix)
+return f'{int(kb / unit)}{suffix}b'
 
 
-def get_memsize(arg):
+def get_memsize(arg: str) -> int:
 '''Convert memory size with suffix to kB'''

[PATCH v3 4/4] usertools/dpdk-devbind: print NUMA node

2024-08-20 Thread Anatoly Burakov

Currently, devbind does not print out any NUMA information, which makes it
non-trivial to figure out which NUMA node a device belongs to. Add printouts
for NUMA information if NUMA support is enabled on the system.

Signed-off-by: Anatoly Burakov 
Acked-by: Robin Jarry 
---

Notes:
v1 -> v2:
- Added commit to print out NUMA information in devbind

 usertools/dpdk-devbind.py | 29 +
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py
index b276e8efc8..078e8c387b 100755
--- a/usertools/dpdk-devbind.py
+++ b/usertools/dpdk-devbind.py
@@ -110,6 +110,11 @@
 args = []
 
 
+# check if this system has NUMA support
+def is_numa():
+return os.path.exists('/sys/devices/system/node')
+
+
 # check if a specific kernel module is loaded
 def module_is_loaded(module):
 global loaded_modules
@@ -577,20 +582,28 @@ def show_device_status(devices_type, device_name, if_field=False):
 print("".join('=' * len(msg)))
 return
 
+print_numa = is_numa()
+
 # print each category separately, so we can clearly see what's used by DPDK
 if dpdk_drv:
+extra_param = "drv=%(Driver_str)s unused=%(Module_str)s"
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
 display_devices("%s devices using DPDK-compatible driver" % device_name,
-dpdk_drv, "drv=%(Driver_str)s unused=%(Module_str)s")
+dpdk_drv, extra_param)
 if kernel_drv:
-if_text = ""
+extra_param = "drv=%(Driver_str)s unused=%(Module_str)s"
 if if_field:
-if_text = "if=%(Interface)s "
-display_devices("%s devices using kernel driver" % device_name, kernel_drv,
-if_text + "drv=%(Driver_str)s "
-"unused=%(Module_str)s %(Active)s")
+extra_param = "if=%(Interface)s " + extra_param
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
+display_devices("%s devices using kernel driver" % device_name,
+kernel_drv, extra_param)
 if no_drv:
-display_devices("Other %s devices" % device_name, no_drv,
-"unused=%(Module_str)s")
+extra_param = "unused=%(Module_str)s"
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
+display_devices("Other %s devices" % device_name, no_drv, extra_param)
 
 
 def show_status():
-- 
2.43.5

[PATCH v4 2/4] usertools/cpu_layout: print out NUMA nodes

2024-08-21 Thread Anatoly Burakov

In the traditional NUMA case, NUMA nodes and physical sockets were used
interchangeably, but there are cases where there can be multiple NUMA
nodes per socket, as well as cases where all CPUs are assigned to NUMA
node 0 even with multiple sockets. Use sysfs to print out NUMA information.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2 -> v3:
- Sort imports alphabetically

 usertools/cpu_layout.py | 36 +++-
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index 8812ea286b..e4720e27db 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -5,6 +5,7 @@
 
 """Display CPU topology information."""
 
+import glob
 import typing as T
 
 
@@ -29,12 +30,21 @@ def read_sysfs(path: str) -> str:
 return fd.read().strip()
 
 
+def read_numa_node(base: str) -> int:
+"""Read the NUMA node of a CPU."""
+node_glob = f"{base}/node*"
+node_dirs = glob.glob(node_glob)
+if not node_dirs:
+return 0  # default to node 0
+return int(node_dirs[0].split("node")[1])
+
+
 def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None:
 """Print a row of a table with the given column widths."""
 first, *rest = row
 w_first, *w_rest = col_widths
 first_end = " " * 4
-rest_end = " " * 10
+rest_end = " " * 4
 
 print(first.ljust(w_first), end=first_end)
 for cell, width in zip(rest, w_rest):
@@ -56,6 +66,7 @@ def main() -> None:
 sockets_s: T.Set[int] = set()
 cores_s: T.Set[int] = set()
 core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {}
+numa_map: T.Dict[int, int] = {}
 base_path = "/sys/devices/system/cpu"
 
 cpus = range_expand(read_sysfs(f"{base_path}/online"))
@@ -64,12 +75,14 @@ def main() -> None:
 lcore_base = f"{base_path}/cpu{cpu}"
 core = int(read_sysfs(f"{lcore_base}/topology/core_id"))
 socket = int(read_sysfs(f"{lcore_base}/topology/physical_package_id"))
+node = read_numa_node(lcore_base)
 
 cores_s.add(core)
 sockets_s.add(socket)
 key = (socket, core)
 core_map.setdefault(key, [])
 core_map[key].append(cpu)
+numa_map[cpu] = node
 
 cores = sorted(cores_s)
 sockets = sorted(sockets_s)
@@ -80,24 +93,37 @@ def main() -> None:
 
 print("cores = ", cores)
 print("sockets = ", sockets)
+print("numa = ", sorted(set(numa_map.values())))
 print()
 
-# Core, [Socket, Socket, ...]
-heading_strs = "", *[f"Socket {s}" for s in sockets]
+# Core, [NUMA, Socket, NUMA, Socket, ...]
+heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")]
 sep_strs = tuple("-" * len(hstr) for hstr in heading_strs)
 rows: T.List[T.Tuple[str, ...]] = []
 
+prev_numa = None
 for c in cores:
 # Core,
 row: T.Tuple[str, ...] = (f"Core {c}",)
 
-# [lcores, lcores, ...]
+# assume NUMA changes symmetrically
+first_lcore = core_map[(0, c)][0]
+cur_numa = numa_map[first_lcore]
+numa_changed = prev_numa != cur_numa
+prev_numa = cur_numa
+
+# [NUMA, lcores, NUMA, lcores, ...]
 for s in sockets:
 try:
 lcores = core_map[(s, c)]
+numa = numa_map[lcores[0]]
+if numa_changed:
+row += (f"NUMA {numa}",)
+else:
+row += ("",)
 row += (str(lcores),)
 except KeyError:
-row += ("",)
+row += ("", "")
 rows += [row]
 
 # find max widths for each column, including header and rows
-- 
2.43.5

[PATCH v4 4/4] usertools/dpdk-devbind: print NUMA node

2024-08-21 Thread Anatoly Burakov

Currently, devbind does not print out any NUMA information, which makes it
non-trivial to figure out which NUMA node a device belongs to. Add printouts
for NUMA information if NUMA support is enabled on the system.

Signed-off-by: Anatoly Burakov 
Acked-by: Robin Jarry 
---

Notes:
v1 -> v2:
- Added commit to print out NUMA information in devbind

 usertools/dpdk-devbind.py | 29 +
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py
index b276e8efc8..078e8c387b 100755
--- a/usertools/dpdk-devbind.py
+++ b/usertools/dpdk-devbind.py
@@ -110,6 +110,11 @@
 args = []
 
 
+# check if this system has NUMA support
+def is_numa():
+return os.path.exists('/sys/devices/system/node')
+
+
 # check if a specific kernel module is loaded
 def module_is_loaded(module):
 global loaded_modules
@@ -577,20 +582,28 @@ def show_device_status(devices_type, device_name, if_field=False):
 print("".join('=' * len(msg)))
 return
 
+print_numa = is_numa()
+
 # print each category separately, so we can clearly see what's used by DPDK
 if dpdk_drv:
+extra_param = "drv=%(Driver_str)s unused=%(Module_str)s"
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
 display_devices("%s devices using DPDK-compatible driver" % device_name,
-dpdk_drv, "drv=%(Driver_str)s unused=%(Module_str)s")
+dpdk_drv, extra_param)
 if kernel_drv:
-if_text = ""
+extra_param = "drv=%(Driver_str)s unused=%(Module_str)s"
 if if_field:
-if_text = "if=%(Interface)s "
-display_devices("%s devices using kernel driver" % device_name, kernel_drv,
-if_text + "drv=%(Driver_str)s "
-"unused=%(Module_str)s %(Active)s")
+extra_param = "if=%(Interface)s " + extra_param
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
+display_devices("%s devices using kernel driver" % device_name,
+kernel_drv, extra_param)
 if no_drv:
-display_devices("Other %s devices" % device_name, no_drv,
-"unused=%(Module_str)s")
+extra_param = "unused=%(Module_str)s"
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
+display_devices("Other %s devices" % device_name, no_drv, extra_param)
 
 
 def show_status():
-- 
2.43.5

[PATCH v4 3/4] usertools/dpdk-hugepages.py: update coding style

2024-08-21 Thread Anatoly Burakov

Update coding style:

- Make the code PEP-484 compliant
- Add more comments, improve readability, use f-strings everywhere
- Address all Python static analysis (e.g. mypy, pylint) warnings
- Format code with Ruff
- Improve error handling
- Refactor printing and sysfs/procfs access functions
- Sort output by NUMA node

Signed-off-by: Anatoly Burakov 
Acked-by: Stephen Hemminger 
---

Notes:
v3 -> v4:
  - Format code with Ruff, line width 79 to avoid flake8 warnings
(Flake8 is by default configured with line width 79 on my system)
v2 -> v3:
  - Rewrite of the script as suggested by reviewers
v1 -> v2:
  - Added commit that sorted output by NUMA node

 usertools/dpdk-hugepages.py | 524 ++--
 1 file changed, 315 insertions(+), 209 deletions(-)

diff --git a/usertools/dpdk-hugepages.py b/usertools/dpdk-hugepages.py
index bf2575ba36..4c99682848 100755
--- a/usertools/dpdk-hugepages.py
+++ b/usertools/dpdk-hugepages.py
@@ -1,13 +1,15 @@
 #! /usr/bin/env python3
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2020 Microsoft Corporation
+
 """Script to query and setup huge pages for DPDK applications."""
 
 import argparse
-import glob
 import os
 import re
+import subprocess
 import sys
+import typing as T
 from math import log2
 
 # Standard binary prefix
@@ -15,194 +17,268 @@
 
 # systemd mount point for huge pages
 HUGE_MOUNT = "/dev/hugepages"
+# default directory for non-NUMA huge pages
+NO_NUMA_HUGE_DIR = "/sys/kernel/mm/hugepages"
+# default base directory for NUMA nodes
+NUMA_NODE_BASE_DIR = "/sys/devices/system/node"
+# procfs paths
+MEMINFO_PATH = "/proc/meminfo"
+MOUNTS_PATH = "/proc/mounts"
 
 
-def fmt_memsize(kb):
-'''Format memory size in kB into conventional format'''
+class HugepageMount:
+"""Mount operations for huge page filesystem."""
+
+def __init__(self, path: str, mounted: bool):
+self.path = path
+# current mount status
+self.mounted = mounted
+
+def mount(
+self, pagesize_kb: int, user: T.Optional[str], group: T.Optional[str]
+) -> None:
+"""Mount the huge TLB file system"""
+if self.mounted:
+return
+cmd = ["mount", "-t", "hugetlbfs"]
+cmd += ["-o", f"pagesize={pagesize_kb * 1024}"]
+if user is not None:
+cmd += ["-o", f"uid={user}"]
+if group is not None:
+cmd += ["-o", f"gid={group}"]
+cmd += ["nodev", self.path]
+
+subprocess.run(cmd, check=True)
+self.mounted = True
+
+def unmount(self) -> None:
+"""Unmount the huge TLB file system (if mounted)"""
+if self.mounted:
+subprocess.run(["umount", self.path], check=True)
+self.mounted = False
+
+
+class HugepageRes:
+"""Huge page reserve operations. Can be NUMA-node-specific."""
+
+def __init__(self, path: str, node: T.Optional[int] = None):
+self.path = path
+# if this is a per-NUMA node huge page dir, store the node number
+self.node = node
+self.valid_page_sizes = self._get_valid_page_sizes()
+
+def _get_valid_page_sizes(self) -> T.List[int]:
+"""Extract valid huge page sizes"""
+return [get_memsize(d.split("-")[1]) for d in os.listdir(self.path)]
+
+def _nr_pages_path(self, sz: int) -> str:
+if sz not in self.valid_page_sizes:
+raise ValueError(
+f"Invalid page size {sz}. "
+f"Valid sizes: {self.valid_page_sizes}"
+)
+return os.path.join(self.path, f"hugepages-{sz}kB", "nr_hugepages")
+
+def __getitem__(self, sz: int) -> int:
+"""Get current number of reserved pages of specified size"""
+with open(self._nr_pages_path(sz), encoding="utf-8") as f:
+return int(f.read())
+
+def __setitem__(self, sz: int, nr_pages: int) -> None:
+"""Set number of reserved pages of specified size"""
+with open(self._nr_pages_path(sz), "w", encoding="utf-8") as f:
+f.write(f"{nr_pages}\n")
+
+
+def fmt_memsize(kb: int) -> str:
+"""Format memory size in kB into conventional format"""
 logk = int(log2(kb) / 10)
 suffix = BINARY_PREFIX[logk]
-unit = 2**(logk * 10)
-return '{}{}b'.format(int(kb / unit), suffix)
+unit = 2 ** (logk * 10)
+return f"{int(kb / unit)}{suffix}b"

[PATCH v4 1/4] usertools/cpu_layout: update coding style

2024-08-21 Thread Anatoly Burakov

Update coding style:

- make it PEP-484 compliant
- address all flake8, mypy etc. warnings
- use f-strings in place of old-style string interpolation
- refactor printing to make the code more readable
- read valid CPU IDs from "online" sysfs node

Signed-off-by: Anatoly Burakov 
---

Notes:
v3->v4:
- Format with Ruff, line width 79

v1,v2 -> v3:
- Import typing as T instead of individual types

 usertools/cpu_layout.py | 163 ++--
 1 file changed, 108 insertions(+), 55 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index 891b9238fa..8812ea286b 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -3,62 +3,115 @@
 # Copyright(c) 2010-2014 Intel Corporation
 # Copyright(c) 2017 Cavium, Inc. All rights reserved.
 
-sockets = []
-cores = []
-core_map = {}
-base_path = "/sys/devices/system/cpu"
-fd = open("{}/kernel_max".format(base_path))
-max_cpus = int(fd.read())
-fd.close()
-for cpu in range(max_cpus + 1):
-try:
-fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu))
-except IOError:
-continue
-core = int(fd.read())
-fd.close()
-fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu))
-socket = int(fd.read())
-fd.close()
-if core not in cores:
-cores.append(core)
-if socket not in sockets:
-sockets.append(socket)
-key = (socket, core)
-if key not in core_map:
-core_map[key] = []
-core_map[key].append(cpu)
+"""Display CPU topology information."""
 
-print(format("=" * (47 + len(base_path))))
-print("Core and Socket Information (as reported by '{}')".format(base_path))
-print("{}\n".format("=" * (47 + len(base_path))))
-print("cores = ", cores)
-print("sockets = ", sockets)
-print("")
+import typing as T
 
-max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1))
-max_thread_count = len(list(core_map.values())[0])
-max_core_map_len = (max_processor_len * max_thread_count)  \
-  + len(", ") * (max_thread_count - 1) \
-  + len('[]') + len('Socket ')
-max_core_id_len = len(str(max(cores)))
 
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket '))
-print(output)
-
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " ".ljust(max_core_map_len)
-output += " "
-print(output)
-
-for c in cores:
-output = "Core %s" % str(c).ljust(max_core_id_len)
-for s in sockets:
-if (s, c) in core_map:
-output += " " + str(core_map[(s, c)]).ljust(max_core_map_len)
+def range_expand(rstr: str) -> T.List[int]:
+"""Expand a range string into a list of integers."""
+# 0,1-3 => [0, 1-3]
+ranges = rstr.split(",")
+valset: T.List[int] = []
+for r in ranges:
+# 1-3 => [1, 2, 3]
+if "-" in r:
+start, end = r.split("-")
+valset.extend(range(int(start), int(end) + 1))
 else:
-output += " " * (max_core_map_len + 1)
-print(output)
+valset.append(int(r))
+return valset
+
+
+def read_sysfs(path: str) -> str:
+"""Read a sysfs file and return its contents."""
+with open(path, encoding="utf-8") as fd:
+return fd.read().strip()
+
+
+def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None:
+"""Print a row of a table with the given column widths."""
+first, *rest = row
+w_first, *w_rest = col_widths
+first_end = " " * 4
+rest_end = " " * 10
+
+print(first.ljust(w_first), end=first_end)
+for cell, width in zip(rest, w_rest):
+print(cell.rjust(width), end=rest_end)
+print()
+
+
+def print_section(heading: str) -> None:
+"""Print a section heading."""
+sep = "=" * len(heading)
+print(sep)
+print(heading)
+print(sep)
+print()
+
+
+def main() -> None:
+"""Print CPU topology information."""
+sockets_s: T.Set[int] = set()
+cores_s: T.Set[int] = set()
+core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {}
+base_path = "/sys/devices/system/cpu"
+
+cpus = range_expand(read_sysfs(f"{base_path}/online"))
+
+for cpu in cpus:
+lcore_base = f"{base_path}/cpu{cpu}"
+core = int(read_sysfs(f"{lcore_base}/topology/core_id"))
+s

[PATCH v5 1/4] usertools/cpu_layout: update coding style

2024-08-21 Thread Anatoly Burakov

Update coding style:

- make it PEP-484 compliant
- format code with Ruff
- address all mypy etc. warnings
- use f-strings in place of old-style string interpolation
- refactor printing to make the code more readable
- read valid CPU IDs from the "online" sysfs node

Signed-off-by: Anatoly Burakov 
---

Notes:
v4->v5:
- Format with Ruff on default settings

v3->v4:
- Format with Ruff, line width 79

v1,v2 -> v3:
- Import typing as T instead of individual types

 usertools/cpu_layout.py | 161 ++--
 1 file changed, 106 insertions(+), 55 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index 891b9238fa..e133fb8ad3 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -3,62 +3,113 @@
 # Copyright(c) 2010-2014 Intel Corporation
 # Copyright(c) 2017 Cavium, Inc. All rights reserved.
 
-sockets = []
-cores = []
-core_map = {}
-base_path = "/sys/devices/system/cpu"
-fd = open("{}/kernel_max".format(base_path))
-max_cpus = int(fd.read())
-fd.close()
-for cpu in range(max_cpus + 1):
-try:
-fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu))
-except IOError:
-continue
-core = int(fd.read())
-fd.close()
-fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu))
-socket = int(fd.read())
-fd.close()
-if core not in cores:
-cores.append(core)
-if socket not in sockets:
-sockets.append(socket)
-key = (socket, core)
-if key not in core_map:
-core_map[key] = []
-core_map[key].append(cpu)
+"""Display CPU topology information."""
 
-print(format("=" * (47 + len(base_path))))
-print("Core and Socket Information (as reported by '{}')".format(base_path))
-print("{}\n".format("=" * (47 + len(base_path))))
-print("cores = ", cores)
-print("sockets = ", sockets)
-print("")
+import typing as T
 
-max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1))
-max_thread_count = len(list(core_map.values())[0])
-max_core_map_len = (max_processor_len * max_thread_count)  \
-  + len(", ") * (max_thread_count - 1) \
-  + len('[]') + len('Socket ')
-max_core_id_len = len(str(max(cores)))
 
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket '))
-print(output)
-
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " ".ljust(max_core_map_len)
-output += " "
-print(output)
-
-for c in cores:
-output = "Core %s" % str(c).ljust(max_core_id_len)
-for s in sockets:
-if (s, c) in core_map:
-output += " " + str(core_map[(s, c)]).ljust(max_core_map_len)
+def range_expand(rstr: str) -> T.List[int]:
+"""Expand a range string into a list of integers."""
+# 0,1-3 => [0, 1-3]
+ranges = rstr.split(",")
+valset: T.List[int] = []
+for r in ranges:
+# 1-3 => [1, 2, 3]
+if "-" in r:
+start, end = r.split("-")
+valset.extend(range(int(start), int(end) + 1))
 else:
-output += " " * (max_core_map_len + 1)
-print(output)
+valset.append(int(r))
+return valset
+
+
+def read_sysfs(path: str) -> str:
+"""Read a sysfs file and return its contents."""
+with open(path, encoding="utf-8") as fd:
+return fd.read().strip()
+
+
+def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None:
+"""Print a row of a table with the given column widths."""
+first, *rest = row
+w_first, *w_rest = col_widths
+first_end = " " * 4
+rest_end = " " * 10
+
+print(first.ljust(w_first), end=first_end)
+for cell, width in zip(rest, w_rest):
+print(cell.rjust(width), end=rest_end)
+print()
+
+
+def print_section(heading: str) -> None:
+"""Print a section heading."""
+sep = "=" * len(heading)
+print(sep)
+print(heading)
+print(sep)
+print()
+
+
+def main() -> None:
+"""Print CPU topology information."""
+sockets_s: T.Set[int] = set()
+cores_s: T.Set[int] = set()
+core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {}
+base_path = "/sys/devices/system/cpu"
+
+cpus = range_expand(read_sysfs(f"{base_path}/online"))
+
+for cpu in cpus:
+lcore_base = f"{base_path}/cpu{cpu}"
+

[PATCH v5 2/4] usertools/cpu_layout: print out NUMA nodes

2024-08-21 Thread Anatoly Burakov

In the traditional NUMA case, NUMA nodes and physical sockets were used
interchangeably, but there are cases where there can be multiple NUMA
nodes per socket, as well as cases where all CPUs are assigned to NUMA
node 0 even when there are multiple sockets. Use sysfs to print out NUMA
information.

Signed-off-by: Anatoly Burakov 
---

Notes:
v2 -> v3:
- Sort imports alphabetically

 usertools/cpu_layout.py | 36 +++-
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index e133fb8ad3..976be1f8b2 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -5,6 +5,7 @@
 
 """Display CPU topology information."""
 
+import glob
 import typing as T
 
 
@@ -29,12 +30,21 @@ def read_sysfs(path: str) -> str:
 return fd.read().strip()
 
 
+def read_numa_node(base: str) -> int:
+"""Read the NUMA node of a CPU."""
+node_glob = f"{base}/node*"
+node_dirs = glob.glob(node_glob)
+if not node_dirs:
+return 0  # default to node 0
+return int(node_dirs[0].split("node")[1])
+
+
 def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None:
 """Print a row of a table with the given column widths."""
 first, *rest = row
 w_first, *w_rest = col_widths
 first_end = " " * 4
-rest_end = " " * 10
+rest_end = " " * 4
 
 print(first.ljust(w_first), end=first_end)
 for cell, width in zip(rest, w_rest):
@@ -56,6 +66,7 @@ def main() -> None:
 sockets_s: T.Set[int] = set()
 cores_s: T.Set[int] = set()
 core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {}
+numa_map: T.Dict[int, int] = {}
 base_path = "/sys/devices/system/cpu"
 
 cpus = range_expand(read_sysfs(f"{base_path}/online"))
@@ -64,12 +75,14 @@ def main() -> None:
 lcore_base = f"{base_path}/cpu{cpu}"
 core = int(read_sysfs(f"{lcore_base}/topology/core_id"))
 socket = int(read_sysfs(f"{lcore_base}/topology/physical_package_id"))
+node = read_numa_node(lcore_base)
 
 cores_s.add(core)
 sockets_s.add(socket)
 key = (socket, core)
 core_map.setdefault(key, [])
 core_map[key].append(cpu)
+numa_map[cpu] = node
 
 cores = sorted(cores_s)
 sockets = sorted(sockets_s)
@@ -78,24 +91,37 @@ def main() -> None:
 
 print("cores = ", cores)
 print("sockets = ", sockets)
+print("numa = ", sorted(set(numa_map.values())))
 print()
 
-# Core, [Socket, Socket, ...]
-heading_strs = "", *[f"Socket {s}" for s in sockets]
+# Core, [NUMA, Socket, NUMA, Socket, ...]
+heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")]
 sep_strs = tuple("-" * len(hstr) for hstr in heading_strs)
 rows: T.List[T.Tuple[str, ...]] = []
 
+prev_numa = None
 for c in cores:
 # Core,
 row: T.Tuple[str, ...] = (f"Core {c}",)
 
-# [lcores, lcores, ...]
+# assume NUMA changes symmetrically
+first_lcore = core_map[(0, c)][0]
+cur_numa = numa_map[first_lcore]
+numa_changed = prev_numa != cur_numa
+prev_numa = cur_numa
+
+# [NUMA, lcores, NUMA, lcores, ...]
 for s in sockets:
 try:
 lcores = core_map[(s, c)]
+numa = numa_map[lcores[0]]
+if numa_changed:
+row += (f"NUMA {numa}",)
+else:
+row += ("",)
 row += (str(lcores),)
 except KeyError:
-row += ("",)
+row += ("", "")
 rows += [row]
 
 # find max widths for each column, including header and rows
-- 
2.43.5

[PATCH v5 3/4] usertools/dpdk-hugepages.py: update coding style

2024-08-21 Thread Anatoly Burakov

Update coding style:

- make the code PEP-484 compliant
- add more comments, improve readability, use f-strings everywhere
- address all Python static analysis (e.g. mypy, pylint) warnings
- format code with Ruff
- improve error handling
- refactor printing and sysfs/procfs access functions
- sort huge page reservation status output by NUMA node

Signed-off-by: Anatoly Burakov 
Acked-by: Stephen Hemminger 
---

Notes:
v4 -> v5:
- Format with Ruff on default settings
- Replaced all instances of raw path strings with os.path.join
v3 -> v4:
- Format code with Ruff, line width 79 to avoid flake8 warnings
  (Flake8 is by default configured with line width 79 on my system)
v2 -> v3:
- Rewrite of the script as suggested by reviewers
v1 -> v2:
- Added commit that sorted output by NUMA node

 usertools/dpdk-hugepages.py | 518 +---
 1 file changed, 310 insertions(+), 208 deletions(-)

diff --git a/usertools/dpdk-hugepages.py b/usertools/dpdk-hugepages.py
index bf2575ba36..3fc3269c83 100755
--- a/usertools/dpdk-hugepages.py
+++ b/usertools/dpdk-hugepages.py
@@ -1,13 +1,15 @@
 #! /usr/bin/env python3
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2020 Microsoft Corporation
+
 """Script to query and setup huge pages for DPDK applications."""
 
 import argparse
-import glob
 import os
 import re
+import subprocess
 import sys
+import typing as T
 from math import log2
 
 # Standard binary prefix
@@ -15,194 +17,266 @@
 
 # systemd mount point for huge pages
 HUGE_MOUNT = "/dev/hugepages"
+# default directory for non-NUMA huge pages
+NO_NUMA_HUGE_DIR = "/sys/kernel/mm/hugepages"
+# default base directory for NUMA nodes
+NUMA_NODE_BASE_DIR = "/sys/devices/system/node"
+# procfs paths
+MEMINFO_PATH = "/proc/meminfo"
+MOUNTS_PATH = "/proc/mounts"
 
 
-def fmt_memsize(kb):
-'''Format memory size in kB into conventional format'''
+class HugepageMount:
+"""Mount operations for huge page filesystem."""
+
+def __init__(self, path: str, mounted: bool):
+self.path = path
+# current mount status
+self.mounted = mounted
+
+def mount(
+self, pagesize_kb: int, user: T.Optional[str], group: T.Optional[str]
+) -> None:
+"""Mount the huge TLB file system"""
+if self.mounted:
+return
+cmd = ["mount", "-t", "hugetlbfs"]
+cmd += ["-o", f"pagesize={pagesize_kb * 1024}"]
+if user is not None:
+cmd += ["-o", f"uid={user}"]
+if group is not None:
+cmd += ["-o", f"gid={group}"]
+cmd += ["nodev", self.path]
+
+subprocess.run(cmd, check=True)
+self.mounted = True
+
+def unmount(self) -> None:
+"""Unmount the huge TLB file system (if mounted)"""
+if self.mounted:
+subprocess.run(["umount", self.path], check=True)
+self.mounted = False
+
+
+class HugepageRes:
+"""Huge page reserve operations. Can be NUMA-node-specific."""
+
+def __init__(self, path: str, node: T.Optional[int] = None):
+self.path = path
+# if this is a per-NUMA node huge page dir, store the node number
+self.node = node
+self.valid_page_sizes = self._get_valid_page_sizes()
+
+def _get_valid_page_sizes(self) -> T.List[int]:
+"""Extract valid huge page sizes"""
+return [get_memsize(d.split("-")[1]) for d in os.listdir(self.path)]
+
+def _nr_pages_path(self, sz: int) -> str:
+if sz not in self.valid_page_sizes:
+raise ValueError(
+f"Invalid page size {sz}. " f"Valid sizes: {self.valid_page_sizes}"
+)
+return os.path.join(self.path, f"hugepages-{sz}kB", "nr_hugepages")
+
+def __getitem__(self, sz: int) -> int:
+"""Get current number of reserved pages of specified size"""
+with open(self._nr_pages_path(sz), encoding="utf-8") as f:
+return int(f.read())
+
+def __setitem__(self, sz: int, nr_pages: int) -> None:
+"""Set number of reserved pages of specified size"""
+with open(self._nr_pages_path(sz), "w", encoding="utf-8") as f:
+f.write(f"{nr_pages}\n")
+
+
+def fmt_memsize(kb: int) -> str:
+"""Format memory size in kB into conventional format"""
 logk = int(log2(kb) / 10)
 suffix = BINARY_PREFIX[logk]
-unit = 2**(logk * 10)
-re

[PATCH v5 4/4] usertools/dpdk-devbind: print NUMA node

2024-08-21 Thread Anatoly Burakov

Currently, devbind does not print out any NUMA information, which makes
it non-trivial to figure out which NUMA node a device belongs to. Add
printouts for NUMA information if NUMA support is enabled on the system.

Signed-off-by: Anatoly Burakov 
Acked-by: Robin Jarry 
---

Notes:
v1 -> v2:
- Added commit to print out NUMA information in devbind

 usertools/dpdk-devbind.py | 29 +
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py
index b276e8efc8..078e8c387b 100755
--- a/usertools/dpdk-devbind.py
+++ b/usertools/dpdk-devbind.py
@@ -110,6 +110,11 @@
 args = []
 
 
+# check if this system has NUMA support
+def is_numa():
+return os.path.exists('/sys/devices/system/node')
+
+
 # check if a specific kernel module is loaded
 def module_is_loaded(module):
 global loaded_modules
@@ -577,20 +582,28 @@ def show_device_status(devices_type, device_name, if_field=False):
 print("".join('=' * len(msg)))
 return
 
+print_numa = is_numa()
+
 # print each category separately, so we can clearly see what's used by DPDK
 if dpdk_drv:
+extra_param = "drv=%(Driver_str)s unused=%(Module_str)s"
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
 display_devices("%s devices using DPDK-compatible driver" % device_name,
-dpdk_drv, "drv=%(Driver_str)s unused=%(Module_str)s")
+dpdk_drv, extra_param)
 if kernel_drv:
-if_text = ""
+extra_param = "drv=%(Driver_str)s unused=%(Module_str)s"
 if if_field:
-if_text = "if=%(Interface)s "
-display_devices("%s devices using kernel driver" % device_name, kernel_drv,
-if_text + "drv=%(Driver_str)s "
-"unused=%(Module_str)s %(Active)s")
+extra_param = "if=%(Interface)s " + extra_param
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
+display_devices("%s devices using kernel driver" % device_name,
+kernel_drv, extra_param)
 if no_drv:
-display_devices("Other %s devices" % device_name, no_drv,
-"unused=%(Module_str)s")
+extra_param = "unused=%(Module_str)s"
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
+display_devices("Other %s devices" % device_name, no_drv, extra_param)
 
 
 def show_status():
-- 
2.43.5

[PATCH v6 1/4] usertools/cpu_layout: update coding style

2024-08-22 Thread Anatoly Burakov

Update coding style:

- make it PEP-484 compliant
- format code with Ruff
- address all mypy etc. warnings
- use f-strings in place of old-style string interpolation
- refactor printing to make the code more readable
- read valid CPU IDs from the "online" sysfs node

Signed-off-by: Anatoly Burakov 
Acked-by: Robin Jarry 
---

Notes:
v4->v5:
- Format with Ruff on default settings

v3->v4:
- Format with Ruff, line width 79

v1,v2 -> v3:
- Import typing as T instead of individual types

 usertools/cpu_layout.py | 161 ++--
 1 file changed, 106 insertions(+), 55 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index 891b9238fa..e133fb8ad3 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -3,62 +3,113 @@
 # Copyright(c) 2010-2014 Intel Corporation
 # Copyright(c) 2017 Cavium, Inc. All rights reserved.
 
-sockets = []
-cores = []
-core_map = {}
-base_path = "/sys/devices/system/cpu"
-fd = open("{}/kernel_max".format(base_path))
-max_cpus = int(fd.read())
-fd.close()
-for cpu in range(max_cpus + 1):
-try:
-fd = open("{}/cpu{}/topology/core_id".format(base_path, cpu))
-except IOError:
-continue
-core = int(fd.read())
-fd.close()
-fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu))
-socket = int(fd.read())
-fd.close()
-if core not in cores:
-cores.append(core)
-if socket not in sockets:
-sockets.append(socket)
-key = (socket, core)
-if key not in core_map:
-core_map[key] = []
-core_map[key].append(cpu)
+"""Display CPU topology information."""
 
-print(format("=" * (47 + len(base_path))))
-print("Core and Socket Information (as reported by '{}')".format(base_path))
-print("{}\n".format("=" * (47 + len(base_path))))
-print("cores = ", cores)
-print("sockets = ", sockets)
-print("")
+import typing as T
 
-max_processor_len = len(str(len(cores) * len(sockets) * 2 - 1))
-max_thread_count = len(list(core_map.values())[0])
-max_core_map_len = (max_processor_len * max_thread_count)  \
-  + len(", ") * (max_thread_count - 1) \
-  + len('[]') + len('Socket ')
-max_core_id_len = len(str(max(cores)))
 
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket '))
-print(output)
-
-output = " ".ljust(max_core_id_len + len('Core '))
-for s in sockets:
-output += " ".ljust(max_core_map_len)
-output += " "
-print(output)
-
-for c in cores:
-output = "Core %s" % str(c).ljust(max_core_id_len)
-for s in sockets:
-if (s, c) in core_map:
-output += " " + str(core_map[(s, c)]).ljust(max_core_map_len)
+def range_expand(rstr: str) -> T.List[int]:
+"""Expand a range string into a list of integers."""
+# 0,1-3 => [0, 1-3]
+ranges = rstr.split(",")
+valset: T.List[int] = []
+for r in ranges:
+# 1-3 => [1, 2, 3]
+if "-" in r:
+start, end = r.split("-")
+valset.extend(range(int(start), int(end) + 1))
 else:
-output += " " * (max_core_map_len + 1)
-print(output)
+valset.append(int(r))
+return valset
+
+
+def read_sysfs(path: str) -> str:
+"""Read a sysfs file and return its contents."""
+with open(path, encoding="utf-8") as fd:
+return fd.read().strip()
+
+
+def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None:
+"""Print a row of a table with the given column widths."""
+first, *rest = row
+w_first, *w_rest = col_widths
+first_end = " " * 4
+rest_end = " " * 10
+
+print(first.ljust(w_first), end=first_end)
+for cell, width in zip(rest, w_rest):
+print(cell.rjust(width), end=rest_end)
+print()
+
+
+def print_section(heading: str) -> None:
+"""Print a section heading."""
+sep = "=" * len(heading)
+print(sep)
+print(heading)
+print(sep)
+print()
+
+
+def main() -> None:
+"""Print CPU topology information."""
+sockets_s: T.Set[int] = set()
+cores_s: T.Set[int] = set()
+core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {}
+base_path = "/sys/devices/system/cpu"
+
+cpus = range_expand(read_sysfs(f"{base_path}/online"))
+
+for cpu in cpus:
+lcore_base = f"{base_path}/

[PATCH v6 2/4] usertools/cpu_layout: print out NUMA nodes

2024-08-22 Thread Anatoly Burakov

In the traditional NUMA case, NUMA nodes and physical sockets were used
interchangeably, but there are cases where there can be multiple NUMA
nodes per socket, as well as cases where all CPUs are assigned to NUMA
node 0 even when there are multiple sockets. Use sysfs to print out NUMA
information.

Signed-off-by: Anatoly Burakov 
---

Notes:
v5 -> v6:
- Track NUMA changes per socket to avoid issues with missing cores

v2 -> v3:
- Sort imports alphabetically

 usertools/cpu_layout.py | 35 ++-
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index e133fb8ad3..5972cfecdb 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -5,6 +5,7 @@
 
 """Display CPU topology information."""
 
+import glob
 import typing as T
 
 
@@ -29,12 +30,21 @@ def read_sysfs(path: str) -> str:
 return fd.read().strip()
 
 
+def read_numa_node(base: str) -> int:
+"""Read the NUMA node of a CPU."""
+node_glob = f"{base}/node*"
+node_dirs = glob.glob(node_glob)
+if not node_dirs:
+return 0  # default to node 0
+return int(node_dirs[0].split("node")[1])
+
+
 def print_row(row: T.Tuple[str, ...], col_widths: T.List[int]) -> None:
 """Print a row of a table with the given column widths."""
 first, *rest = row
 w_first, *w_rest = col_widths
 first_end = " " * 4
-rest_end = " " * 10
+rest_end = " " * 4
 
 print(first.ljust(w_first), end=first_end)
 for cell, width in zip(rest, w_rest):
@@ -56,6 +66,7 @@ def main() -> None:
 sockets_s: T.Set[int] = set()
 cores_s: T.Set[int] = set()
 core_map: T.Dict[T.Tuple[int, int], T.List[int]] = {}
+numa_map: T.Dict[int, int] = {}
 base_path = "/sys/devices/system/cpu"
 
 cpus = range_expand(read_sysfs(f"{base_path}/online"))
@@ -64,12 +75,14 @@ def main() -> None:
 lcore_base = f"{base_path}/cpu{cpu}"
 core = int(read_sysfs(f"{lcore_base}/topology/core_id"))
 socket = int(read_sysfs(f"{lcore_base}/topology/physical_package_id"))
+node = read_numa_node(lcore_base)
 
 cores_s.add(core)
 sockets_s.add(socket)
 key = (socket, core)
 core_map.setdefault(key, [])
 core_map[key].append(cpu)
+numa_map[cpu] = node
 
 cores = sorted(cores_s)
 sockets = sorted(sockets_s)
@@ -78,24 +91,36 @@ def main() -> None:
 
 print("cores = ", cores)
 print("sockets = ", sockets)
+print("numa = ", sorted(set(numa_map.values())))
 print()
 
-# Core, [Socket, Socket, ...]
-heading_strs = "", *[f"Socket {s}" for s in sockets]
+# Core, [NUMA, Socket, NUMA, Socket, ...]
+heading_strs = "", *[v for s in sockets for v in ("", f"Socket {s}")]
 sep_strs = tuple("-" * len(hstr) for hstr in heading_strs)
 rows: T.List[T.Tuple[str, ...]] = []
 
+# track NUMA changes per socket
+prev_numa: T.Dict[int, T.Optional[int]] = {socket: None for socket in sockets}
 for c in cores:
 # Core,
 row: T.Tuple[str, ...] = (f"Core {c}",)
 
-# [lcores, lcores, ...]
+# [NUMA, lcores, NUMA, lcores, ...]
 for s in sockets:
 try:
 lcores = core_map[(s, c)]
+
+numa = numa_map[lcores[0]]
+numa_changed = prev_numa[s] != numa
+prev_numa[s] = numa
+
+if numa_changed:
+row += (f"NUMA {numa}",)
+else:
+row += ("",)
 row += (str(lcores),)
 except KeyError:
-row += ("",)
+row += ("", "")
 rows += [row]
 
 # find max widths for each column, including header and rows
-- 
2.43.5

[PATCH v6 3/4] usertools/dpdk-hugepages.py: update coding style

2024-08-22 Thread Anatoly Burakov

Update coding style:

- make the code PEP-484 compliant
- add more comments, improve readability, use f-strings everywhere
- address all Python static analysis (e.g. mypy, pylint) warnings
- format code with Ruff
- improve error handling
- refactor printing and sysfs/procfs access functions
- sort huge page reservation status output by NUMA node

Signed-off-by: Anatoly Burakov 
Acked-by: Stephen Hemminger 
Acked-by: Robin Jarry 
---

Notes:
v4 -> v5:
- Format with Ruff on default settings
- Replaced all instances of raw path strings with os.path.join
v3 -> v4:
- Format code with Ruff, line width 79 to avoid flake8 warnings
  (Flake8 is by default configured with line width 79 on my system)
v2 -> v3:
- Rewrite of the script as suggested by reviewers
v1 -> v2:
- Added commit that sorted output by NUMA node

 usertools/dpdk-hugepages.py | 518 +---
 1 file changed, 310 insertions(+), 208 deletions(-)

diff --git a/usertools/dpdk-hugepages.py b/usertools/dpdk-hugepages.py
index bf2575ba36..3fc3269c83 100755
--- a/usertools/dpdk-hugepages.py
+++ b/usertools/dpdk-hugepages.py
@@ -1,13 +1,15 @@
 #! /usr/bin/env python3
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright (c) 2020 Microsoft Corporation
+
 """Script to query and setup huge pages for DPDK applications."""
 
 import argparse
-import glob
 import os
 import re
+import subprocess
 import sys
+import typing as T
 from math import log2
 
 # Standard binary prefix
@@ -15,194 +17,266 @@
 
 # systemd mount point for huge pages
 HUGE_MOUNT = "/dev/hugepages"
+# default directory for non-NUMA huge pages
+NO_NUMA_HUGE_DIR = "/sys/kernel/mm/hugepages"
+# default base directory for NUMA nodes
+NUMA_NODE_BASE_DIR = "/sys/devices/system/node"
+# procfs paths
+MEMINFO_PATH = "/proc/meminfo"
+MOUNTS_PATH = "/proc/mounts"
 
 
-def fmt_memsize(kb):
-'''Format memory size in kB into conventional format'''
+class HugepageMount:
+"""Mount operations for huge page filesystem."""
+
+def __init__(self, path: str, mounted: bool):
+self.path = path
+# current mount status
+self.mounted = mounted
+
+def mount(
+self, pagesize_kb: int, user: T.Optional[str], group: T.Optional[str]
+) -> None:
+"""Mount the huge TLB file system"""
+if self.mounted:
+return
+cmd = ["mount", "-t", "hugetlbfs"]
+cmd += ["-o", f"pagesize={pagesize_kb * 1024}"]
+if user is not None:
+cmd += ["-o", f"uid={user}"]
+if group is not None:
+cmd += ["-o", f"gid={group}"]
+cmd += ["nodev", self.path]
+
+subprocess.run(cmd, check=True)
+self.mounted = True
+
+def unmount(self) -> None:
+"""Unmount the huge TLB file system (if mounted)"""
+if self.mounted:
+subprocess.run(["umount", self.path], check=True)
+self.mounted = False
+
+
+class HugepageRes:
+"""Huge page reserve operations. Can be NUMA-node-specific."""
+
+def __init__(self, path: str, node: T.Optional[int] = None):
+self.path = path
+# if this is a per-NUMA node huge page dir, store the node number
+self.node = node
+self.valid_page_sizes = self._get_valid_page_sizes()
+
+def _get_valid_page_sizes(self) -> T.List[int]:
+"""Extract valid huge page sizes"""
+return [get_memsize(d.split("-")[1]) for d in os.listdir(self.path)]
+
+def _nr_pages_path(self, sz: int) -> str:
+if sz not in self.valid_page_sizes:
+raise ValueError(
+f"Invalid page size {sz}. " f"Valid sizes: {self.valid_page_sizes}"
+)
+return os.path.join(self.path, f"hugepages-{sz}kB", "nr_hugepages")
+
+def __getitem__(self, sz: int) -> int:
+"""Get current number of reserved pages of specified size"""
+with open(self._nr_pages_path(sz), encoding="utf-8") as f:
+return int(f.read())
+
+def __setitem__(self, sz: int, nr_pages: int) -> None:
+"""Set number of reserved pages of specified size"""
+with open(self._nr_pages_path(sz), "w", encoding="utf-8") as f:
+f.write(f"{nr_pages}\n")
+
+
+def fmt_memsize(kb: int) -> str:
+"""Format memory size in kB into conventional format"""
 logk = int(log2(kb) / 10)
 suffix = BINARY_PREFIX[logk]
-unit = 2**(l

[PATCH v6 4/4] usertools/dpdk-devbind: print NUMA node

2024-08-22 Thread Anatoly Burakov

Currently, devbind does not print out any NUMA information, which makes
it non-trivial to figure out which NUMA node a device belongs to. Add
printouts for NUMA information if NUMA support is enabled on the system.

Signed-off-by: Anatoly Burakov 
Acked-by: Robin Jarry 
---

Notes:
v1 -> v2:
- Added commit to print out NUMA information in devbind

 usertools/dpdk-devbind.py | 29 +
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/usertools/dpdk-devbind.py b/usertools/dpdk-devbind.py
index b276e8efc8..078e8c387b 100755
--- a/usertools/dpdk-devbind.py
+++ b/usertools/dpdk-devbind.py
@@ -110,6 +110,11 @@
 args = []
 
 
+# check if this system has NUMA support
+def is_numa():
+return os.path.exists('/sys/devices/system/node')
+
+
 # check if a specific kernel module is loaded
 def module_is_loaded(module):
 global loaded_modules
@@ -577,20 +582,28 @@ def show_device_status(devices_type, device_name, if_field=False):
 print("".join('=' * len(msg)))
 return
 
+print_numa = is_numa()
+
 # print each category separately, so we can clearly see what's used by DPDK
 if dpdk_drv:
+extra_param = "drv=%(Driver_str)s unused=%(Module_str)s"
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
 display_devices("%s devices using DPDK-compatible driver" % device_name,
-dpdk_drv, "drv=%(Driver_str)s unused=%(Module_str)s")
+dpdk_drv, extra_param)
 if kernel_drv:
-if_text = ""
+extra_param = "drv=%(Driver_str)s unused=%(Module_str)s"
 if if_field:
-if_text = "if=%(Interface)s "
-display_devices("%s devices using kernel driver" % device_name, kernel_drv,
-if_text + "drv=%(Driver_str)s "
-"unused=%(Module_str)s %(Active)s")
+extra_param = "if=%(Interface)s " + extra_param
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
+display_devices("%s devices using kernel driver" % device_name,
+kernel_drv, extra_param)
 if no_drv:
-display_devices("Other %s devices" % device_name, no_drv,
-"unused=%(Module_str)s")
+extra_param = "unused=%(Module_str)s"
+if print_numa:
+extra_param = "numa_node=%(NUMANode)s " + extra_param
+display_devices("Other %s devices" % device_name, no_drv, extra_param)
 
 
 def show_status():
-- 
2.43.5

[PATCH v1 1/3] fbarray: rename tests to be more meaningful

2024-08-23 Thread Anatoly Burakov

Some tests reference internal implementation details of fbarray, and
conflate mask and index. Most other test names are not
very descriptive. Rename them, and adjust comments to explain things
in terms of what the tests actually do, instead of referring to internal
implementation details.

Also, add more tests that fill up exactly one mask, with and without
neighbouring set bits.

Signed-off-by: Anatoly Burakov 
---
 app/test/test_fbarray.c | 99 -
 1 file changed, 67 insertions(+), 32 deletions(-)

diff --git a/app/test/test_fbarray.c b/app/test/test_fbarray.c
index 09f6907fb1..6ca509b898 100644
--- a/app/test/test_fbarray.c
+++ b/app/test/test_fbarray.c
@@ -104,7 +104,7 @@ static int first_msk_test_setup(void)
return init_aligned();
 }
 
-static int cross_msk_test_setup(void)
+static int contig_test_setup(void)
 {
/* put all within second and third mask */
param.start = 70;
@@ -112,7 +112,7 @@ static int cross_msk_test_setup(void)
return init_aligned();
 }
 
-static int multi_msk_test_setup(void)
+static int large_contig_test_setup(void)
 {
/* put all within first and last mask */
param.start = 3;
@@ -128,15 +128,39 @@ static int last_msk_test_setup(void)
return init_aligned();
 }
 
+static int full_index_test_setup(void)
+{
+   /* fill entire index */
+   param.start = 0;
+   param.end = FBARRAY_TEST_LEN - 1;
+   return init_aligned();
+}
+
 static int full_msk_test_setup(void)
 {
-   /* fill entire mask */
+   /* fill one mask */
param.start = 0;
-   param.end = FBARRAY_TEST_LEN - 1;
+   param.end = 63;
return init_aligned();
 }
 
-static int lookahead_test_setup(void)
+static int full_msk_contig_fwd_test_setup(void)
+{
+   /* fill one mask plus one item */
+   param.start = 64;
+   param.end = 128;
+   return init_aligned();
+}
+
+static int full_msk_contig_rev_test_setup(void)
+{
+   /* fill one mask plus one item */
+   param.start = 63;
+   param.end = 127;
+   return init_aligned();
+}
+
+static int cross_msk_test_setup(void)
 {
/* set index 64 as used */
param.start = 64;
@@ -144,7 +168,7 @@ static int lookahead_test_setup(void)
return init_aligned();
 }
 
-static int lookbehind_test_setup(void)
+static int cross_msk_rev_test_setup(void)
 {
/* set index 63 as used */
param.start = 63;
@@ -160,6 +184,13 @@ static int unaligned_test_setup(void)
return init_unaligned();
 }
 
+static int full_unaligned_test_setup(void)
+{
+   unaligned.start = 0;
+   unaligned.end = FBARRAY_UNALIGNED_TEST_LEN - 1;
+   return init_unaligned();
+}
+
 static int test_invalid(void)
 {
struct rte_fbarray dummy;
@@ -786,7 +817,7 @@ static int test_empty(void)
return TEST_SUCCESS;
 }
 
-static int test_lookahead(void)
+static int test_cross_msk(void)
 {
int ret;
 
@@ -801,7 +832,7 @@ static int test_lookahead(void)
return TEST_SUCCESS;
 }
 
-static int test_lookbehind(void)
+static int test_cross_rev_msk(void)
 {
int ret, free_len = 2;
 
@@ -816,19 +847,19 @@ static int test_lookbehind(void)
return TEST_SUCCESS;
 }
 
-static int test_lookahead_mask(void)
+static int test_broken_run(void)
 {
/*
-* There is a certain type of lookahead behavior we want to test here,
-* namely masking of bits that were scanned with lookahead but that we
-* know do not match our criteria. This is achieved in following steps:
+* There is a certain type of search behavior we want to test here,
+* namely starting cross-mask runs and failing to find them. This is
+* achieved when these conditions happen:
 *
 *   0. Look for a big enough chunk of free space (say, 62 elements)
-*   1. Trigger lookahead by breaking a run somewhere inside mask 0
-*  (indices 0-63)
-*   2. Fail lookahead by breaking the run somewhere inside mask 1
-*  (indices 64-127)
-*   3. Ensure that we can still find free space in mask 1 afterwards
+*   1. Break a run somewhere inside mask 0 (indices 0-63) but leave
+*  some free elements at the end of mask 0 to start a run
+*   2. Break the run somewhere inside mask 1 (indices 64-127)
+*   3. Ensure that we can still find a free space run right after the
+*  second broken run
 */
 
/* break run on first mask */
@@ -842,19 +873,19 @@ static int test_lookahead_mask(void)
return TEST_SUCCESS;
 }
 
-static int test_lookbehind_mask(void)
+static int test_rev_broken_run(void)
 {
/*
-* There is a certain type of lookbehind behavior we want to test here,
-* namely masking of bits that were scanned with lookbehind but that we
-* know do not match our criteria. This is achieved in two steps:
+* There is a certain type

[PATCH v1 2/3] fbarray: rework find_next_n to flatten the loop

2024-08-23 Thread Anatoly Burakov

Currently, find_next_n() is implemented as a nested loop due to lookahead
functionality. This is not very efficient because when doing lookahead,
we essentially scan some of the indices twice, and in general the lookahead
functionality has been a source of bugs because it is overcomplicated.
The bit ignore feature on lookahead is also unnecessary because we don't really
win anything by ignoring bits we have already scanned, as they would not trigger
any matches anyway.

This patch reworks find_next_n() to flatten the loop and remove the lookahead
and bit-ignore functionality, instead replacing it with state machine-like
behavior. This makes the code simpler to reason about.

Signed-off-by: Anatoly Burakov 
---
 lib/eal/common/eal_common_fbarray.c | 195 +---
 1 file changed, 93 insertions(+), 102 deletions(-)

diff --git a/lib/eal/common/eal_common_fbarray.c 
b/lib/eal/common/eal_common_fbarray.c
index 22b43073c6..d38a43d8d1 100644
--- a/lib/eal/common/eal_common_fbarray.c
+++ b/lib/eal/common/eal_common_fbarray.c
@@ -117,9 +117,11 @@ find_next_n(const struct rte_fbarray *arr, unsigned int 
start, unsigned int n,
 {
const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
arr->len);
-   unsigned int msk_idx, lookahead_idx, first, first_mod;
+   unsigned int msk_idx, first, first_mod;
unsigned int last, last_mod;
-   uint64_t last_msk, ignore_msk;
+   uint64_t last_msk, first_msk;
+   unsigned int run_start, left = 0;
+   bool run_started = false;
 
/*
 * mask only has granularity of MASK_ALIGN, but start may not be aligned
@@ -128,7 +130,7 @@ find_next_n(const struct rte_fbarray *arr, unsigned int 
start, unsigned int n,
 */
first = MASK_LEN_TO_IDX(start);
first_mod = MASK_LEN_TO_MOD(start);
-   ignore_msk = ~((1ULL << first_mod) - 1);
+   first_msk = ~((1ULL << first_mod) - 1);
 
/* array length may not be aligned, so calculate ignore mask for last
 * mask index.
@@ -137,131 +139,120 @@ find_next_n(const struct rte_fbarray *arr, unsigned int 
start, unsigned int n,
last_mod = MASK_LEN_TO_MOD(arr->len);
last_msk = ~(UINT64_MAX << last_mod);
 
+   left = n;
+
for (msk_idx = first; msk_idx < msk->n_masks; msk_idx++) {
-   uint64_t cur_msk, lookahead_msk;
-   unsigned int run_start, clz, left;
-   bool found = false;
+   unsigned int s_idx, clz, need;
+   uint64_t cur_msk, tmp_msk;
+
/*
-* The process of getting n consecutive bits for arbitrary n is
-* a bit involved, but here it is in a nutshell:
+* In order to find N consecutive bits for arbitrary N, we need
+* to be aware of the following:
 *
-*  1. let n be the number of consecutive bits we're looking for
-*  2. check if n can fit in one mask, and if so, do n-1
-* rshift-ands to see if there is an appropriate run inside
-* our current mask
-*2a. if we found a run, bail out early
-*2b. if we didn't find a run, proceed
-*  3. invert the mask and count leading zeroes (that is, count
-* how many consecutive set bits we had starting from the
-* end of current mask) as k
-*3a. if k is 0, continue to next mask
-*3b. if k is not 0, we have a potential run
-*  4. to satisfy our requirements, next mask must have n-k
-* consecutive set bits right at the start, so we will do
-* (n-k-1) rshift-ands and check if first bit is set.
+*  1. To find N number of consecutive bits within a mask, we
+* need to do N-1 rshift-ands and see if we still have set
+* bits anywhere in the mask
+*  2. N may be larger than mask size, in which case we need to
+* do a search in multiple consecutive masks
+*  3. For multi-mask search to be meaningful, we need to anchor
+* our searches, i.e. first we find a run of M bits at the
+* end of current mask, then we look for N-M bits at the
+* beginning of next mask (or multiple masks)
 *
-* Step 4 will need to be repeated if (n-k) > MASK_ALIGN until
-* we either run out of masks, lose the run, or find what we
-* were looking for.
+* With all of the above, the algorithm looks as follows:
+*
+*  1. let N be the number of consecutive bits we're looking for
+*  2. if we already started a run, check

[PATCH v1 3/3] fbarray: rework find_prev_n to flatten the loop

2024-08-23 Thread Anatoly Burakov

Currently, find_prev_n() is implemented as a nested loop due to lookbehind
functionality. This is not very efficient because when doing lookbehind, we
essentially scan some of the indices twice, and in general the lookbehind
functionality has been a source of bugs because it is overcomplicated. The bit
ignore feature on lookbehind is also unnecessary because we don't really win
anything by ignoring bits we have already scanned, as they would not trigger any
matches anyway.

This patch reworks find_prev_n() to flatten the loop and remove the lookbehind
and bit-ignore functionality, instead replacing it with state machine-like
behavior. This makes the code simpler to reason about.

Signed-off-by: Anatoly Burakov 
---
 lib/eal/common/eal_common_fbarray.c | 223 +---
 1 file changed, 101 insertions(+), 122 deletions(-)

diff --git a/lib/eal/common/eal_common_fbarray.c 
b/lib/eal/common/eal_common_fbarray.c
index d38a43d8d1..a2a19af14a 100644
--- a/lib/eal/common/eal_common_fbarray.c
+++ b/lib/eal/common/eal_common_fbarray.c
@@ -382,164 +382,143 @@ find_prev_n(const struct rte_fbarray *arr, unsigned int 
start, unsigned int n,
 {
const struct used_mask *msk = get_used_mask(arr->data, arr->elt_sz,
arr->len);
-   unsigned int msk_idx, lookbehind_idx, first, first_mod;
-   uint64_t ignore_msk;
+   /* we're going backwards so we need negative space */
+   int64_t msk_idx;
+   unsigned int first, first_mod;
+   uint64_t first_msk;
+   unsigned int run_end, left;
+   bool run_started = false;
 
/*
 * mask only has granularity of MASK_ALIGN, but start may not be aligned
 * on that boundary, so construct a special mask to exclude anything we
-* don't want to see to avoid confusing ctz.
+* don't want to see to avoid confusing clz. this "first" mask is
+* actually our last because we're going backwards, so no second mask
+* is required like in find_next_n case.
 */
first = MASK_LEN_TO_IDX(start);
first_mod = MASK_LEN_TO_MOD(start);
/* we're going backwards, so mask must start from the top */
-   ignore_msk = first_mod == MASK_ALIGN - 1 ?
+   first_msk = first_mod == MASK_ALIGN - 1 ?
UINT64_MAX : /* prevent overflow */
~(UINT64_MAX << (first_mod + 1));
 
+   left = n;
+
/* go backwards, include zero */
-   msk_idx = first;
-   do {
-   uint64_t cur_msk, lookbehind_msk;
-   unsigned int run_start, run_end, ctz, left;
-   bool found = false;
+   for (msk_idx = first; msk_idx >= 0; msk_idx--) {
+   unsigned int s_idx, ctz, need;
+   uint64_t cur_msk, tmp_msk;
+
/*
-* The process of getting n consecutive bits from the top for
-* arbitrary n is a bit involved, but here it is in a nutshell:
+* In order to find N consecutive bits for arbitrary N, we need
+* to be aware of the following:
 *
-*  1. let n be the number of consecutive bits we're looking for
-*  2. check if n can fit in one mask, and if so, do n-1
-* lshift-ands to see if there is an appropriate run inside
-* our current mask
-*2a. if we found a run, bail out early
-*2b. if we didn't find a run, proceed
-*  3. invert the mask and count trailing zeroes (that is, count
-* how many consecutive set bits we had starting from the
-* start of current mask) as k
-*3a. if k is 0, continue to next mask
-*3b. if k is not 0, we have a potential run
-*  4. to satisfy our requirements, next mask must have n-k
-* consecutive set bits at the end, so we will do (n-k-1)
-* lshift-ands and check if last bit is set.
+*  1. To find N number of consecutive bits within a mask, we
+* need to do N-1 lshift-ands and see if we still have set
+* bits anywhere in the mask
+*  2. N may be larger than mask size, in which case we need to
+* do a search in multiple consecutive masks
+*  3. For multi-mask search to be meaningful, we need to anchor
+* our searches, i.e. first we find a run of M bits at the
+* beginning of current mask, then we look for N-M bits at
+* the end of previous mask (or multiple masks)
 *
-* Step 4 will need to be repeated if (n-k) > MASK_ALIGN until
-* we either run out of masks, lose the run, or find what we

[PATCH v1 01/15] net/ixgbe/base: remove minsrevs code from DPDK

2024-08-29 Thread Anatoly Burakov

Commit add44414762c ("net/ixgbe/base: add E610 NVM-related operations") has
added code to read minimum security revision from NVM. This code was not
meant to be included in DPDK, and was only meant for other drivers derived
from shared base code, but was present due to the way shared driver code
snapshot was generated. Remove this code from DPDK driver.

Fixes: add44414762c ("net/ixgbe/base: add E610 NVM-related operations")

Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_e610.c | 54 -
 drivers/net/ixgbe/base/ixgbe_e610.h |  1 -
 2 files changed, 55 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c 
b/drivers/net/ixgbe/base/ixgbe_e610.c
index ac71980630..3e2be07731 100644
--- a/drivers/net/ixgbe/base/ixgbe_e610.c
+++ b/drivers/net/ixgbe/base/ixgbe_e610.c
@@ -2488,60 +2488,6 @@ static s32 ixgbe_read_nvm_sr_copy(struct ixgbe_hw *hw,
return ixgbe_read_nvm_module(hw, bank, hdr_len + offset, data);
 }
 
-/**
- * ixgbe_get_nvm_minsrevs - Get the minsrevs values from flash
- * @hw: pointer to the HW struct
- * @minsrevs: structure to store NVM and OROM minsrev values
- *
- * Read the Minimum Security Revision TLV and extract
- * the revision values from the flash image
- * into a readable structure for processing.
- *
- * Return: the exit code of the operation.
- */
-s32 ixgbe_get_nvm_minsrevs(struct ixgbe_hw *hw,
-  struct ixgbe_minsrev_info *minsrevs)
-{
-   struct ixgbe_aci_cmd_nvm_minsrev data;
-   s32 status;
-   u16 valid;
-
-   status = ixgbe_acquire_nvm(hw, IXGBE_RES_READ);
-   if (status)
-   return status;
-
-   status = ixgbe_aci_read_nvm(hw, IXGBE_ACI_NVM_MINSREV_MOD_ID,
-   0, sizeof(data), &data,
-   true, false);
-
-   ixgbe_release_nvm(hw);
-
-   if (status)
-   return status;
-
-   valid = IXGBE_LE16_TO_CPU(data.validity);
-
-   /* Extract NVM minimum security revision */
-   if (valid & IXGBE_ACI_NVM_MINSREV_NVM_VALID) {
-   u16 minsrev_l = IXGBE_LE16_TO_CPU(data.nvm_minsrev_l);
-   u16 minsrev_h = IXGBE_LE16_TO_CPU(data.nvm_minsrev_h);
-
-   minsrevs->nvm = minsrev_h << 16 | minsrev_l;
-   minsrevs->nvm_valid = true;
-   }
-
-   /* Extract the OROM minimum security revision */
-   if (valid & IXGBE_ACI_NVM_MINSREV_OROM_VALID) {
-   u16 minsrev_l = IXGBE_LE16_TO_CPU(data.orom_minsrev_l);
-   u16 minsrev_h = IXGBE_LE16_TO_CPU(data.orom_minsrev_h);
-
-   minsrevs->orom = minsrev_h << 16 | minsrev_l;
-   minsrevs->orom_valid = true;
-   }
-
-   return IXGBE_SUCCESS;
-}
-
 /**
  * ixgbe_get_nvm_srev - Read the security revision from the NVM CSS header
  * @hw: pointer to the HW struct
diff --git a/drivers/net/ixgbe/base/ixgbe_e610.h 
b/drivers/net/ixgbe/base/ixgbe_e610.h
index 33c683d1c1..4babee821e 100644
--- a/drivers/net/ixgbe/base/ixgbe_e610.h
+++ b/drivers/net/ixgbe/base/ixgbe_e610.h
@@ -85,7 +85,6 @@ s32 ixgbe_aci_read_nvm(struct ixgbe_hw *hw, u16 
module_typeid, u32 offset,
 s32 ixgbe_nvm_validate_checksum(struct ixgbe_hw *hw);
 s32 ixgbe_nvm_recalculate_checksum(struct ixgbe_hw *hw);
 
-s32 ixgbe_get_nvm_minsrevs(struct ixgbe_hw *hw, struct ixgbe_minsrev_info 
*minsrevs);
 s32 ixgbe_get_inactive_nvm_ver(struct ixgbe_hw *hw, struct ixgbe_nvm_info 
*nvm);
 s32 ixgbe_get_active_nvm_ver(struct ixgbe_hw *hw, struct ixgbe_nvm_info *nvm);
 s32 ixgbe_init_nvm(struct ixgbe_hw *hw);
-- 
2.43.5

[PATCH v1 02/15] net/ixgbe/base: add missing ACI definitions

2024-08-29 Thread Anatoly Burakov

When adding Admin Command Interface and E610 device support, some ACI
capabilities definition code was missed due to the way shared driver code
snapshot was generated. Add missing code paths.

Fixes: 25b48e569f2f ("net/ixgbe/base: add E610 Admin Command Interface")
Fixes: 7c3bfffda43d ("net/ixgbe/base: detect E610 device capabilities")

Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_e610.c  | 49 
 drivers/net/ixgbe/base/ixgbe_type_e610.h |  4 ++
 2 files changed, 53 insertions(+)

diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c 
b/drivers/net/ixgbe/base/ixgbe_e610.c
index 3e2be07731..1f3a4532da 100644
--- a/drivers/net/ixgbe/base/ixgbe_e610.c
+++ b/drivers/net/ixgbe/base/ixgbe_e610.c
@@ -671,6 +671,9 @@ ixgbe_parse_common_caps(struct ixgbe_hw *hw, struct 
ixgbe_hw_common_caps *caps,
case IXGBE_ACI_CAPS_VALID_FUNCTIONS:
caps->valid_functions = number;
break;
+   case IXGBE_ACI_CAPS_SRIOV:
+   caps->sr_iov_1_1 = (number == 1);
+   break;
case IXGBE_ACI_CAPS_VMDQ:
caps->vmdq = (number == 1);
break;
@@ -833,6 +836,25 @@ ixgbe_parse_valid_functions_cap(struct ixgbe_hw *hw,
hw->logical_pf_id = ixgbe_func_id_to_logical_id(number, hw->pf_id);
 }
 
+/**
+ * ixgbe_parse_vf_dev_caps - Parse IXGBE_ACI_CAPS_VF device caps
+ * @hw: pointer to the HW struct
+ * @dev_p: pointer to device capabilities structure
+ * @cap: capability element to parse
+ *
+ * Parse IXGBE_ACI_CAPS_VF for device capabilities.
+ */
+static void ixgbe_parse_vf_dev_caps(struct ixgbe_hw *hw,
+   struct ixgbe_hw_dev_caps *dev_p,
+   struct ixgbe_aci_cmd_list_caps_elem *cap)
+{
+   u32 number = IXGBE_LE32_TO_CPU(cap->number);
+
+   UNREFERENCED_1PARAMETER(hw);
+
+   dev_p->num_vfs_exposed = number;
+}
+
 /**
  * ixgbe_parse_vsi_dev_caps - Parse IXGBE_ACI_CAPS_VSI device caps
  * @hw: pointer to the HW struct
@@ -944,6 +966,9 @@ static void ixgbe_parse_dev_caps(struct ixgbe_hw *hw,
ixgbe_parse_valid_functions_cap(hw, dev_p,
&cap_resp[i]);
break;
+   case IXGBE_ACI_CAPS_VF:
+   ixgbe_parse_vf_dev_caps(hw, dev_p, &cap_resp[i]);
+   break;
case IXGBE_ACI_CAPS_VSI:
ixgbe_parse_vsi_dev_caps(hw, dev_p, &cap_resp[i]);
break;
@@ -962,6 +987,27 @@ static void ixgbe_parse_dev_caps(struct ixgbe_hw *hw,
 
 }
 
+/**
+ * ixgbe_parse_vf_func_caps - Parse IXGBE_ACI_CAPS_VF function caps
+ * @hw: pointer to the HW struct
+ * @func_p: pointer to function capabilities structure
+ * @cap: pointer to the capability element to parse
+ *
+ * Extract function capabilities for IXGBE_ACI_CAPS_VF.
+ */
+static void ixgbe_parse_vf_func_caps(struct ixgbe_hw *hw,
+struct ixgbe_hw_func_caps *func_p,
+struct ixgbe_aci_cmd_list_caps_elem *cap)
+{
+   u32 logical_id = IXGBE_LE32_TO_CPU(cap->logical_id);
+   u32 number = IXGBE_LE32_TO_CPU(cap->number);
+
+   UNREFERENCED_1PARAMETER(hw);
+
+   func_p->num_allocd_vfs = number;
+   func_p->vf_base_id = logical_id;
+}
+
 /**
  * ixgbe_get_num_per_func - determine number of resources per PF
  * @hw: pointer to the HW structure
@@ -1073,6 +1119,9 @@ static void ixgbe_parse_func_caps(struct ixgbe_hw *hw,
&cap_resp[i], "func caps");
 
switch (cap) {
+   case IXGBE_ACI_CAPS_VF:
+   ixgbe_parse_vf_func_caps(hw, func_p, &cap_resp[i]);
+   break;
case IXGBE_ACI_CAPS_VSI:
ixgbe_parse_vsi_func_caps(hw, func_p, &cap_resp[i]);
break;
diff --git a/drivers/net/ixgbe/base/ixgbe_type_e610.h 
b/drivers/net/ixgbe/base/ixgbe_type_e610.h
index 9e72053e2a..dcb874e42e 100644
--- a/drivers/net/ixgbe/base/ixgbe_type_e610.h
+++ b/drivers/net/ixgbe/base/ixgbe_type_e610.h
@@ -672,6 +672,8 @@ struct ixgbe_aci_cmd_list_caps_elem {
__le16 cap;
 #define IXGBE_ACI_CAPS_VALID_FUNCTIONS 0x0005
 #define IXGBE_ACI_MAX_VALID_FUNCTIONS  0x8
+#define IXGBE_ACI_CAPS_SRIOV   0x0012
+#define IXGBE_ACI_CAPS_VF  0x0013
 #define IXGBE_ACI_CAPS_VMDQ0x0014
 #define IXGBE_ACI_CAPS_VSI 0x0017
 #define IXGBE_ACI_CAPS_DCB 0x0018
@@ -1954,6 +1956,8 @@ struct ixgbe_hw_common_caps {
 #define IXGBE_MAX_SUPPORTED_GPIO_SDP   8
u8 led[IXGBE_MAX_SUPPORTED_GPIO_LED];
u8 sdp[IXGBE_MAX_SUPPORTED_GPIO_SDP];
+

[PATCH v1 03/15] net/ixgbe/base: add missing E610 definitions

2024-08-29 Thread Anatoly Burakov

When adding support for E610 bringup, some definitions and code paths were
accidentally omitted due to the way the shared driver snapshot was created.
Add missing definitions and code paths.

Fixes: 316637762a5f ("net/ixgbe/base: enable E610 device")

Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_api.c   | 5 +
 drivers/net/ixgbe/base/ixgbe_type.h  | 2 ++
 drivers/net/ixgbe/base/ixgbe_type_e610.h | 3 +++
 3 files changed, 10 insertions(+)

diff --git a/drivers/net/ixgbe/base/ixgbe_api.c 
b/drivers/net/ixgbe/base/ixgbe_api.c
index c8f9a6d9f1..b4920867bc 100644
--- a/drivers/net/ixgbe/base/ixgbe_api.c
+++ b/drivers/net/ixgbe/base/ixgbe_api.c
@@ -87,6 +87,7 @@ s32 ixgbe_init_shared_code(struct ixgbe_hw *hw)
case ixgbe_mac_X550_vf:
case ixgbe_mac_X550EM_x_vf:
case ixgbe_mac_X550EM_a_vf:
+   case ixgbe_mac_E610_vf:
status = ixgbe_init_ops_vf(hw);
break;
case ixgbe_mac_E610:
@@ -219,6 +220,10 @@ s32 ixgbe_set_mac_type(struct ixgbe_hw *hw)
hw->mac.type = ixgbe_mac_E610;
hw->mvals = ixgbe_mvals_X550EM_a;
break;
+   case IXGBE_DEV_ID_E610_VF:
+   hw->mac.type = ixgbe_mac_E610_vf;
+   hw->mvals = ixgbe_mvals_X550EM_a;
+   break;
default:
ret_val = IXGBE_ERR_DEVICE_NOT_SUPPORTED;
ERROR_REPORT2(IXGBE_ERROR_UNSUPPORTED,
diff --git a/drivers/net/ixgbe/base/ixgbe_type.h 
b/drivers/net/ixgbe/base/ixgbe_type.h
index d86049426e..f6d5052c65 100644
--- a/drivers/net/ixgbe/base/ixgbe_type.h
+++ b/drivers/net/ixgbe/base/ixgbe_type.h
@@ -130,6 +130,7 @@
 #define IXGBE_DEV_ID_E610_10G_T0x57B0
 #define IXGBE_DEV_ID_E610_2_5G_T   0x57B1
 #define IXGBE_DEV_ID_E610_SGMII0x57B2
+#define IXGBE_DEV_ID_E610_VF   0x57AD
 
 #define IXGBE_CAT(r, m) IXGBE_##r##m
 
@@ -3676,6 +3677,7 @@ enum ixgbe_mac_type {
ixgbe_mac_X550EM_x_vf,
ixgbe_mac_X550EM_a_vf,
ixgbe_mac_E610,
+   ixgbe_mac_E610_vf,
ixgbe_num_macs
 };
 
diff --git a/drivers/net/ixgbe/base/ixgbe_type_e610.h 
b/drivers/net/ixgbe/base/ixgbe_type_e610.h
index dcb874e42e..ab57852f19 100644
--- a/drivers/net/ixgbe/base/ixgbe_type_e610.h
+++ b/drivers/net/ixgbe/base/ixgbe_type_e610.h
@@ -2080,6 +2080,8 @@ struct ixgbe_orom_civd_info {
 /* Function specific capabilities */
 struct ixgbe_hw_func_caps {
struct ixgbe_hw_common_caps common_cap;
+   u32 num_allocd_vfs; /* Number of allocated VFs */
+   u32 vf_base_id; /* Logical ID of the first VF */
u32 guar_num_vsi;
struct ixgbe_ts_func_info ts_func_info;
bool no_drop_policy_ena;
@@ -2088,6 +2090,7 @@ struct ixgbe_hw_func_caps {
 /* Device wide capabilities */
 struct ixgbe_hw_dev_caps {
struct ixgbe_hw_common_caps common_cap;
+   u32 num_vfs_exposed;/* Total number of VFs exposed */
u32 num_vsi_allocd_to_host; /* Excluding EMP VSI */
u32 num_flow_director_fltr; /* Number of FD filters available */
struct ixgbe_ts_dev_info ts_dev_info;
-- 
2.43.5

[PATCH v1 04/15] net/ixgbe/base: add missing legacy mailbox API

2024-08-29 Thread Anatoly Burakov

When the new mailbox API was introduced, the legacy mailbox API was also
provided, but was missing from the patches due to the way the patches were
generated. This patch adds the missing legacy mailbox API to the driver.

Fixes: 6d243d2caf2c ("net/ixgbe/base: introduce new mailbox API")

Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_mbx.c | 44 ++
 drivers/net/ixgbe/base/ixgbe_mbx.h |  1 +
 2 files changed, 45 insertions(+)

diff --git a/drivers/net/ixgbe/base/ixgbe_mbx.c 
b/drivers/net/ixgbe/base/ixgbe_mbx.c
index 444a0d339d..23659266d0 100644
--- a/drivers/net/ixgbe/base/ixgbe_mbx.c
+++ b/drivers/net/ixgbe/base/ixgbe_mbx.c
@@ -169,6 +169,26 @@ s32 ixgbe_check_for_rst(struct ixgbe_hw *hw, u16 mbx_id)
return ret_val;
 }
 
+/**
+ * ixgbe_clear_mbx - Clear Mailbox Memory
+ * @hw: pointer to the HW structure
+ * @mbx_id: id of mailbox to clear
+ *
+ * Set VFMBMEM of given VF to 0x0.
+ **/
+s32 ixgbe_clear_mbx(struct ixgbe_hw *hw, u16 mbx_id)
+{
+   struct ixgbe_mbx_info *mbx = &hw->mbx;
+   s32 ret_val = IXGBE_ERR_CONFIG;
+
+   DEBUGFUNC("ixgbe_clear_mbx");
+
+   if (mbx->ops[mbx_id].clear)
+   ret_val = mbx->ops[mbx_id].clear(hw, mbx_id);
+
+   return ret_val;
+}
+
 /**
  * ixgbe_poll_for_msg - Wait for message notification
  * @hw: pointer to the HW structure
@@ -628,6 +648,7 @@ void ixgbe_init_mbx_params_vf(struct ixgbe_hw *hw)
mbx->ops[0].check_for_msg = ixgbe_check_for_msg_vf;
mbx->ops[0].check_for_ack = ixgbe_check_for_ack_vf;
mbx->ops[0].check_for_rst = ixgbe_check_for_rst_vf;
+   mbx->ops[0].clear = NULL;
 
mbx->stats.msgs_tx = 0;
mbx->stats.msgs_rx = 0;
@@ -1024,6 +1045,27 @@ STATIC s32 ixgbe_read_mbx_pf(struct ixgbe_hw *hw, u32 
*msg, u16 size,
return IXGBE_SUCCESS;
 }
 
+/**
+ * ixgbe_clear_mbx_pf - Clear Mailbox Memory
+ * @hw: pointer to the HW structure
+ * @vf_id: the VF index
+ *
+ * Set VFMBMEM of given VF to 0x0.
+ **/
+STATIC s32 ixgbe_clear_mbx_pf(struct ixgbe_hw *hw, u16 vf_id)
+{
+   u16 mbx_size = hw->mbx.size;
+   u16 i;
+
+   if (vf_id > 63)
+   return IXGBE_ERR_PARAM;
+
+   for (i = 0; i < mbx_size; ++i)
+   IXGBE_WRITE_REG_ARRAY(hw, IXGBE_PFMBMEM(vf_id), i, 0x0);
+
+   return IXGBE_SUCCESS;
+}
+
 /**
  * ixgbe_init_mbx_params_pf_id - set initial values for pf mailbox
  * @hw: pointer to the HW structure
@@ -1042,6 +1084,7 @@ void ixgbe_init_mbx_params_pf_id(struct ixgbe_hw *hw, u16 
vf_id)
mbx->ops[vf_id].check_for_msg = ixgbe_check_for_msg_pf;
mbx->ops[vf_id].check_for_ack = ixgbe_check_for_ack_pf;
mbx->ops[vf_id].check_for_rst = ixgbe_check_for_rst_pf;
+   mbx->ops[vf_id].clear = ixgbe_clear_mbx_pf;
 }
 
 /**
@@ -1119,6 +1162,7 @@ void ixgbe_upgrade_mbx_params_pf(struct ixgbe_hw *hw, u16 
vf_id)
mbx->ops[vf_id].check_for_msg = ixgbe_check_for_msg_pf;
mbx->ops[vf_id].check_for_ack = ixgbe_check_for_ack_pf;
mbx->ops[vf_id].check_for_rst = ixgbe_check_for_rst_pf;
+   mbx->ops[vf_id].clear = ixgbe_clear_mbx_pf;
 
mbx->stats.msgs_tx = 0;
mbx->stats.msgs_rx = 0;
diff --git a/drivers/net/ixgbe/base/ixgbe_mbx.h 
b/drivers/net/ixgbe/base/ixgbe_mbx.h
index 56ab435286..434f7c6a69 100644
--- a/drivers/net/ixgbe/base/ixgbe_mbx.h
+++ b/drivers/net/ixgbe/base/ixgbe_mbx.h
@@ -168,6 +168,7 @@ s32 ixgbe_write_mbx(struct ixgbe_hw *hw, u32 *msg, u16 
size, u16 mbx_id);
 s32 ixgbe_check_for_msg(struct ixgbe_hw *hw, u16 mbx_id);
 s32 ixgbe_check_for_ack(struct ixgbe_hw *hw, u16 mbx_id);
 s32 ixgbe_check_for_rst(struct ixgbe_hw *hw, u16 mbx_id);
+s32 ixgbe_clear_mbx(struct ixgbe_hw *hw, u16 vf_number);
 void ixgbe_init_mbx_params_vf(struct ixgbe_hw *hw);
 void ixgbe_upgrade_mbx_params_vf(struct ixgbe_hw *hw);
 void ixgbe_init_mbx_params_pf(struct ixgbe_hw *hw);
-- 
2.43.5

[PATCH v1 05/15] net/ixgbe/base: add E610 VF HV macro

2024-08-29 Thread Anatoly Burakov

From: Jedrzej Jagielski 

At this point there is no macro specific to E610 VF HV.
Add it to ixgbe_type.h.

Signed-off-by: Jedrzej Jagielski 
Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_type.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ixgbe/base/ixgbe_type.h 
b/drivers/net/ixgbe/base/ixgbe_type.h
index f6d5052c65..cc49eace91 100644
--- a/drivers/net/ixgbe/base/ixgbe_type.h
+++ b/drivers/net/ixgbe/base/ixgbe_type.h
@@ -131,6 +131,7 @@
 #define IXGBE_DEV_ID_E610_2_5G_T   0x57B1
 #define IXGBE_DEV_ID_E610_SGMII0x57B2
 #define IXGBE_DEV_ID_E610_VF   0x57AD
+#define IXGBE_SUBDEV_ID_E610_VF_HV 0x0001
 
 #define IXGBE_CAT(r, m) IXGBE_##r##m
 
-- 
2.43.5

[PATCH v1 06/15] net/ixgbe/base: fix unchecked return value

2024-08-29 Thread Anatoly Burakov

From: Barbara Skobiej 

There was an unchecked return value in the ixgbe_stop_mac_link_on_d3_82599
function. Add a check of the return value from the called function
ixgbe_read_eeprom.

Signed-off-by: Barbara Skobiej 
Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_82599.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_82599.c 
b/drivers/net/ixgbe/base/ixgbe_82599.c
index c4ad906f0f..3110477700 100644
--- a/drivers/net/ixgbe/base/ixgbe_82599.c
+++ b/drivers/net/ixgbe/base/ixgbe_82599.c
@@ -556,13 +556,15 @@ enum ixgbe_media_type ixgbe_get_media_type_82599(struct 
ixgbe_hw *hw)
  **/
 void ixgbe_stop_mac_link_on_d3_82599(struct ixgbe_hw *hw)
 {
-   u32 autoc2_reg;
u16 ee_ctrl_2 = 0;
+   u32 autoc2_reg;
+   u32 status;
 
DEBUGFUNC("ixgbe_stop_mac_link_on_d3_82599");
-   ixgbe_read_eeprom(hw, IXGBE_EEPROM_CTRL_2, &ee_ctrl_2);
+   status = ixgbe_read_eeprom(hw, IXGBE_EEPROM_CTRL_2, &ee_ctrl_2);
 
-   if (!ixgbe_mng_present(hw) && !hw->wol_enabled &&
+   if (status == IXGBE_SUCCESS &&
+   !ixgbe_mng_present(hw) && !hw->wol_enabled &&
ee_ctrl_2 & IXGBE_EEPROM_CCD_BIT) {
autoc2_reg = IXGBE_READ_REG(hw, IXGBE_AUTOC2);
autoc2_reg |= IXGBE_AUTOC2_LINK_DISABLE_ON_D3_MASK;
-- 
2.43.5

[PATCH v1 07/15] net/ixgbe/base: fix media type handling for E610

2024-08-29 Thread Anatoly Burakov

From: Krzysztof Galazka 

Media type information should not be updated by ixgbe_aci_get_link_info
function because it will be incorrectly set as unknown when link is down.
Do it only in ixgbe_get_media_type_E610.

Signed-off-by: Krzysztof Galazka 
Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_e610.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c 
b/drivers/net/ixgbe/base/ixgbe_e610.c
index 1f3a4532da..b9b1ba32c3 100644
--- a/drivers/net/ixgbe/base/ixgbe_e610.c
+++ b/drivers/net/ixgbe/base/ixgbe_e610.c
@@ -1683,7 +1683,6 @@ s32 ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool 
ena_lse,
struct ixgbe_aci_cmd_get_link_status *resp;
struct ixgbe_link_status *li_old, *li;
struct ixgbe_fc_info *hw_fc_info;
-   enum ixgbe_media_type *hw_media_type;
struct ixgbe_aci_desc desc;
bool tx_pause, rx_pause;
u8 cmd_flags;
@@ -1693,7 +1692,6 @@ s32 ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool 
ena_lse,
return IXGBE_ERR_PARAM;
 
li_old = &hw->link.link_info_old;
-   hw_media_type = &hw->phy.media_type;
li = &hw->link.link_info;
hw_fc_info = &hw->fc;
 
@@ -1714,7 +1712,6 @@ s32 ixgbe_aci_get_link_info(struct ixgbe_hw *hw, bool 
ena_lse,
li->link_speed = IXGBE_LE16_TO_CPU(link_data.link_speed);
li->phy_type_low = IXGBE_LE64_TO_CPU(link_data.phy_type_low);
li->phy_type_high = IXGBE_LE64_TO_CPU(link_data.phy_type_high);
-   *hw_media_type = ixgbe_get_media_type_from_phy_type(hw);
li->link_info = link_data.link_info;
li->link_cfg_err = link_data.link_cfg_err;
li->an_info = link_data.an_info;
@@ -3664,10 +3661,11 @@ enum ixgbe_media_type ixgbe_get_media_type_E610(struct 
ixgbe_hw *hw)
}
}
 
-   /* Based on search above try to discover media type */
-   hw->phy.media_type = ixgbe_get_media_type_from_phy_type(hw);
}
 
+   /* Based on link status or search above try to discover media type */
+   hw->phy.media_type = ixgbe_get_media_type_from_phy_type(hw);
+
return hw->phy.media_type;
 }
 
-- 
2.43.5

[PATCH v1 08/15] net/ixgbe/base: fix speed autonegotiation on E610

2024-08-29 Thread Anatoly Burakov

From: Krzysztof Galazka 

When the user changed advertised speed settings and the link was already up,
the driver asked FW only for the active PHY configuration. This prevented it
from adding speeds which are supported but were earlier disabled by the user.
Get all speeds supported by HW to allow the user to enable any of them.

Signed-off-by: Krzysztof Galazka 
Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_e610.c | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c 
b/drivers/net/ixgbe/base/ixgbe_e610.c
index b9b1ba32c3..6eaf377f4a 100644
--- a/drivers/net/ixgbe/base/ixgbe_e610.c
+++ b/drivers/net/ixgbe/base/ixgbe_e610.c
@@ -4342,7 +4342,8 @@ s32 ixgbe_setup_phy_link_E610(struct ixgbe_hw *hw)
 {
struct ixgbe_aci_cmd_get_phy_caps_data pcaps;
struct ixgbe_aci_cmd_set_phy_cfg_data pcfg;
-   u8 rmode = IXGBE_ACI_REPORT_ACTIVE_CFG;
+   u8 rmode = IXGBE_ACI_REPORT_TOPO_CAP_MEDIA;
+   u64 sup_phy_type_low, sup_phy_type_high;
s32 rc;
 
rc = ixgbe_aci_get_link_info(hw, false, NULL);
@@ -4359,6 +4360,15 @@ s32 ixgbe_setup_phy_link_E610(struct ixgbe_hw *hw)
goto err;
}
 
+   sup_phy_type_low = pcaps.phy_type_low;
+   sup_phy_type_high = pcaps.phy_type_high;
+
+   /* Get Active configuration to avoid unintended changes */
+   rc = ixgbe_aci_get_phy_caps(hw, false, IXGBE_ACI_REPORT_ACTIVE_CFG,
+   &pcaps);
+   if (rc) {
+   goto err;
+   }
ixgbe_copy_phy_caps_to_cfg(&pcaps, &pcfg);
 
/* Set default PHY types for a given speed */
@@ -4406,8 +4416,8 @@ s32 ixgbe_setup_phy_link_E610(struct ixgbe_hw *hw)
}
 
/* Mask the set values to avoid requesting unsupported link types */
-   pcfg.phy_type_low &= pcaps.phy_type_low;
-   pcfg.phy_type_high &= pcaps.phy_type_high;
+   pcfg.phy_type_low &= sup_phy_type_low;
+   pcfg.phy_type_high &= sup_phy_type_high;
 
if (pcfg.phy_type_high != pcaps.phy_type_high ||
pcfg.phy_type_low != pcaps.phy_type_low ||
-- 
2.43.5

[PATCH v1 09/15] net/ixgbe/base: FW API version update

2024-08-29 Thread Anatoly Burakov

From: Pawel Malinowski 

Update FW API version to 1.7.

Signed-off-by: Pawel Malinowski 
Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_type_e610.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_type_e610.h 
b/drivers/net/ixgbe/base/ixgbe_type_e610.h
index ab57852f19..bad332c6b8 100644
--- a/drivers/net/ixgbe/base/ixgbe_type_e610.h
+++ b/drivers/net/ixgbe/base/ixgbe_type_e610.h
@@ -351,7 +351,7 @@
  */
 #define IXGBE_FW_API_VER_BRANCH0x00
 #define IXGBE_FW_API_VER_MAJOR 0x01
-#define IXGBE_FW_API_VER_MINOR 0x05
+#define IXGBE_FW_API_VER_MINOR 0x07
 #define IXGBE_FW_API_VER_DIFF_ALLOWED  0x02
 
 #define IXGBE_ACI_DESC_SIZE32
-- 
2.43.5

[PATCH v1 10/15] net/ixgbe/base: handle 5G link speed for E610

2024-08-29 Thread Anatoly Burakov

From: Piotr Kwapulinski 

When detecting the 5G link speed take into account the E610 VF MAC type in
ixgbe_check_mac_link_vf().

Signed-off-by: Piotr Kwapulinski 
Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_vf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_vf.c 
b/drivers/net/ixgbe/base/ixgbe_vf.c
index 0d5b29ba50..37556a9300 100644
--- a/drivers/net/ixgbe/base/ixgbe_vf.c
+++ b/drivers/net/ixgbe/base/ixgbe_vf.c
@@ -628,7 +628,8 @@ s32 ixgbe_check_mac_link_vf(struct ixgbe_hw *hw, 
ixgbe_link_speed *speed,
break;
case IXGBE_LINKS_SPEED_100_82599:
*speed = IXGBE_LINK_SPEED_100_FULL;
-   if (hw->mac.type == ixgbe_mac_X550_vf) {
+   if (hw->mac.type == ixgbe_mac_X550_vf ||
+   hw->mac.type == ixgbe_mac_E610_vf) {
if (links_reg & IXGBE_LINKS_SPEED_NON_STD)
*speed = IXGBE_LINK_SPEED_5GB_FULL;
}
-- 
2.43.5

[PATCH v1 11/15] net/ixgbe/base: remove FW API version check

2024-08-29 Thread Anatoly Burakov

From: Krzysztof Galazka 

Only certain variants of drivers rely on FW API version check in shared
code. Other drivers implement their own logic due to differences in
requirements. DPDK does not require the FW API check.

Signed-off-by: Krzysztof Galazka 
Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_e610.c | 31 +
 1 file changed, 1 insertion(+), 30 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c 
b/drivers/net/ixgbe/base/ixgbe_e610.c
index 6eaf377f4a..7ea495db97 100644
--- a/drivers/net/ixgbe/base/ixgbe_e610.c
+++ b/drivers/net/ixgbe/base/ixgbe_e610.c
@@ -3541,32 +3541,7 @@ s32 ixgbe_reset_hw_E610(struct ixgbe_hw *hw)
 reset_hw_out:
return status;
 }
-/**
- * ixgbe_fw_ver_check - Check the reported FW API version
- * @hw: pointer to the hardware structure
- *
- * Checks if the driver should load on a given FW API version.
- *
- * Return: 'true' if the driver should attempt to load. 'false' otherwise.
- */
-static bool ixgbe_fw_ver_check(struct ixgbe_hw *hw)
-{
-   if (hw->api_maj_ver > IXGBE_FW_API_VER_MAJOR) {
-   ERROR_REPORT1(IXGBE_ERROR_UNSUPPORTED, "The driver for the 
device stopped because the NVM image is newer than expected. You must install 
the most recent version of the network driver.\n");
-   return false;
-   } else if (hw->api_maj_ver == IXGBE_FW_API_VER_MAJOR) {
-   if (hw->api_min_ver >
-   (IXGBE_FW_API_VER_MINOR + IXGBE_FW_API_VER_DIFF_ALLOWED)) {
-   ERROR_REPORT1(IXGBE_ERROR_CAUTION, "The driver for the 
device detected a newer version of the NVM image than expected. Please install 
the most recent version of the network driver.\n");
-   } else if ((hw->api_min_ver + IXGBE_FW_API_VER_DIFF_ALLOWED) <
-  IXGBE_FW_API_VER_MINOR) {
-   ERROR_REPORT1(IXGBE_ERROR_CAUTION, "The driver for the 
device detected an older version of the NVM image than expected. Please update 
the NVM image.\n");
-   }
-   } else {
-   ERROR_REPORT1(IXGBE_ERROR_CAUTION, "The driver for the device 
detected an older version of the NVM image than expected. Please update the NVM 
image.\n");
-   }
-   return true;
-}
+
 /**
  * ixgbe_start_hw_E610 - Prepare hardware for Tx/Rx
  * @hw: pointer to hardware structure
@@ -3584,10 +3559,6 @@ s32 ixgbe_start_hw_E610(struct ixgbe_hw *hw)
if (ret_val)
goto out;
 
-   if (!ixgbe_fw_ver_check(hw)) {
-   ret_val = IXGBE_ERR_FW_API_VER;
-   goto out;
-   }
ret_val = ixgbe_start_hw_generic(hw);
if (ret_val != IXGBE_SUCCESS)
goto out;
-- 
2.43.5

[PATCH v1 12/15] net/ixgbe/base: disable thermal sensor ops for E610

2024-08-29 Thread Anatoly Burakov

From: Andrzej Wilczynski 

According to the data sheet, E610 doesn't expose current readings from its
thermal sensors. Currently, the E610 sensor ops are the same as for X540,
which includes the unsupported op. This patch disables those ops for E610 to
avoid attempts to read those sensors.

Signed-off-by: Andrzej Wilczynski 
Co-authored-by: RemigiuszX Konca 
Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_e610.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/ixgbe/base/ixgbe_e610.c 
b/drivers/net/ixgbe/base/ixgbe_e610.c
index 7ea495db97..ab02b11d6a 100644
--- a/drivers/net/ixgbe/base/ixgbe_e610.c
+++ b/drivers/net/ixgbe/base/ixgbe_e610.c
@@ -3431,6 +3431,8 @@ s32 ixgbe_init_ops_E610(struct ixgbe_hw *hw)
mac->ops.get_fw_tsam_mode = ixgbe_get_fw_tsam_mode_E610;
mac->ops.get_fw_version = ixgbe_aci_get_fw_ver;
mac->ops.get_nvm_version = ixgbe_get_active_nvm_ver;
+   mac->ops.get_thermal_sensor_data = NULL;
+   mac->ops.init_thermal_sensor_thresh = NULL;
 
/* PHY */
phy->ops.init = ixgbe_init_phy_ops_E610;
-- 
2.43.5

[PATCH v1 13/15] net/ixgbe/base: fix mailbox ACK handling

2024-08-29 Thread Anatoly Burakov

From: NorbertX Ciosek 

Check if the CTS bit is set in the mailbox message before waiting for an ACK.
Otherwise the ACK will never be received, causing the function to time out.
Add a note for ixgbe_write_mbx that it should be called while holding a lock.

Signed-off-by: NorbertX Ciosek 
Signed-off-by: Anatoly Burakov 
---
 drivers/net/ixgbe/base/ixgbe_mbx.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ixgbe/base/ixgbe_mbx.c 
b/drivers/net/ixgbe/base/ixgbe_mbx.c
index 23659266d0..fb8ea8ca68 100644
--- a/drivers/net/ixgbe/base/ixgbe_mbx.c
+++ b/drivers/net/ixgbe/base/ixgbe_mbx.c
@@ -82,6 +82,9 @@ s32 ixgbe_poll_mbx(struct ixgbe_hw *hw, u32 *msg, u16 size, 
u16 mbx_id)
  *
  * returns SUCCESS if it successfully copied message into the buffer and
  * received an ACK to that message within specified period
+ *
+ * Note that the caller to this function must lock before calling, since
+ * multiple threads can destroy each other messages.
  **/
 s32 ixgbe_write_mbx(struct ixgbe_hw *hw, u32 *msg, u16 size, u16 mbx_id)
 {
@@ -836,6 +839,11 @@ STATIC s32 ixgbe_obtain_mbx_lock_pf(struct ixgbe_hw *hw, 
u16 vf_id)
while (countdown--) {
/* Reserve mailbox for PF use */
pf_mailbox = IXGBE_READ_REG(hw, IXGBE_PFMAILBOX(vf_id));
+
+   /* Check if other thread holds the PF lock already */
+   if (pf_mailbox & IXGBE_PFMAILBOX_PFU)
+   goto retry;
+
pf_mailbox |= IXGBE_PFMAILBOX_PFU;
IXGBE_WRITE_REG(hw, IXGBE_PFMAILBOX(vf_id), pf_mailbox);
 
@@ -846,6 +854,7 @@ STATIC s32 ixgbe_obtain_mbx_lock_pf(struct ixgbe_hw *hw, 
u16 vf_id)
break;
}
 
+   retry:
/* Wait a bit before trying again */
usec_delay(mbx->usec_delay);
}
@@ -948,13 +957,14 @@ STATIC s32 ixgbe_write_mbx_pf(struct ixgbe_hw *hw, u32 
*msg, u16 size,
for (i = 0; i < size; i++)
IXGBE_WRITE_REG_ARRAY(hw, IXGBE_PFMBMEM(vf_id), i, msg[i]);
 
-   /* Interrupt VF to tell it a message has been sent */
+   /* interrupt VF to tell it a message has been sent */
pf_mailbox = IXGBE_READ_REG(hw, IXGBE_PFMAILBOX(vf_id));
pf_mailbox |= IXGBE_PFMAILBOX_STS;
IXGBE_WRITE_REG(hw, IXGBE_PFMAILBOX(vf_id), pf_mailbox);
 
/* if msg sent wait until we receive an ack */
-   ixgbe_poll_for_ack(hw, vf_id);
+   if (msg[0] & IXGBE_VT_MSGTYPE_CTS)
+   ixgbe_poll_for_ack(hw, vf_id);
 
/* update stats */
hw->mbx.stats.msgs_tx++;
-- 
2.43.5

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 3078 matches

Mail list logo