> 
> Now that we have everything in a C file, we can store the information
> about our sleep, and have a native mechanism to wake up the sleeping
> core. This mechanism, however, will only wake up a core that is sleeping
> while monitoring - waking up from `rte_power_pause` won't work.
> 
> Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
> ---
> 
> Notes:
>     v13:
>     - Add comments around wakeup code to explain what it does
>     - Add lcore_id parameter checking to prevent buffer overrun
> 
>  .../arm/include/rte_power_intrinsics.h        |  9 ++
>  .../include/generic/rte_power_intrinsics.h    | 16 ++++
>  .../ppc/include/rte_power_intrinsics.h        |  9 ++
>  lib/librte_eal/version.map                    |  1 +
>  lib/librte_eal/x86/rte_power_intrinsics.c     | 85 +++++++++++++++++++
>  5 files changed, 120 insertions(+)
> 
> diff --git a/lib/librte_eal/arm/include/rte_power_intrinsics.h 
> b/lib/librte_eal/arm/include/rte_power_intrinsics.h
> index 27869251a8..39e49cc45b 100644
> --- a/lib/librte_eal/arm/include/rte_power_intrinsics.h
> +++ b/lib/librte_eal/arm/include/rte_power_intrinsics.h
> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
>       RTE_SET_USED(tsc_timestamp);
>  }
> 
> +/**
> + * This function is not supported on ARM.
> + */
> +void
> +rte_power_monitor_wakeup(const unsigned int lcore_id)
> +{
> +     RTE_SET_USED(lcore_id);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/include/generic/rte_power_intrinsics.h 
> b/lib/librte_eal/include/generic/rte_power_intrinsics.h
> index a6f1955996..e311d6f8ea 100644
> --- a/lib/librte_eal/include/generic/rte_power_intrinsics.h
> +++ b/lib/librte_eal/include/generic/rte_power_intrinsics.h
> @@ -57,6 +57,22 @@ __rte_experimental
>  void rte_power_monitor(const struct rte_power_monitor_cond *pmc,
>               const uint64_t tsc_timestamp);
> 
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Wake up a specific lcore that is in a power optimized state and is 
> monitoring
> + * an address.
> + *
> + * @note This function will *not* wake up a core that is in a power optimized
> + *   state due to calling `rte_power_pause`.
> + *
> + * @param lcore_id
> + *   Lcore ID of a sleeping thread.
> + */
> +__rte_experimental
> +void rte_power_monitor_wakeup(const unsigned int lcore_id);
> +
>  /**
>   * @warning
>   * @b EXPERIMENTAL: this API may change without prior notice
> diff --git a/lib/librte_eal/ppc/include/rte_power_intrinsics.h 
> b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
> index 248d1f4a23..2e7db0e7eb 100644
> --- a/lib/librte_eal/ppc/include/rte_power_intrinsics.h
> +++ b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
> @@ -33,6 +33,15 @@ rte_power_pause(const uint64_t tsc_timestamp)
>       RTE_SET_USED(tsc_timestamp);
>  }
> 
> +/**
> + * This function is not supported on PPC64.
> + */
> +void
> +rte_power_monitor_wakeup(const unsigned int lcore_id)
> +{
> +     RTE_SET_USED(lcore_id);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/version.map b/lib/librte_eal/version.map
> index 20945b1efa..ac026e289d 100644
> --- a/lib/librte_eal/version.map
> +++ b/lib/librte_eal/version.map
> @@ -406,6 +406,7 @@ EXPERIMENTAL {
> 
>       # added in 21.02
>       rte_power_monitor;
> +     rte_power_monitor_wakeup;
>       rte_power_pause;
>  };
> 
> diff --git a/lib/librte_eal/x86/rte_power_intrinsics.c 
> b/lib/librte_eal/x86/rte_power_intrinsics.c
> index a9cd1afe9d..46a4fb6cd5 100644
> --- a/lib/librte_eal/x86/rte_power_intrinsics.c
> +++ b/lib/librte_eal/x86/rte_power_intrinsics.c
> @@ -2,8 +2,31 @@
>   * Copyright(c) 2020 Intel Corporation
>   */
> 
> +#include <rte_common.h>
> +#include <rte_lcore.h>
> +#include <rte_spinlock.h>
> +
>  #include "rte_power_intrinsics.h"
> 
> +/*
> + * Per-lcore structure holding current status of C0.2 sleeps.
> + */
> +static struct power_wait_status {
> +     rte_spinlock_t lock;
> +     volatile void *monitor_addr; /**< NULL if not currently sleeping */
> +} __rte_cache_aligned wait_status[RTE_MAX_LCORE];
> +
> +static inline void
> +__umwait_wakeup(volatile void *addr)
> +{
> +     uint64_t val;
> +
> +     /* trigger a write but don't change the value */
> +     val = __atomic_load_n((volatile uint64_t *)addr, __ATOMIC_RELAXED);
> +     __atomic_compare_exchange_n((volatile uint64_t *)addr, &val, val, 0,
> +                     __ATOMIC_RELAXED, __ATOMIC_RELAXED);
> +}
> +
>  static uint8_t wait_supported;
> 
>  static inline uint64_t
> @@ -36,6 +59,12 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc,
>  {
>       const uint32_t tsc_l = (uint32_t)tsc_timestamp;
>       const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
> +     const unsigned int lcore_id = rte_lcore_id();
> +     struct power_wait_status *s;
> +
> +     /* prevent non-EAL thread from using this API */
> +     if (lcore_id >= RTE_MAX_LCORE)
> +             return;
> 
>       /* prevent user from running this instruction if it's not supported */
>       if (!wait_supported)
> @@ -60,11 +89,24 @@ rte_power_monitor(const struct rte_power_monitor_cond 
> *pmc,
>               if (masked == pmc->val)
>                       return;
>       }
> +
> +     s = &wait_status[lcore_id];
> +
> +     /* update sleep address */
> +     rte_spinlock_lock(&s->lock);
> +     s->monitor_addr = pmc->addr;
> +     rte_spinlock_unlock(&s->lock);

It has been a while since I last looked at this,
but shouldn't we grab the lock before monitor()?
I.E:
lock();
monitor();
addr=...;
unlock();
umwait();

> +
>       /* execute UMWAIT */
>       asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
>                       : /* ignore rflags */
>                       : "D"(0), /* enter C0.2 */
>                         "a"(tsc_l), "d"(tsc_h));
> +
> +     /* erase sleep address */
> +     rte_spinlock_lock(&s->lock);
> +     s->monitor_addr = NULL;
> +     rte_spinlock_unlock(&s->lock);
>  }
> 
>  /**
> @@ -97,3 +139,46 @@ RTE_INIT(rte_power_intrinsics_init) {
>       if (i.power_monitor && i.power_pause)
>               wait_supported = 1;
>  }
> +
> +void
> +rte_power_monitor_wakeup(const unsigned int lcore_id)
> +{
> +     struct power_wait_status *s;
> +
> +     /* prevent buffer overrun */
> +     if (lcore_id >= RTE_MAX_LCORE)
> +             return;
> +
> +     /* prevent user from running this instruction if it's not supported */
> +     if (!wait_supported)
> +             return;
> +
> +     s = &wait_status[lcore_id];
> +
> +     /*
> +      * There is a race condition between sleep, wakeup and locking, but we
> +      * don't need to handle it.
> +      *
> +      * Possible situations:
> +      *
> +      * 1. T1 locks, sets address, unlocks
> +      * 2. T2 locks, triggers wakeup, unlocks
> +      * 3. T1 sleeps
> +      *
> +      * In this case, because T1 has already set the address for monitoring,
> +      * we will wake up immediately even if T2 triggers wakeup before T1
> +      * goes to sleep.
> +      *
> +      * 1. T1 locks, sets address, unlocks, goes to sleep, and wakes up
> +      * 2. T2 locks, triggers wakeup, and unlocks
> +      * 3. T1 locks, erases address, and unlocks
> +      *
> +      * In this case, since we've already woken up, the "wakeup" was
> +      * unneeded, and since T1 is still waiting on T2 releasing the lock, the
> +      * wakeup address is still valid so it's perfectly safe to write it.
> +      */
> +     rte_spinlock_lock(&s->lock);
> +     if (s->monitor_addr != NULL)
> +             __umwait_wakeup(s->monitor_addr);
> +     rte_spinlock_unlock(&s->lock);
> +}
> --
> 2.25.1

Reply via email to