Re: [PATCH 2/2] armv8: generic_timer: Use event stream for udelay

2024-05-01 Thread Peter Hoyes

On 4/29/24 16:16, Andre Przywara wrote:

On Tue, 23 Apr 2024 09:10:05 +0100
Peter Hoyes  wrote:

Hi,


From: Peter Hoyes 

Polling cntpct_el0 in a tight loop for delays is inefficient.
This is particularly apparent on Arm FVPs, which do not simulate
real time, meaning that a 1s sleep can take a couple of orders
of magnitude longer to execute in wall time.

If running at EL2 or above (where CNTHCTL_EL2 is available), enable
the cntpct_el0 event stream temporarily and use wfe to implement
the delay more efficiently. The event period is chosen as a
trade-off between efficiency and the fact that Arm FVPs do not
typically simulate real time.

This is only implemented for Armv8 boards, where an architectural
timer exists.

Some mach-socfpga AArch64 boards already override __udelay to make
it always inline, so guard the functionality with a new
ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.

So the code looks alright, and it works for me, tested on some hardware I
just had at hand.
However I am a bit worried about the scope of this patch. While it's
indeed purely architectural and should work on all systems, I think we
need to be careful with messing with the arch timer *while the OS is
running*. U-Boot code will run for UEFI runtime services and for serving
PSCI on some platforms, and I guess messing with CNTHCTL_EL2 is not a good
idea then. And udelay sounds like a basic function that this code might
use.
So I wonder if we should limit this to the Arm FVPs for now? To not create
subtle and hard-to-diagnose problems for a lot of boards?

Or maybe there is some mechanism to limit this to U-Boot boot time/UEFI
boot services?


Signed-off-by: Peter Hoyes 
---
  arch/arm/cpu/armv8/Kconfig |  8 
  arch/arm/cpu/armv8/generic_timer.c | 27 +++
  arch/arm/include/asm/system.h  |  6 --
  3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
index 9f0fb369f7..544c5e2d74 100644
--- a/arch/arm/cpu/armv8/Kconfig
+++ b/arch/arm/cpu/armv8/Kconfig
@@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
  Exception handling at all exception levels for External Abort and
  SError interrupt exception are taken in EL3.
  
+config ARMV8_UDELAY_EVENT_STREAM

+   bool "Use the event stream for udelay"
+   default y if !ARCH_SOCFPGA

As described above, change this to something like "... if ARCH_VEXPRESS64".

Cheers,
Andre


Agreed, this is limited to ARCH_VEXPRESS64 in v2.

Peter


+   help
+ Use the event stream provided by the AArch64 architectural timer for
+ delays. This is more efficient than the default polling
+ implementation.
+
  menuconfig ARMV8_CRYPTO
bool "ARM64 Accelerated Cryptographic Algorithms"
  
diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c

index 8f83372cbc..e18b5c8187 100644
--- a/arch/arm/cpu/armv8/generic_timer.c
+++ b/arch/arm/cpu/armv8/generic_timer.c
@@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
  
  	return val / get_tbclk();

  }
+
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
+void __udelay(unsigned long usec)
+{
+   u64 target = get_ticks() + usec_to_tick(usec);
+
+   /* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so 
often */
+   if (current_el() >= 2) {
+   u32 cnthctl_val;
+   const u8 event_period = 0x7;
+
+   asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
+   asm volatile("msr cnthctl_el2, %0" : : "r"
+   (cnthctl_val | CNTHCTL_EL2_EVNT_EN | 
CNTHCTL_EL2_EVNT_I(event_period)));
+
+   while (get_ticks() + (1ULL << event_period) <= target)
+   wfe();
+
+   /* Reset the event stream */
+   asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
+   }
+
+   /* Fall back to polling CNTPCT_EL0 */
+   while (get_ticks() <= target)
+   ;
+}
+#endif
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index 51123c2968..7e30cac32a 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -69,8 +69,10 @@
  /*
   * CNTHCTL_EL2 bits definitions
   */
-#define CNTHCTL_EL2_EL1PCEN_EN (1 << 1)  /* Physical timer regs accessible   */
-#define CNTHCTL_EL2_EL1PCTEN_EN(1 << 0)  /* Physical counter 
accessible  */
+#define CNTHCTL_EL2_EVNT_ENBIT(2)   /* Enable the event stream   */
+#define CNTHCTL_EL2_EVNT_I(val)((val) << 4) /* Event stream trigger 
bits */
+#define CNTHCTL_EL2_EL1PCEN_EN (1 << 1) /* Physical timer regs accessible 
*/
+#define CNTHCTL_EL2_EL1PCTEN_EN(1 << 0) /* Physical counter 
accessible   */
  
  /*

   * HCR_EL2 bits definitions


Re: [PATCH 2/2] armv8: generic_timer: Use event stream for udelay

2024-04-29 Thread Andre Przywara
On Tue, 23 Apr 2024 09:10:05 +0100
Peter Hoyes  wrote:

Hi,

> From: Peter Hoyes 
> 
> Polling cntpct_el0 in a tight loop for delays is inefficient.
> This is particularly apparent on Arm FVPs, which do not simulate
> real time, meaning that a 1s sleep can take a couple of orders
> of magnitude longer to execute in wall time.
> 
> If running at EL2 or above (where CNTHCTL_EL2 is available), enable
> the cntpct_el0 event stream temporarily and use wfe to implement
> the delay more efficiently. The event period is chosen as a
> trade-off between efficiency and the fact that Arm FVPs do not
> typically simulate real time.
> 
> This is only implemented for Armv8 boards, where an architectural
> timer exists.
> 
> Some mach-socfpga AArch64 boards already override __udelay to make
> it always inline, so guard the functionality with a new
> ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.

So the code looks alright, and it works for me, tested on some hardware I
just had at hand.
However I am a bit worried about the scope of this patch. While it's
indeed purely architectural and should work on all systems, I think we
need to be careful with messing with the arch timer *while the OS is
running*. U-Boot code will run for UEFI runtime services and for serving
PSCI on some platforms, and I guess messing with CNTHCTL_EL2 is not a good
idea then. And udelay sounds like a basic function that this code might
use.
So I wonder if we should limit this to the Arm FVPs for now? To not create
subtle and hard-to-diagnose problems for a lot of boards?

Or maybe there is some mechanism to limit this to U-Boot boot time/UEFI
boot services?

> Signed-off-by: Peter Hoyes 
> ---
>  arch/arm/cpu/armv8/Kconfig |  8 
>  arch/arm/cpu/armv8/generic_timer.c | 27 +++
>  arch/arm/include/asm/system.h  |  6 --
>  3 files changed, 39 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
> index 9f0fb369f7..544c5e2d74 100644
> --- a/arch/arm/cpu/armv8/Kconfig
> +++ b/arch/arm/cpu/armv8/Kconfig
> @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
> Exception handling at all exception levels for External Abort and
> SError interrupt exception are taken in EL3.
>  
> +config ARMV8_UDELAY_EVENT_STREAM
> + bool "Use the event stream for udelay"
> + default y if !ARCH_SOCFPGA

As described above, change this to something like "... if ARCH_VEXPRESS64".

Cheers,
Andre

> + help
> +   Use the event stream provided by the AArch64 architectural timer for
> +   delays. This is more efficient than the default polling
> +   implementation.
> +
>  menuconfig ARMV8_CRYPTO
>   bool "ARM64 Accelerated Cryptographic Algorithms"
>  
> diff --git a/arch/arm/cpu/armv8/generic_timer.c 
> b/arch/arm/cpu/armv8/generic_timer.c
> index 8f83372cbc..e18b5c8187 100644
> --- a/arch/arm/cpu/armv8/generic_timer.c
> +++ b/arch/arm/cpu/armv8/generic_timer.c
> @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
>  
>   return val / get_tbclk();
>  }
> +
> +#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
> +void __udelay(unsigned long usec)
> +{
> + u64 target = get_ticks() + usec_to_tick(usec);
> +
> + /* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so 
> often */
> + if (current_el() >= 2) {
> + u32 cnthctl_val;
> + const u8 event_period = 0x7;
> +
> + asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
> + asm volatile("msr cnthctl_el2, %0" : : "r"
> + (cnthctl_val | CNTHCTL_EL2_EVNT_EN | 
> CNTHCTL_EL2_EVNT_I(event_period)));
> +
> + while (get_ticks() + (1ULL << event_period) <= target)
> + wfe();
> +
> + /* Reset the event stream */
> + asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
> + }
> +
> + /* Fall back to polling CNTPCT_EL0 */
> + while (get_ticks() <= target)
> + ;
> +}
> +#endif
> diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
> index 51123c2968..7e30cac32a 100644
> --- a/arch/arm/include/asm/system.h
> +++ b/arch/arm/include/asm/system.h
> @@ -69,8 +69,10 @@
>  /*
>   * CNTHCTL_EL2 bits definitions
>   */
> -#define CNTHCTL_EL2_EL1PCEN_EN   (1 << 1)  /* Physical timer regs 
> accessible   */
> -#define CNTHCTL_EL2_EL1PCTEN_EN  (1 << 0)  /* Physical counter 
> accessible  */
> +#define CNTHCTL_EL2_EVNT_EN  BIT(2)   /* Enable the event stream   */
> +#define CNTHCTL_EL2_EVNT_I(val)  ((val) << 4) /* Event stream trigger 
> bits */
> +#define CNTHCTL_EL2_EL1PCEN_EN   (1 << 1) /* Physical timer regs 
> accessible */
> +#define CNTHCTL_EL2_EL1PCTEN_EN  (1 << 0) /* Physical counter 
> accessible   */
>  
>  /*
>   * HCR_EL2 bits definitions



Re: [PATCH 2/2] armv8: generic_timer: Use event stream for udelay

2024-04-24 Thread Quentin Schulz

Hi Peter, Andre,

On 4/24/24 12:29, Andre Przywara wrote:

On Tue, 23 Apr 2024 12:55:55 +0200
Quentin Schulz  wrote:


Hi Peter,

On 4/23/24 10:10, Peter Hoyes wrote:

From: Peter Hoyes 

Polling cntpct_el0 in a tight loop for delays is inefficient.
This is particularly apparent on Arm FVPs, which do not simulate
real time, meaning that a 1s sleep can take a couple of orders
of magnitude longer to execute in wall time.

If running at EL2 or above (where CNTHCTL_EL2 is available), enable
the cntpct_el0 event stream temporarily and use wfe to implement
the delay more efficiently. The event period is chosen as a
trade-off between efficiency and the fact that Arm FVPs do not
typically simulate real time.

This is only implemented for Armv8 boards, where an architectural
timer exists.

Some mach-socfpga AArch64 boards already override __udelay to make
it always inline, so guard the functionality with a new
ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.

Signed-off-by: Peter Hoyes 
---
   arch/arm/cpu/armv8/Kconfig |  8 
   arch/arm/cpu/armv8/generic_timer.c | 27 +++
   arch/arm/include/asm/system.h  |  6 --
   3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
index 9f0fb369f7..544c5e2d74 100644
--- a/arch/arm/cpu/armv8/Kconfig
+++ b/arch/arm/cpu/armv8/Kconfig
@@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
  Exception handling at all exception levels for External Abort and
  SError interrupt exception are taken in EL3.
   
+config ARMV8_UDELAY_EVENT_STREAM

+   bool "Use the event stream for udelay"
+   default y if !ARCH_SOCFPGA
+   help
+ Use the event stream provided by the AArch64 architectural timer for
+ delays. This is more efficient than the default polling
+ implementation.
+
   menuconfig ARMV8_CRYPTO
bool "ARM64 Accelerated Cryptographic Algorithms"
   
diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c

index 8f83372cbc..e18b5c8187 100644
--- a/arch/arm/cpu/armv8/generic_timer.c
+++ b/arch/arm/cpu/armv8/generic_timer.c
@@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
   
   	return val / get_tbclk();

   }
+
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
+void __udelay(unsigned long usec)
+{
+   u64 target = get_ticks() + usec_to_tick(usec);
+


This can theoretically overflow, do we have any guarantee this cannot
happen in real life, like... we would need U-Boot to be running for 100
years without being powered-down/reset or something like that? Can we
document this assumption? Does this make sense?


The Arm ARM guarantees a "Roll-over time of not less than 40 years."
(Armv8 ARM 0487K.a D12.1.2 "The system counter").
So that's not the 100 years you are asking for, but I guess still good
enough?



I guess it is, since it is also stored in a u64 and is reset to 0 upon 
start-up according to the ARM. I also assume nobody is going to add a 
udelay of years in their code :) (and if they do, they would probably 
figure out something's wrong before it reaches the final products :) ).


Thanks all for the pointers to reference manuals and current 
implementations.


Cheers,
Quentin


Re: [PATCH 2/2] armv8: generic_timer: Use event stream for udelay

2024-04-24 Thread Andre Przywara
On Tue, 23 Apr 2024 12:55:55 +0200
Quentin Schulz  wrote:

> Hi Peter,
> 
> On 4/23/24 10:10, Peter Hoyes wrote:
> > From: Peter Hoyes 
> > 
> > Polling cntpct_el0 in a tight loop for delays is inefficient.
> > This is particularly apparent on Arm FVPs, which do not simulate
> > real time, meaning that a 1s sleep can take a couple of orders
> > of magnitude longer to execute in wall time.
> > 
> > If running at EL2 or above (where CNTHCTL_EL2 is available), enable
> > the cntpct_el0 event stream temporarily and use wfe to implement
> > the delay more efficiently. The event period is chosen as a
> > trade-off between efficiency and the fact that Arm FVPs do not
> > typically simulate real time.
> > 
> > This is only implemented for Armv8 boards, where an architectural
> > timer exists.
> > 
> > Some mach-socfpga AArch64 boards already override __udelay to make
> > it always inline, so guard the functionality with a new
> > ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.
> > 
> > Signed-off-by: Peter Hoyes 
> > ---
> >   arch/arm/cpu/armv8/Kconfig |  8 
> >   arch/arm/cpu/armv8/generic_timer.c | 27 +++
> >   arch/arm/include/asm/system.h  |  6 --
> >   3 files changed, 39 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
> > index 9f0fb369f7..544c5e2d74 100644
> > --- a/arch/arm/cpu/armv8/Kconfig
> > +++ b/arch/arm/cpu/armv8/Kconfig
> > @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
> >   Exception handling at all exception levels for External Abort and
> >   SError interrupt exception are taken in EL3.
> >   
> > +config ARMV8_UDELAY_EVENT_STREAM
> > +   bool "Use the event stream for udelay"
> > +   default y if !ARCH_SOCFPGA
> > +   help
> > + Use the event stream provided by the AArch64 architectural timer for
> > + delays. This is more efficient than the default polling
> > + implementation.
> > +
> >   menuconfig ARMV8_CRYPTO
> > bool "ARM64 Accelerated Cryptographic Algorithms"
> >   
> > diff --git a/arch/arm/cpu/armv8/generic_timer.c 
> > b/arch/arm/cpu/armv8/generic_timer.c
> > index 8f83372cbc..e18b5c8187 100644
> > --- a/arch/arm/cpu/armv8/generic_timer.c
> > +++ b/arch/arm/cpu/armv8/generic_timer.c
> > @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
> >   
> > return val / get_tbclk();
> >   }
> > +
> > +#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
> > +void __udelay(unsigned long usec)
> > +{
> > +   u64 target = get_ticks() + usec_to_tick(usec);
> > +  
> 
> This can theoretically overflow, do we have any guarantee this cannot 
> happen in real life, like... we would need U-Boot to be running for 100 
> years without being powered-down/reset or something like that? Can we 
> document this assumption? Does this make sense?

The Arm ARM guarantees a "Roll-over time of not less than 40 years."
(Armv8 ARM 0487K.a D12.1.2 "The system counter").
So that's not the 100 years you are asking for, but I guess still good
enough?

Cheers,
Andre

> 
> > +   /* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so 
> > often */
> > +   if (current_el() >= 2) {
> > +   u32 cnthctl_val;
> > +   const u8 event_period = 0x7;
> > +
> > +   asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
> > +   asm volatile("msr cnthctl_el2, %0" : : "r"
> > +   (cnthctl_val | CNTHCTL_EL2_EVNT_EN | 
> > CNTHCTL_EL2_EVNT_I(event_period)));
> > +
> > +   while (get_ticks() + (1ULL << event_period) <= target)  
> 
> This could be an overflow as well.
> 
> > +   wfe();
> > +
> > +   /* Reset the event stream */
> > +   asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
> > +   }
> > +
> > +   /* Fall back to polling CNTPCT_EL0 */
> > +   while (get_ticks() <= target)  
> 
> get_ticks() could wrap around here maybe?
> 
> Cheers,
> Quentin



Re: [PATCH 2/2] armv8: generic_timer: Use event stream for udelay

2024-04-24 Thread Peter Hoyes

Hi Quentin,

On 4/23/24 11:55, Quentin Schulz wrote:

Hi Peter,

On 4/23/24 10:10, Peter Hoyes wrote:

From: Peter Hoyes 

Polling cntpct_el0 in a tight loop for delays is inefficient.
This is particularly apparent on Arm FVPs, which do not simulate
real time, meaning that a 1s sleep can take a couple of orders
of magnitude longer to execute in wall time.

If running at EL2 or above (where CNTHCTL_EL2 is available), enable
the cntpct_el0 event stream temporarily and use wfe to implement
the delay more efficiently. The event period is chosen as a
trade-off between efficiency and the fact that Arm FVPs do not
typically simulate real time.

This is only implemented for Armv8 boards, where an architectural
timer exists.

Some mach-socfpga AArch64 boards already override __udelay to make
it always inline, so guard the functionality with a new
ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.

Signed-off-by: Peter Hoyes 
---
  arch/arm/cpu/armv8/Kconfig |  8 
  arch/arm/cpu/armv8/generic_timer.c | 27 +++
  arch/arm/include/asm/system.h  |  6 --
  3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
index 9f0fb369f7..544c5e2d74 100644
--- a/arch/arm/cpu/armv8/Kconfig
+++ b/arch/arm/cpu/armv8/Kconfig
@@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
    Exception handling at all exception levels for External Abort 
and

    SError interrupt exception are taken in EL3.
  +config ARMV8_UDELAY_EVENT_STREAM
+    bool "Use the event stream for udelay"
+    default y if !ARCH_SOCFPGA
+    help
+  Use the event stream provided by the AArch64 architectural 
timer for

+  delays. This is more efficient than the default polling
+  implementation.
+
  menuconfig ARMV8_CRYPTO
  bool "ARM64 Accelerated Cryptographic Algorithms"
  diff --git a/arch/arm/cpu/armv8/generic_timer.c 
b/arch/arm/cpu/armv8/generic_timer.c

index 8f83372cbc..e18b5c8187 100644
--- a/arch/arm/cpu/armv8/generic_timer.c
+++ b/arch/arm/cpu/armv8/generic_timer.c
@@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
    return val / get_tbclk();
  }
+
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
+void __udelay(unsigned long usec)
+{
+    u64 target = get_ticks() + usec_to_tick(usec);
+


This can theoretically overflow, do we have any guarantee this cannot 
happen in real life, like... we would need U-Boot to be running for 
100 years without being powered-down/reset or something like that? Can 
we document this assumption? Does this make sense?


+    /* At EL2 or above, use the event stream to avoid polling 
CNTPCT_EL0 so often */

+    if (current_el() >= 2) {
+    u32 cnthctl_val;
+    const u8 event_period = 0x7;
+
+    asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
+    asm volatile("msr cnthctl_el2, %0" : : "r"
+    (cnthctl_val | CNTHCTL_EL2_EVNT_EN | 
CNTHCTL_EL2_EVNT_I(event_period)));

+
+    while (get_ticks() + (1ULL << event_period) <= target)


This could be an overflow as well.


+    wfe();
+
+    /* Reset the event stream */
+    asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
+    }
+
+    /* Fall back to polling CNTPCT_EL0 */
+    while (get_ticks() <= target)


get_ticks() could wrap around here maybe?
I don't see a problem here. It's using u64s and the maximum 
COUNTER_FREQUENCY in the U-Boot source is 200MHz, which gives enough 
ticks for about 3 millennia.


Additionally, the underlying "while" condition is the same as the 
existing weak __udelay implementation
(https://source.denx.de/u-boot/u-boot/-/blob/master/lib/time.c?ref_type=heads#L181), 
so this change doesn't affect the overflow limit.


Thanks,
Peter


Re: [PATCH 2/2] armv8: generic_timer: Use event stream for udelay

2024-04-23 Thread Quentin Schulz

Hi Peter,

On 4/23/24 10:10, Peter Hoyes wrote:

From: Peter Hoyes 

Polling cntpct_el0 in a tight loop for delays is inefficient.
This is particularly apparent on Arm FVPs, which do not simulate
real time, meaning that a 1s sleep can take a couple of orders
of magnitude longer to execute in wall time.

If running at EL2 or above (where CNTHCTL_EL2 is available), enable
the cntpct_el0 event stream temporarily and use wfe to implement
the delay more efficiently. The event period is chosen as a
trade-off between efficiency and the fact that Arm FVPs do not
typically simulate real time.

This is only implemented for Armv8 boards, where an architectural
timer exists.

Some mach-socfpga AArch64 boards already override __udelay to make
it always inline, so guard the functionality with a new
ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.

Signed-off-by: Peter Hoyes 
---
  arch/arm/cpu/armv8/Kconfig |  8 
  arch/arm/cpu/armv8/generic_timer.c | 27 +++
  arch/arm/include/asm/system.h  |  6 --
  3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
index 9f0fb369f7..544c5e2d74 100644
--- a/arch/arm/cpu/armv8/Kconfig
+++ b/arch/arm/cpu/armv8/Kconfig
@@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
  Exception handling at all exception levels for External Abort and
  SError interrupt exception are taken in EL3.
  
+config ARMV8_UDELAY_EVENT_STREAM

+   bool "Use the event stream for udelay"
+   default y if !ARCH_SOCFPGA
+   help
+ Use the event stream provided by the AArch64 architectural timer for
+ delays. This is more efficient than the default polling
+ implementation.
+
  menuconfig ARMV8_CRYPTO
bool "ARM64 Accelerated Cryptographic Algorithms"
  
diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c

index 8f83372cbc..e18b5c8187 100644
--- a/arch/arm/cpu/armv8/generic_timer.c
+++ b/arch/arm/cpu/armv8/generic_timer.c
@@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
  
  	return val / get_tbclk();

  }
+
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
+void __udelay(unsigned long usec)
+{
+   u64 target = get_ticks() + usec_to_tick(usec);
+


This can theoretically overflow, do we have any guarantee this cannot 
happen in real life, like... we would need U-Boot to be running for 100 
years without being powered-down/reset or something like that? Can we 
document this assumption? Does this make sense?



+   /* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so 
often */
+   if (current_el() >= 2) {
+   u32 cnthctl_val;
+   const u8 event_period = 0x7;
+
+   asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
+   asm volatile("msr cnthctl_el2, %0" : : "r"
+   (cnthctl_val | CNTHCTL_EL2_EVNT_EN | 
CNTHCTL_EL2_EVNT_I(event_period)));
+
+   while (get_ticks() + (1ULL << event_period) <= target)


This could be an overflow as well.


+   wfe();
+
+   /* Reset the event stream */
+   asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
+   }
+
+   /* Fall back to polling CNTPCT_EL0 */
+   while (get_ticks() <= target)


get_ticks() could wrap around here maybe?

Cheers,
Quentin


[PATCH 2/2] armv8: generic_timer: Use event stream for udelay

2024-04-23 Thread Peter Hoyes
From: Peter Hoyes 

Polling cntpct_el0 in a tight loop for delays is inefficient.
This is particularly apparent on Arm FVPs, which do not simulate
real time, meaning that a 1s sleep can take a couple of orders
of magnitude longer to execute in wall time.

If running at EL2 or above (where CNTHCTL_EL2 is available), enable
the cntpct_el0 event stream temporarily and use wfe to implement
the delay more efficiently. The event period is chosen as a
trade-off between efficiency and the fact that Arm FVPs do not
typically simulate real time.

This is only implemented for Armv8 boards, where an architectural
timer exists.

Some mach-socfpga AArch64 boards already override __udelay to make
it always inline, so guard the functionality with a new
ARMV8_UDELAY_EVENT_STREAM Kconfig, enabled by default.

Signed-off-by: Peter Hoyes 
---
 arch/arm/cpu/armv8/Kconfig |  8 
 arch/arm/cpu/armv8/generic_timer.c | 27 +++
 arch/arm/include/asm/system.h  |  6 --
 3 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
index 9f0fb369f7..544c5e2d74 100644
--- a/arch/arm/cpu/armv8/Kconfig
+++ b/arch/arm/cpu/armv8/Kconfig
@@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
  Exception handling at all exception levels for External Abort and
  SError interrupt exception are taken in EL3.
 
+config ARMV8_UDELAY_EVENT_STREAM
+   bool "Use the event stream for udelay"
+   default y if !ARCH_SOCFPGA
+   help
+ Use the event stream provided by the AArch64 architectural timer for
+ delays. This is more efficient than the default polling
+ implementation.
+
 menuconfig ARMV8_CRYPTO
bool "ARM64 Accelerated Cryptographic Algorithms"
 
diff --git a/arch/arm/cpu/armv8/generic_timer.c 
b/arch/arm/cpu/armv8/generic_timer.c
index 8f83372cbc..e18b5c8187 100644
--- a/arch/arm/cpu/armv8/generic_timer.c
+++ b/arch/arm/cpu/armv8/generic_timer.c
@@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
 
return val / get_tbclk();
 }
+
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
+void __udelay(unsigned long usec)
+{
+   u64 target = get_ticks() + usec_to_tick(usec);
+
+   /* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so 
often */
+   if (current_el() >= 2) {
+   u32 cnthctl_val;
+   const u8 event_period = 0x7;
+
+   asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
+   asm volatile("msr cnthctl_el2, %0" : : "r"
+   (cnthctl_val | CNTHCTL_EL2_EVNT_EN | 
CNTHCTL_EL2_EVNT_I(event_period)));
+
+   while (get_ticks() + (1ULL << event_period) <= target)
+   wfe();
+
+   /* Reset the event stream */
+   asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
+   }
+
+   /* Fall back to polling CNTPCT_EL0 */
+   while (get_ticks() <= target)
+   ;
+}
+#endif
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index 51123c2968..7e30cac32a 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -69,8 +69,10 @@
 /*
  * CNTHCTL_EL2 bits definitions
  */
-#define CNTHCTL_EL2_EL1PCEN_EN (1 << 1)  /* Physical timer regs accessible   */
-#define CNTHCTL_EL2_EL1PCTEN_EN(1 << 0)  /* Physical counter 
accessible  */
+#define CNTHCTL_EL2_EVNT_ENBIT(2)   /* Enable the event stream   */
+#define CNTHCTL_EL2_EVNT_I(val)((val) << 4) /* Event stream trigger 
bits */
+#define CNTHCTL_EL2_EL1PCEN_EN (1 << 1) /* Physical timer regs accessible 
*/
+#define CNTHCTL_EL2_EL1PCTEN_EN(1 << 0) /* Physical counter 
accessible   */
 
 /*
  * HCR_EL2 bits definitions
-- 
2.34.1