Re: [PATCH] irqchip/gic-v4.1: Optimize the wait for the completion of the analysis of the VPT

2020-11-27 Thread Marc Zyngier

Shenming,

Somehow this patch ended up in the wrong folder.
Apologies for the delay reviewing it.

On 2020-09-23 07:35, Shenming Lu wrote:

Right after a vPE is made resident, the code starts polling the
GICR_VPENDBASER.Dirty bit until it becomes 0, where the delay_us
is set to 10. But in our measurement, it takes only hundreds of
nanoseconds, or 1~2 microseconds, to finish parsing the VPT in most
cases. And we also measured the time from vcpu_load() (include it)
to __guest_enter() on Kunpeng 920. On average, it takes 2.55 
microseconds

(not first run && the VPT is empty). So 10 microseconds delay might
really hurt performance.

To avoid this, we can set the delay_us to 1, which is more appropriate
in this situation and universal. Besides, we can delay the execution
of its_wait_vpt_parse_complete() (call it from kvm_vgic_flush_hwstate()
corresponding to vPE resident), giving the GIC a chance to work in
parallel with the CPU on the entry path.

Signed-off-by: Shenming Lu <lushenming@huawei.com>
---
 arch/arm64/kvm/vgic/vgic-v4.c  | 18 ++
 arch/arm64/kvm/vgic/vgic.c |  2 ++
 drivers/irqchip/irq-gic-v3-its.c   | 14 +++---
 drivers/irqchip/irq-gic-v4.c   | 11 +++
 include/kvm/arm_vgic.h |  3 +++
 include/linux/irqchip/arm-gic-v4.h |  4 ++++
 6 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v4.c 
b/arch/arm64/kvm/vgic/vgic-v4.c

index b5fa73c9fd35..1d5d2d6894d3 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -353,6 +353,24 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
return err;
 }

+void vgic_v4_wait_vpt(struct kvm_vcpu *vcpu)


I'd like something a bit more abstract as a name.

vgic_v4_commit() seems more appropriate, and could be used for other
purposes.


+{
+   struct its_vpe *vpe;
+
+   if (kvm_vgic_global_state.type == VGIC_V2 ||


Why do you test for GICv2? Isn't the vgic_supports_direct_msis() test 
enough?
And the test should be moved to kvm_vgic_flush_hwstate(), as we already 
have

similar checks there.


!vgic_supports_direct_msis(vcpu->kvm))
+   return;
+
+   vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
+
+   if (vpe->vpt_ready)
+   return;
+
+   if (its_wait_vpt(vpe))
+   return;


How can that happen?


+
+   vpe->vpt_ready = true;


This is nasty. You need to explain what happens with this state (you are
trying not to access VPENDBASER across a shallow exit, as only a 
vcpu_put

will invalidate the GIC state). And something like vpe_ready is more
generic (we try not to have too much of the GICv4 gunk in the KVM code).


+}
+
 static struct vgic_its *vgic_get_its(struct kvm *kvm,
 struct kvm_kernel_irq_routing_entry 
*irq_entry)
 {
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index c3643b7f101b..ed810a80cda2 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -915,6 +915,8 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)

if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
+
+   vgic_v4_wait_vpt(vcpu);
 }

 void kvm_vgic_load(struct kvm_vcpu *vcpu)
diff --git a/drivers/irqchip/irq-gic-v3-its.c 
b/drivers/irqchip/irq-gic-v3-its.c

index 548de7538632..b7cbc9bcab9d 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3803,7 +3803,7 @@ static void its_wait_vpt_parse_complete(void)
 	WARN_ON_ONCE(readq_relaxed_poll_timeout_atomic(vlpi_base + 
GICR_VPENDBASER,

   val,
   !(val & 
GICR_VPENDBASER_Dirty),
-  10, 500));
+  1, 500));


This really should be in a separate patch.


 }

 static void its_vpe_schedule(struct its_vpe *vpe)
@@ -3837,7 +3837,7 @@ static void its_vpe_schedule(struct its_vpe *vpe)
val |= GICR_VPENDBASER_Valid;
gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);

-   its_wait_vpt_parse_complete();
+   vpe->vpt_ready = false;


This really belongs to the deschedule path, doesn't it? Given that
it can only be set from vgic_flush_hwstate(), it should be fairly
foolproof.


 }

 static void its_vpe_deschedule(struct its_vpe *vpe)
@@ -3881,6 +3881,10 @@ static int its_vpe_set_vcpu_affinity(struct
irq_data *d, void *vcpu_info)
its_vpe_schedule(vpe);
return 0;

+   case WAIT_VPT:


COMMIT_VPE seems a better name.


+   its_wait_vpt_parse_complete();
+   return 0;
+
case DESCHEDULE_VPE:
its_vpe_deschedule(vpe);
return 0;
@@ -4047,7 +4051,7 @@ static void its_vpe_4_1_schedule(struct its_vpe 
*vpe,


gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);

-   its_wait_vpt_parse_complete();
+   vpe->vpt_ready = false;

Re: [PATCH] irqchip/gic-v4.1: Optimize the wait for the completion of the analysis of the VPT

2020-11-27 Thread Shenming Lu
On 2020/11/28 3:35, Marc Zyngier wrote:
> Shenming,
> 
> Somehow this patch ended up in the wrong folder.
> Apologies for the delay reviewing it.
> 
> On 2020-09-23 07:35, Shenming Lu wrote:
>> Right after a vPE is made resident, the code starts polling the
>> GICR_VPENDBASER.Dirty bit until it becomes 0, where the delay_us
>> is set to 10. But in our measurement, it takes only hundreds of
>> nanoseconds, or 1~2 microseconds, to finish parsing the VPT in most
>> cases. And we also measured the time from vcpu_load() (include it)
>> to __guest_enter() on Kunpeng 920. On average, it takes 2.55 microseconds
>> (not first run && the VPT is empty). So 10 microseconds delay might
>> really hurt performance.
>>
>> To avoid this, we can set the delay_us to 1, which is more appropriate
>> in this situation and universal. Besides, we can delay the execution
>> of its_wait_vpt_parse_complete() (call it from kvm_vgic_flush_hwstate()
>> corresponding to vPE resident), giving the GIC a chance to work in
>> parallel with the CPU on the entry path.
>>
>> Signed-off-by: Shenming Lu <lushenming@huawei.com>
>> ---
>>  arch/arm64/kvm/vgic/vgic-v4.c  | 18 ++
>>  arch/arm64/kvm/vgic/vgic.c |  2 ++
>>  drivers/irqchip/irq-gic-v3-its.c   | 14 +++---
>>  drivers/irqchip/irq-gic-v4.c   | 11 +++
>>  include/kvm/arm_vgic.h |  3 +++
>>  include/linux/irqchip/arm-gic-v4.h |  4 ++++
>>  6 files changed, 49 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
>> index b5fa73c9fd35..1d5d2d6894d3 100644
>> --- a/arch/arm64/kvm/vgic/vgic-v4.c
>> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
>> @@ -353,6 +353,24 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
>>  return err;
>>  }
>>
>> +void vgic_v4_wait_vpt(struct kvm_vcpu *vcpu)
> 
> I'd like something a bit more abstract as a name.
> 
> vgic_v4_commit() seems more appropriate, and could be used for other
> purposes.

Yes, it looks more appropriate.

> 
>> +{
>> +    struct its_vpe *vpe;
>> +
>> +    if (kvm_vgic_global_state.type == VGIC_V2 ||
> 
> Why do you test for GICv2? Isn't the vgic_supports_direct_msis() test enough?
> And the test should be moved to kvm_vgic_flush_hwstate(), as we already have
> similar checks there.

Yes, the test for GICv2 is unnecessary; I will correct it.

> 
>> !vgic_supports_direct_msis(vcpu->kvm))
>> +    return;
>> +
>> +    vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
>> +
>> +    if (vpe->vpt_ready)
>> +    return;
>> +
>> +    if (its_wait_vpt(vpe))
>> +    return;
> 
> How can that happen?

Yes, it seems that its_wait_vpt() would always return 0.

> 
>> +
>> +    vpe->vpt_ready = true;
> 
> This is nasty. You need to explain what happens with this state (you are
> trying not to access VPENDBASER across a shallow exit, as only a vcpu_put

Ok, I will add a comment here.

> will invalidate the GIC state). And something like vpe_ready is more
> generic (we try not to have too much of the GICv4 gunk in the KVM code).

Yes, that's better.

> 
>> +}
>> +
>>  static struct vgic_its *vgic_get_its(struct kvm *kvm,
>>   struct kvm_kernel_irq_routing_entry *irq_entry)
>>  {
>> diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
>> index c3643b7f101b..ed810a80cda2 100644
>> --- a/arch/arm64/kvm/vgic/vgic.c
>> +++ b/arch/arm64/kvm/vgic/vgic.c
>> @@ -915,6 +915,8 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
>>
>>  if (can_access_vgic_from_kernel())
>>  vgic_restore_state(vcpu);
>> +
>> +    vgic_v4_wait_vpt(vcpu);
>>  }
>>
>>  void kvm_vgic_load(struct kvm_vcpu *vcpu)
>> diff --git a/drivers/irqchip/irq-gic-v3-its.c 
>> b/drivers/irqchip/irq-gic-v3-its.c
>> index 548de7538632..b7cbc9bcab9d 100644
>> --- a/drivers/irqchip/irq-gic-v3-its.c
>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>> @@ -3803,7 +3803,7 @@ static void its_wait_vpt_parse_complete(void)
>>  WARN_ON_ONCE(readq_relaxed_poll_timeout_atomic(vlpi_base + 
>> GICR_VPENDBASER,
>>     val,
>>     !(val & GICR_VPENDBASER_Dirty),
>> -   10, 500));
>> +   1, 500));
> 
> This really should be in a separate patch.

Ok, I will separate it.

> 
>>  }
>>
>>  static void its_vpe_schedule(struct its_vpe *vpe)
>> @@ -3837,7 +3837,7 @@ static void its_vpe_schedule(struct its_vpe *vpe)
>>  val |= GICR_VPENDBASER_Valid;
>>  gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);
>>
>> -    its_wait_vpt_parse_complete();
>> +    vpe->vpt_ready = false;
> 
> This really belongs to the deschedule path, doesn't it? Given that
> it can only be set from vgic_flush_hwstate(), it should be fairly
> foolproof.

Yes, that's better.

> 
>>  }
>>
>>  static void its_vpe_deschedule(struct its_vpe *vpe)
>> @@ -3881,6 +3881,10 @@ static int its_vpe_set_vcpu_affinity(struct
>> irq_data *d, void *vcpu_info)
>>  its_vpe_schedule(vpe);
>>  return 0;
>>
>> +    

Re: [PATCH] irqchip/gic-v4.1: Optimize the wait for the completion of the analysis of the VPT

2020-11-15 Thread Shenming Lu
Hi Marc,

Friendly ping, it has been some time since I sent this patch according to your last
advice...

Besides, recently we found that the mmio delay on GICv4.1 system is about 10 
times higher
than that on GICv4.0 system in kvm-unit-tests (the specific data is as 
follows). By the
way, HiSilicon GICv4.1 has already been implemented and will be released with 
our
next-generation server, which is almost the only implementation of GICv4.1 at 
present.

|   GICv4.1 emulator   |  GICv4.0 emulator
mmio_read_user (ns) |12811 |1598

After analysis, this is mainly caused by the 10 us delay in 
its_wait_vpt_parse_complete()
(the above difference is just about 10 us)...

What's your opinion about this?

Thanks,
Shenming

On 2020/9/23 14:35, Shenming Lu wrote:
> Right after a vPE is made resident, the code starts polling the
> GICR_VPENDBASER.Dirty bit until it becomes 0, where the delay_us
> is set to 10. But in our measurement, it takes only hundreds of
> nanoseconds, or 1~2 microseconds, to finish parsing the VPT in most
> cases. And we also measured the time from vcpu_load() (include it)
> to __guest_enter() on Kunpeng 920. On average, it takes 2.55 microseconds
> (not first run && the VPT is empty). So 10 microseconds delay might
> really hurt performance.
> 
> To avoid this, we can set the delay_us to 1, which is more appropriate
> in this situation and universal. Besides, we can delay the execution
> of its_wait_vpt_parse_complete() (call it from kvm_vgic_flush_hwstate()
> corresponding to vPE resident), giving the GIC a chance to work in
> parallel with the CPU on the entry path.
> 
> Signed-off-by: Shenming Lu <lushenming@huawei.com>
> ---
>  arch/arm64/kvm/vgic/vgic-v4.c  | 18 ++
>  arch/arm64/kvm/vgic/vgic.c |  2 ++
>  drivers/irqchip/irq-gic-v3-its.c   | 14 +++---
>  drivers/irqchip/irq-gic-v4.c   | 11 +++
>  include/kvm/arm_vgic.h |  3 +++
>  include/linux/irqchip/arm-gic-v4.h |  4 ++++
>  6 files changed, 49 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
> index b5fa73c9fd35..1d5d2d6894d3 100644
> --- a/arch/arm64/kvm/vgic/vgic-v4.c
> +++ b/arch/arm64/kvm/vgic/vgic-v4.c
> @@ -353,6 +353,24 @@ int vgic_v4_load(struct kvm_vcpu *vcpu)
>   return err;
>  }
>  
> +void vgic_v4_wait_vpt(struct kvm_vcpu *vcpu)
> +{
> + struct its_vpe *vpe;
> +
> + if (kvm_vgic_global_state.type == VGIC_V2 || 
> !vgic_supports_direct_msis(vcpu->kvm))
> + return;
> +
> + vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
> +
> + if (vpe->vpt_ready)
> + return;
> +
> + if (its_wait_vpt(vpe))
> + return;
> +
> + vpe->vpt_ready = true;
> +}
> +
>  static struct vgic_its *vgic_get_its(struct kvm *kvm,
>struct kvm_kernel_irq_routing_entry 
> *irq_entry)
>  {
> diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
> index c3643b7f101b..ed810a80cda2 100644
> --- a/arch/arm64/kvm/vgic/vgic.c
> +++ b/arch/arm64/kvm/vgic/vgic.c
> @@ -915,6 +915,8 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
>  
>   if (can_access_vgic_from_kernel())
>   vgic_restore_state(vcpu);
> +
> + vgic_v4_wait_vpt(vcpu);
>  }
>  
>  void kvm_vgic_load(struct kvm_vcpu *vcpu)
> diff --git a/drivers/irqchip/irq-gic-v3-its.c 
> b/drivers/irqchip/irq-gic-v3-its.c
> index 548de7538632..b7cbc9bcab9d 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -3803,7 +3803,7 @@ static void its_wait_vpt_parse_complete(void)
>   WARN_ON_ONCE(readq_relaxed_poll_timeout_atomic(vlpi_base + 
> GICR_VPENDBASER,
>  val,
>  !(val & 
> GICR_VPENDBASER_Dirty),
> -10, 500));
> +1, 500));
>  }
>  
>  static void its_vpe_schedule(struct its_vpe *vpe)
> @@ -3837,7 +3837,7 @@ static void its_vpe_schedule(struct its_vpe *vpe)
>   val |= GICR_VPENDBASER_Valid;
>   gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);
>  
> - its_wait_vpt_parse_complete();
> + vpe->vpt_ready = false;
>  }
>  
>  static void its_vpe_deschedule(struct its_vpe *vpe)
> @@ -3881,6 +3881,10 @@ static int its_vpe_set_vcpu_affinity(struct irq_data 
> *d, void *vcpu_info)
>   its_vpe_schedule(vpe);
>   return 0;
>  
> + case WAIT_VPT:
> + its_wait_vpt_parse_complete();
> + return 0;
> +
>   case DESCHEDULE_VPE:
>   its_vpe_deschedule(vpe);
>   return 0;
> @@ -4047,7 +4051,7 @@ static void its_vpe_4_1_schedule(struct its_vpe *vpe,
>  
>   gicr_write_vpendbaser(val, vlpi_base + GICR_VPENDBASER);
>  
> - its_wait_vpt_parse_complete();
> + vpe->vpt_ready = false;
>  }
>  
>  static void