Re: [PATCH 4/5] ppc/spapr: Don't kill the guest if a recovered FWNMI machine check delivery fails

2020-03-18 Thread Nicholas Piggin
Greg Kurz's on March 18, 2020 2:57 am:
> On Tue, 17 Mar 2020 15:02:14 +1000
> Nicholas Piggin  wrote:
> 
>> Try to be tolerant of errors if the machine check had been recovered
>> by the host.
>> 
>> Signed-off-by: Nicholas Piggin 
>> ---
> 
> Same comment as previous patch on multi-line error strings and
> warn_report() in the !recovered case.
> 
>>  hw/ppc/spapr_events.c | 25 ++---
>>  1 file changed, 18 insertions(+), 7 deletions(-)
>> 
>> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
>> index d35151eeb0..3f524cb0ca 100644
>> --- a/hw/ppc/spapr_events.c
>> +++ b/hw/ppc/spapr_events.c
>> @@ -807,13 +807,20 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, 
>> bool recovered)
>>  /* get rtas addr from fdt */
>>  rtas_addr = spapr_get_rtas_addr();
>>  if (!rtas_addr) {
>> -warn_report("FWNMI: Unable to deliver machine check to guest: "
>> -"rtas_addr not found.");
>> -qemu_system_guest_panicked(NULL);
>> +if (!recovered) {
>> +warn_report("FWNMI: Unable to deliver machine check to guest: "
>> +"rtas_addr not found.");
>> +qemu_system_guest_panicked(NULL);
>> +} else {
>> +warn_report("FWNMI: Unable to deliver machine check to guest: "
>> +"rtas_addr not found. Machine check recovered.");
>> +}
>>  g_free(ext_elog);
>>  return;
>>  }
>>  
>> +spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
>> +
> 
> I don't understand this change.

If we bail out without delivering the interrupt, we can't take the
interlock otherwise the guest can never release it.

Thanks,
Nick



Re: [PATCH 4/5] ppc/spapr: Don't kill the guest if a recovered FWNMI machine check delivery fails

2020-03-17 Thread Greg Kurz
On Tue, 17 Mar 2020 15:02:14 +1000
Nicholas Piggin  wrote:

> Try to be tolerant of errors if the machine check had been recovered
> by the host.
> 
> Signed-off-by: Nicholas Piggin 
> ---

Same comment as previous patch on multi-line error strings and
warn_report() in the !recovered case.

>  hw/ppc/spapr_events.c | 25 ++---
>  1 file changed, 18 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
> index d35151eeb0..3f524cb0ca 100644
> --- a/hw/ppc/spapr_events.c
> +++ b/hw/ppc/spapr_events.c
> @@ -807,13 +807,20 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, 
> bool recovered)
>  /* get rtas addr from fdt */
>  rtas_addr = spapr_get_rtas_addr();
>  if (!rtas_addr) {
> -warn_report("FWNMI: Unable to deliver machine check to guest: "
> -"rtas_addr not found.");
> -qemu_system_guest_panicked(NULL);
> +if (!recovered) {
> +warn_report("FWNMI: Unable to deliver machine check to guest: "
> +"rtas_addr not found.");
> +qemu_system_guest_panicked(NULL);
> +} else {
> +warn_report("FWNMI: Unable to deliver machine check to guest: "
> +"rtas_addr not found. Machine check recovered.");
> +}
>  g_free(ext_elog);
>  return;
>  }
>  
> +spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
> +

I don't understand this change.

>  stq_be_phys(&address_space_memory, rtas_addr + RTAS_ERROR_LOG_OFFSET,
>  env->gpr[3]);
>  cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET +
> @@ -850,9 +857,14 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>   * that CPU called "ibm,nmi-interlock")
>   */
>  if (spapr->fwnmi_machine_check_interlock == cpu->vcpu_id) {
> -warn_report("FWNMI: Unable to deliver machine check to guest: "
> -"nested machine check.");
> -qemu_system_guest_panicked(NULL);
> +if (!recovered) {
> +warn_report("FWNMI: Unable to deliver machine check to 
> guest: "
> +"nested machine check.");
> +qemu_system_guest_panicked(NULL);
> +} else {
> +warn_report("FWNMI: Unable to deliver machine check to 
> guest: "
> +"nested machine check. Machine check 
> recovered.");
> +}
>  return;
>  }
>  qemu_cond_wait_iothread(&spapr->fwnmi_machine_check_interlock_cond);
> @@ -880,7 +892,6 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
>  warn_report("Received a fwnmi while migration was in progress");
>  }
>  
> -spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
>  spapr_mce_dispatch_elog(cpu, recovered);
>  }
>  




[PATCH 4/5] ppc/spapr: Don't kill the guest if a recovered FWNMI machine check delivery fails

2020-03-16 Thread Nicholas Piggin
Try to be tolerant of errors if the machine check had been recovered
by the host.

Signed-off-by: Nicholas Piggin 
---
 hw/ppc/spapr_events.c | 25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index d35151eeb0..3f524cb0ca 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -807,13 +807,20 @@ static void spapr_mce_dispatch_elog(PowerPCCPU *cpu, bool 
recovered)
 /* get rtas addr from fdt */
 rtas_addr = spapr_get_rtas_addr();
 if (!rtas_addr) {
-warn_report("FWNMI: Unable to deliver machine check to guest: "
-"rtas_addr not found.");
-qemu_system_guest_panicked(NULL);
+if (!recovered) {
+warn_report("FWNMI: Unable to deliver machine check to guest: "
+"rtas_addr not found.");
+qemu_system_guest_panicked(NULL);
+} else {
+warn_report("FWNMI: Unable to deliver machine check to guest: "
+"rtas_addr not found. Machine check recovered.");
+}
 g_free(ext_elog);
 return;
 }
 
+spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
+
 stq_be_phys(&address_space_memory, rtas_addr + RTAS_ERROR_LOG_OFFSET,
 env->gpr[3]);
 cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET +
@@ -850,9 +857,14 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
  * that CPU called "ibm,nmi-interlock")
  */
 if (spapr->fwnmi_machine_check_interlock == cpu->vcpu_id) {
-warn_report("FWNMI: Unable to deliver machine check to guest: "
-"nested machine check.");
-qemu_system_guest_panicked(NULL);
+if (!recovered) {
+warn_report("FWNMI: Unable to deliver machine check to guest: "
+"nested machine check.");
+qemu_system_guest_panicked(NULL);
+} else {
+warn_report("FWNMI: Unable to deliver machine check to guest: "
+"nested machine check. Machine check recovered.");
+}
 return;
 }
 qemu_cond_wait_iothread(&spapr->fwnmi_machine_check_interlock_cond);
@@ -880,7 +892,6 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
 warn_report("Received a fwnmi while migration was in progress");
 }
 
-spapr->fwnmi_machine_check_interlock = cpu->vcpu_id;
 spapr_mce_dispatch_elog(cpu, recovered);
 }
 
-- 
2.23.0