On 2/16/21 11:31 PM, David Gibson wrote:
On Thu, Feb 11, 2021 at 07:52:46PM -0300, Daniel Henrique Barboza wrote:
Handling errors in memory hotunplug in the pSeries machine is more complex
than any other device type, because there are all the complications that other
devices has, and more.
[...]
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index ecce8abf14..4bcded4a1a 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -3575,6 +3575,36 @@ static SpaprDimmState
*spapr_recover_pending_dimm_state(SpaprMachineState *ms,
return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm);
}
+void spapr_clear_pending_dimm_unplug_state(SpaprMachineState *spapr,
+ PCDIMMDevice *dimm)
+{
+ SpaprDimmState *ds = spapr_pending_dimm_unplugs_find(spapr, dimm);
+ SpaprDrc *drc;
+ uint32_t nr_lmbs;
+ uint64_t size, addr_start, addr;
+ int i;
+
+ if (ds) {
+ spapr_pending_dimm_unplugs_remove(spapr, ds);
+ }
Hrm... how would !ds arise? Could this just be an assert?
!ds would appear if we do not assert g_assert(drc->dev) down there, where you
suggested down below that a malicious/buggy code would trigger it, for example.
With that assert in place then this less likely to occcur.
I guess what I can do here is:
- remove the g_assert(drc->dev) from down below, since it's more related to the
logic of this function;
- here, check if drc->dev is NULL. Return doing nothing if that's the case (all
the
function relies on drc->dev being valid);
- if drc->dev is not NULL, then we can g_assert(ds) and proceed with the rest of
the function
This way we become a little more tolerant on drc->dev being NULL, but if
drc->dev
is valid we will expect a unplug dimm state to always exist and assert it.
Thanks,
DHB
+
+ size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort);
+ nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE;
+
+ addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP,
+ &error_abort);
+
+ addr = addr_start;
+ for (i = 0; i < nr_lmbs; i++) {
+ drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB,
+ addr / SPAPR_MEMORY_BLOCK_SIZE);
+ g_assert(drc);
+
+ drc->unplug_requested = false;
+ addr += SPAPR_MEMORY_BLOCK_SIZE;
+ }
+}
+
/* Callback to be called during DRC release. */
void spapr_lmb_release(DeviceState *dev)
{
diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
index c143bfb6d3..eae941233a 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -1230,6 +1230,20 @@ static void rtas_ibm_configure_connector(PowerPCCPU *cpu,
drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+ /*
+ * This indicates that the kernel is reconfiguring a LMB due to
+ * a failed hotunplug. Clear the pending unplug state for the whole
+ * DIMM.
+ */
+ if (spapr_drc_type(drc) == SPAPR_DR_CONNECTOR_TYPE_LMB &&
+ drc->unplug_requested) {
+
+ /* This really shouldn't happen in this point, but ... */
+ g_assert(drc->dev);
I'm a little worried that a buggy or malicious guest could trigger
this assert.
+
+ spapr_clear_pending_dimm_unplug_state(spapr, PC_DIMM(drc->dev));
+ }
+
if (!drc->fdt) {
void *fdt;
int fdt_size;
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index ccbeeca1de..5bcc8f3bb8 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -847,6 +847,8 @@ int spapr_hpt_shift_for_ramsize(uint64_t ramsize);
int spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, Error **errp);
void spapr_clear_pending_events(SpaprMachineState *spapr);
void spapr_clear_pending_hotplug_events(SpaprMachineState *spapr);
+void spapr_clear_pending_dimm_unplug_state(SpaprMachineState *spapr,
+ PCDIMMDevice *dimm);
int spapr_max_server_number(SpaprMachineState *spapr);
void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex,
uint64_t pte0, uint64_t pte1);