From 382907b10077ed4cff48d9afe219a023887c0522 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Thu, 11 Feb 2021 19:52:40 -0300 Subject: spapr_drc.c: do not call spapr_drc_detach() in drc_isolate_logical() drc_isolate_logical() is used to move the DRC from the "Configured" to the "Available" state, erroring out if the DRC is in the unexpected "Unisolate" state and doing nothing (with RTAS_OUT_SUCCESS) if the DRC is already in "Available" or in "Unusable" state. When moving from "Configured" to "Available", the DRC is moved to the LOGICAL_AVAILABLE state, a drc->unplug_requested check is done and, if true, spapr_drc_detach() is called. What spapr_drc_detach() does then is: - set drc->unplug_requested to true. In fact, this is the only place where unplug_request is set to true; - does nothing else if drc->state != drck->empty_state. If the DRC state is equal to drck->empty_state, spapr_drc_release() is called. For logical DRCs, drck->empty_state = LOGICAL_UNUSABLE. In short, calling spapr_drc_detach() in drc_isolate_logical() does nothing. It'll set unplug_request to true again ('again' since it was already true - otherwise the function wouldn't be called), and will return without calling spapr_drc_release() because the DRC is not in LOGICAL_UNUSABLE, since drc_isolate_logical() just moved it to LOGICAL_AVAILABLE. The only place where the logical DRC is released is when called from drc_set_unusable(), when it is moved to the "Unusable" state. As it should, according to PAPR. Even though calling spapr_drc_detach() in drc_isolate_logical() is benign, removing it will avoid further thought about the matter. So let's go ahead and do that. As a note, this logic was introduced in commit bbf5c878ab76. Since then, the DRC handling code was refactored and enhanced, and PAPR itself went through some changes in the DRC area as well. It is expected that some assumptions we had back then are now deprecated. Signed-off-by: Daniel Henrique Barboza Message-Id: <20210211225246.17315-2-danielhb413@gmail.com> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr_drc.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'hw/ppc') diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index 8571d5bafe..84bd3c881f 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -132,19 +132,6 @@ static uint32_t drc_isolate_logical(SpaprDrc *drc) drc->state = SPAPR_DRC_STATE_LOGICAL_AVAILABLE; - /* if we're awaiting release, but still in an unconfigured state, - * it's likely the guest is still in the process of configuring - * the device and is transitioning the devices to an ISOLATED - * state as a part of that process. so we only complete the - * removal when this transition happens for a device in a - * configured state, as suggested by the state diagram from PAPR+ - * 2.7, 13.4 - */ - if (drc->unplug_requested) { - uint32_t drc_index = spapr_drc_index(drc); - trace_spapr_drc_set_isolation_state_finalizing(drc_index); - spapr_drc_detach(drc); - } return RTAS_OUT_SUCCESS; } -- cgit v1.2.3-55-g7522 From 66d10d32ace18da3eacf2d8ee72c6076e87f9e72 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 22 Feb 2021 16:45:27 -0300 Subject: spapr_drc.c: use spapr_drc_release() in isolate_physical/set_unusable When moving a physical DRC to "Available", drc_isolate_physical() will move the DRC state to STATE_PHYSICAL_POWERON and, if the DRC is marked for unplug, call spapr_drc_detach(). For physical DRCs, drck->empty_state is STATE_PHYSICAL_POWERON, meaning that we're sure that spapr_drc_detach() will end up calling spapr_drc_release() in the end. Likewise, for logical DRCs, drc_set_unusable will move the DRC to "Unusable" state, setting drc->state to STATE_LOGICAL_UNUSABLE, which is the drck->empty_state for logical DRCs. spapr_drc_detach() will call spapr_drc_release() in this case as well. In both scenarios, spapr_drc_detach() is being used as a spapr_drc_release(), wrapper, where we also set unplug_requested (which is already true, otherwise spapr_drc_detach() wouldn't be called in the first place) and check if drc->state == drck->empty_state, which we also know it's guaranteed to be true because we just set it. Just use spapr_drc_release() in these functions to be clear of our intentions in both these functions. Reviewed-by: Greg Kurz Reviewed-by: David Gibson Signed-off-by: Daniel Henrique Barboza Message-Id: <20210222194531.62717-2-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_drc.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'hw/ppc') diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index 84bd3c881f..555a25517d 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -50,6 +50,20 @@ uint32_t spapr_drc_index(SpaprDrc *drc) | (drc->id & DRC_INDEX_ID_MASK); } +static void spapr_drc_release(SpaprDrc *drc) +{ + SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); + + drck->release(drc->dev); + + drc->unplug_requested = false; + g_free(drc->fdt); + drc->fdt = NULL; + drc->fdt_start_offset = 0; + object_property_del(OBJECT(drc), "device"); + drc->dev = NULL; +} + static uint32_t drc_isolate_physical(SpaprDrc *drc) { switch (drc->state) { @@ -68,7 +82,7 @@ static uint32_t drc_isolate_physical(SpaprDrc *drc) if (drc->unplug_requested) { uint32_t drc_index = spapr_drc_index(drc); trace_spapr_drc_set_isolation_state_finalizing(drc_index); - spapr_drc_detach(drc); + spapr_drc_release(drc); } return RTAS_OUT_SUCCESS; @@ -209,7 +223,7 @@ static uint32_t drc_set_unusable(SpaprDrc *drc) if (drc->unplug_requested) { uint32_t drc_index = spapr_drc_index(drc); trace_spapr_drc_set_allocation_state_finalizing(drc_index); - spapr_drc_detach(drc); + spapr_drc_release(drc); } return RTAS_OUT_SUCCESS; @@ -372,20 +386,6 @@ void spapr_drc_attach(SpaprDrc *drc, DeviceState *d) NULL, 0); } -static void spapr_drc_release(SpaprDrc *drc) -{ - SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); - - drck->release(drc->dev); - - drc->unplug_requested = false; - g_free(drc->fdt); - drc->fdt = NULL; - drc->fdt_start_offset = 0; - object_property_del(OBJECT(drc), "device"); - drc->dev = NULL; -} - void spapr_drc_detach(SpaprDrc *drc) { SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); -- cgit v1.2.3-55-g7522 From a03509cd2baf48b1e947d9eb203ccb95bd99e5fb Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 22 Feb 2021 16:45:28 -0300 Subject: spapr: rename spapr_drc_detach() to spapr_drc_unplug_request() spapr_drc_detach() is not the best name for what the function does. The function does not detach the DRC, it makes an uncommited attempt to do it. It'll mark the DRC as pending unplug, via the 'unplug_request' flag, and only if the DRC state is drck->empty_state it will detach the DRC, via spapr_drc_release(). This is a contrast with its pair spapr_drc_attach(), where the function is indeed creating the DRC QOM object. If you know what spapr_drc_attach() does, you can be misled into thinking that spapr_drc_detach() is removing the DRC from QEMU internal state, which isn't true. The current role of this function is better described as a request for detach, since there's no guarantee that we're going to detach the DRC in the end. Rename the function to spapr_drc_unplug_request to reflect what is is doing. The initial idea was to change the name to spapr_drc_detach_request(), and later on change the unplug_request flag to detach_request. However, unplug_request is a migratable boolean for a long time now and renaming it is not worth the trouble. spapr_drc_unplug_request() setting drc->unplug_request is more natural than spapr_drc_detach_request setting drc->unplug_request. Reviewed-by: Greg Kurz Reviewed-by: David Gibson Signed-off-by: Daniel Henrique Barboza Message-Id: <20210222194531.62717-3-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 6 +++--- hw/ppc/spapr_drc.c | 4 ++-- hw/ppc/spapr_pci.c | 4 ++-- hw/ppc/trace-events | 2 +- include/hw/ppc/spapr_drc.h | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'hw/ppc') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 85fe65f894..b066df68cb 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3654,7 +3654,7 @@ static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev, addr / SPAPR_MEMORY_BLOCK_SIZE); g_assert(drc); - spapr_drc_detach(drc); + spapr_drc_unplug_request(drc); addr += SPAPR_MEMORY_BLOCK_SIZE; } @@ -3722,7 +3722,7 @@ void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, g_assert(drc); if (!spapr_drc_unplug_requested(drc)) { - spapr_drc_detach(drc); + spapr_drc_unplug_request(drc); spapr_hotplug_req_remove_by_index(drc); } } @@ -3985,7 +3985,7 @@ static void spapr_phb_unplug_request(HotplugHandler *hotplug_dev, assert(drc); if (!spapr_drc_unplug_requested(drc)) { - spapr_drc_detach(drc); + spapr_drc_unplug_request(drc); spapr_hotplug_req_remove_by_index(drc); } } diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index 555a25517d..67041fb212 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -386,11 +386,11 @@ void spapr_drc_attach(SpaprDrc *drc, DeviceState *d) NULL, 0); } -void spapr_drc_detach(SpaprDrc *drc) +void spapr_drc_unplug_request(SpaprDrc *drc) { SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); - trace_spapr_drc_detach(spapr_drc_index(drc)); + trace_spapr_drc_unplug_request(spapr_drc_index(drc)); g_assert(drc->dev); diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index f1c7479816..b00e9609ae 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -1723,12 +1723,12 @@ static void spapr_pci_unplug_request(HotplugHandler *plug_handler, * functions, even if their unplug weren't requested * beforehand. */ - spapr_drc_detach(func_drc); + spapr_drc_unplug_request(func_drc); } } } - spapr_drc_detach(drc); + spapr_drc_unplug_request(drc); /* if this isn't func 0, defer unplug event. otherwise signal removal * for all present functions diff --git a/hw/ppc/trace-events b/hw/ppc/trace-events index 1e91984526..b4bbfbb013 100644 --- a/hw/ppc/trace-events +++ b/hw/ppc/trace-events @@ -50,7 +50,7 @@ spapr_drc_set_allocation_state(uint32_t index, int state) "drc: 0x%"PRIx32", sta spapr_drc_set_allocation_state_finalizing(uint32_t index) "drc: 0x%"PRIx32 spapr_drc_set_configured(uint32_t index) "drc: 0x%"PRIx32 spapr_drc_attach(uint32_t index) "drc: 0x%"PRIx32 -spapr_drc_detach(uint32_t index) "drc: 0x%"PRIx32 +spapr_drc_unplug_request(uint32_t index) "drc: 0x%"PRIx32 spapr_drc_awaiting_quiesce(uint32_t index) "drc: 0x%"PRIx32 spapr_drc_reset(uint32_t index) "drc: 0x%"PRIx32 spapr_drc_realize(uint32_t index) "drc: 0x%"PRIx32 diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h index 8982927d5c..02a63b3666 100644 --- a/include/hw/ppc/spapr_drc.h +++ b/include/hw/ppc/spapr_drc.h @@ -243,7 +243,7 @@ int spapr_dt_drc(void *fdt, int offset, Object *owner, uint32_t drc_type_mask); * beforehand (eg. check drc->dev at pre-plug). */ void spapr_drc_attach(SpaprDrc *drc, DeviceState *d); -void spapr_drc_detach(SpaprDrc *drc); +void spapr_drc_unplug_request(SpaprDrc *drc); /* * Reset all DRCs, causing pending hot-plug/unplug requests to complete. -- cgit v1.2.3-55-g7522 From 51254ffb320183a4636635840c23ee0e3a1efffa Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 22 Feb 2021 16:45:29 -0300 Subject: spapr_drc.c: introduce unplug_timeout_timer The LoPAR spec provides no way for the guest kernel to report failure of hotplug/hotunplug events. This wouldn't be bad if those operations were granted to always succeed, but that's far for the reality. What ends up happening is that, in the case of a failed hotunplug, regardless of whether it was a QEMU error or a guest misbehavior, the pSeries machine is retaining the unplug state of the device in the running guest. This state is cleanup in machine reset, where it is assumed that this state represents a device that is pending unplug, and the device is hotunpluged from the board. Until the reset occurs, any hotunplug operation of the same device is forbid because there is a pending unplug state. This behavior has at least one undesirable side effect. A long standing pending unplug state is, more often than not, the result of a hotunplug error. The user had to dealt with it, since retrying to unplug the device is noy allowed, and then in the machine reset we're removing the device from the guest. This means that we're failing the user twice - failed to hotunplug when asked, then hotunplugged without notice. Solutions to this problem range between trying to predict when the hotunplug will fail and forbid the operation from the QEMU layer, from opening up the IRQ queue to allow for multiple hotunplug attempts, from telling the users to 'reboot the machine if something goes wrong'. The first solution is flawed because we can't fully predict guest behavior from QEMU, the second solution is a trial and error remediation that counts on a hope that the unplug will eventually succeed, and the third is ... well. This patch introduces a crude, but effective solution to hotunplug errors in the pSeries machine. For each unplug done, we'll timeout after some time. If a certain amount of time passes, we'll cleanup the hotunplug state from the machine. During the timeout period, any unplug operations in the same device will still be blocked. After that, we'll assume that the guest failed the operation, and allow the user to try again. If the timeout is too short we'll prevent legitimate hotunplug situations to occur, so we'll need to overestimate the regular time an unplug operation takes to succeed to account that. The true solution for the hotunplug errors in the pSeries machines is a PAPR change to allow for the guest to warn the platform about it. For now, the work done in this timeout design can be used for the new PAPR 'abort hcall' in the future, given that for both cases we'll need code to cleanup the existing unplug states of the DRCs. At this moment we're adding the basic wiring of the timer into the DRC. Next patch will use the timer to timeout failed CPU hotunplugs. Signed-off-by: Daniel Henrique Barboza Message-Id: <20210222194531.62717-4-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr_drc.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/hw/ppc/spapr_drc.h | 4 ++++ 2 files changed, 44 insertions(+) (limited to 'hw/ppc') diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index 67041fb212..27adbc5c30 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -57,6 +57,8 @@ static void spapr_drc_release(SpaprDrc *drc) drck->release(drc->dev); drc->unplug_requested = false; + timer_del(drc->unplug_timeout_timer); + g_free(drc->fdt); drc->fdt = NULL; drc->fdt_start_offset = 0; @@ -370,6 +372,17 @@ static void prop_get_fdt(Object *obj, Visitor *v, const char *name, } while (fdt_depth != 0); } +static void spapr_drc_start_unplug_timeout_timer(SpaprDrc *drc) +{ + SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); + + if (drck->unplug_timeout_seconds != 0) { + timer_mod(drc->unplug_timeout_timer, + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + + drck->unplug_timeout_seconds * 1000); + } +} + void spapr_drc_attach(SpaprDrc *drc, DeviceState *d) { trace_spapr_drc_attach(spapr_drc_index(drc)); @@ -475,11 +488,23 @@ static bool spapr_drc_needed(void *opaque) spapr_drc_unplug_requested(drc); } +static int spapr_drc_post_load(void *opaque, int version_id) +{ + SpaprDrc *drc = opaque; + + if (drc->unplug_requested) { + spapr_drc_start_unplug_timeout_timer(drc); + } + + return 0; +} + static const VMStateDescription vmstate_spapr_drc = { .name = "spapr_drc", .version_id = 1, .minimum_version_id = 1, .needed = spapr_drc_needed, + .post_load = spapr_drc_post_load, .fields = (VMStateField []) { VMSTATE_UINT32(state, SpaprDrc), VMSTATE_END_OF_LIST() @@ -490,6 +515,15 @@ static const VMStateDescription vmstate_spapr_drc = { } }; +static void drc_unplug_timeout_cb(void *opaque) +{ + SpaprDrc *drc = opaque; + + if (drc->unplug_requested) { + drc->unplug_requested = false; + } +} + static void drc_realize(DeviceState *d, Error **errp) { SpaprDrc *drc = SPAPR_DR_CONNECTOR(d); @@ -512,6 +546,11 @@ static void drc_realize(DeviceState *d, Error **errp) object_property_add_alias(root_container, link_name, drc->owner, child_name); g_free(link_name); + + drc->unplug_timeout_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL, + drc_unplug_timeout_cb, + drc); + vmstate_register(VMSTATE_IF(drc), spapr_drc_index(drc), &vmstate_spapr_drc, drc); trace_spapr_drc_realize_complete(spapr_drc_index(drc)); @@ -529,6 +568,7 @@ static void drc_unrealize(DeviceState *d) name = g_strdup_printf("%x", spapr_drc_index(drc)); object_property_del(root_container, name); g_free(name); + timer_free(drc->unplug_timeout_timer); } SpaprDrc *spapr_dr_connector_new(Object *owner, const char *type, diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h index 02a63b3666..38ec4c8091 100644 --- a/include/hw/ppc/spapr_drc.h +++ b/include/hw/ppc/spapr_drc.h @@ -187,6 +187,8 @@ typedef struct SpaprDrc { bool unplug_requested; void *fdt; int fdt_start_offset; + + QEMUTimer *unplug_timeout_timer; } SpaprDrc; struct SpaprMachineState; @@ -209,6 +211,8 @@ typedef struct SpaprDrcClass { int (*dt_populate)(SpaprDrc *drc, struct SpaprMachineState *spapr, void *fdt, int *fdt_start_offset, Error **errp); + + int unplug_timeout_seconds; } SpaprDrcClass; typedef struct SpaprDrcPhysical { -- cgit v1.2.3-55-g7522 From d1c2e3ce3d5a5424651967bce1cf1f4caa0c6d91 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 22 Feb 2021 16:45:30 -0300 Subject: spapr_drc.c: add hotunplug timeout for CPUs There is a reliable way to make a CPU hotunplug fail in the pseries machine. Hotplug a CPU A, then offline all other CPUs inside the guest but A. When trying to hotunplug A the guest kernel will refuse to do it, because A is now the last online CPU of the guest. PAPR has no 'error callback' in this situation to report back to the platform, so the guest kernel will deny the unplug in silent and QEMU will never know what happened. The unplug pending state of A will remain until the guest is shutdown or rebooted. Previous attempts of fixing it (see [1] and [2]) were aimed at trying to mitigate the effects of the problem. In [1] we were trying to guess which guest CPUs were online to forbid hotunplug of the last online CPU in the QEMU layer, avoiding the scenario described above because QEMU is now failing in behalf of the guest. This is not robust because the last online CPU of the guest can change while we're in the middle of the unplug process, and our initial assumptions are now invalid. In [2] we were accepting that our unplug process is uncertain and the user should be allowed to spam the IRQ hotunplug queue of the guest in case the CPU hotunplug fails. This patch presents another alternative, using the timeout infrastructure introduced in the previous patch. CPU hotunplugs in the pSeries machine will now timeout after 15 seconds. This is a long time for a single CPU unplug to occur, regardless of guest load - although the user is *strongly* encouraged to *not* hotunplug devices from a guest under high load - and we can be sure that something went wrong if it takes longer than that for the guest to release the CPU (the same can't be said about memory hotunplug - more on that in the next patch). Timing out the unplug operation will reset the unplug state of the CPU and allow the user to try it again, regardless of the error situation that prevented the hotunplug to occur. Of all the not so pretty fixes/mitigations for CPU hotunplug errors in pSeries, timing out the operation is an admission that we have no control in the process, and must assume the worst case if the operation doesn't succeed in a sensible time frame. [1] https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg03353.html [2] https://lists.gnu.org/archive/html/qemu-devel/2021-01/msg04400.html Reported-by: Xujun Ma Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1911414 Reviewed-by: David Gibson Signed-off-by: Daniel Henrique Barboza Message-Id: <20210222194531.62717-5-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 4 ++++ hw/ppc/spapr_drc.c | 13 +++++++++++++ include/hw/ppc/spapr_drc.h | 1 + 3 files changed, 18 insertions(+) (limited to 'hw/ppc') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index b066df68cb..ecce8abf14 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3724,6 +3724,10 @@ void spapr_core_unplug_request(HotplugHandler *hotplug_dev, DeviceState *dev, if (!spapr_drc_unplug_requested(drc)) { spapr_drc_unplug_request(drc); spapr_hotplug_req_remove_by_index(drc); + } else { + error_setg(errp, "core-id %d unplug is still pending, %d seconds " + "timeout remaining", + cc->core_id, spapr_drc_unplug_timeout_remaining_sec(drc)); } } diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index 27adbc5c30..fd2e45640f 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -409,6 +409,8 @@ void spapr_drc_unplug_request(SpaprDrc *drc) drc->unplug_requested = true; + spapr_drc_start_unplug_timeout_timer(drc); + if (drc->state != drck->empty_state) { trace_spapr_drc_awaiting_quiesce(spapr_drc_index(drc)); return; @@ -417,6 +419,16 @@ void spapr_drc_unplug_request(SpaprDrc *drc) spapr_drc_release(drc); } +int spapr_drc_unplug_timeout_remaining_sec(SpaprDrc *drc) +{ + if (drc->unplug_requested && timer_pending(drc->unplug_timeout_timer)) { + return (qemu_timeout_ns_to_ms(drc->unplug_timeout_timer->expire_time) - + qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL)) / 1000; + } + + return 0; +} + bool spapr_drc_reset(SpaprDrc *drc) { SpaprDrcClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); @@ -710,6 +722,7 @@ static void spapr_drc_cpu_class_init(ObjectClass *k, void *data) drck->drc_name_prefix = "CPU "; drck->release = spapr_core_release; drck->dt_populate = spapr_core_dt_populate; + drck->unplug_timeout_seconds = 15; } static void spapr_drc_pci_class_init(ObjectClass *k, void *data) diff --git a/include/hw/ppc/spapr_drc.h b/include/hw/ppc/spapr_drc.h index 38ec4c8091..26599c385a 100644 --- a/include/hw/ppc/spapr_drc.h +++ b/include/hw/ppc/spapr_drc.h @@ -248,6 +248,7 @@ int spapr_dt_drc(void *fdt, int offset, Object *owner, uint32_t drc_type_mask); */ void spapr_drc_attach(SpaprDrc *drc, DeviceState *d); void spapr_drc_unplug_request(SpaprDrc *drc); +int spapr_drc_unplug_timeout_remaining_sec(SpaprDrc *drc); /* * Reset all DRCs, causing pending hot-plug/unplug requests to complete. -- cgit v1.2.3-55-g7522 From fe1831eff8a41e96044fe98a6b0e232daa22ef83 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 22 Feb 2021 16:45:31 -0300 Subject: spapr_drc.c: use DRC reconfiguration to cleanup DIMM unplug state Handling errors in memory hotunplug in the pSeries machine is more complex than any other device type, because there are all the complications that other devices has, and more. For instance, determining a timeout for a DIMM hotunplug must consider if it's a Hash-MMU or a Radix-MMU guest, because Hash guests takes longer to hotunplug DIMMs. The size of the DIMM is also a factor, given that longer DIMMs naturally takes longer to be hotunplugged from the kernel. And there's also the guest memory usage to be considered: if there's a process that is consuming memory that would be lost by the DIMM unplug, the kernel will postpone the unplug process until the process finishes, and then initiate the regular hotunplug process. The first two considerations are manageable, but the last one is a deal breaker. There is no sane way for the pSeries machine to determine the memory load in the guest when attempting a DIMM hotunplug - and even if there was a way, the guest can start using all the RAM in the middle of the unplug process and invalidate our previous assumptions - and in result we can't even begin to calculate a timeout for the operation. This means that we can't implement a viable timeout mechanism for memory unplug in pSeries. Going back to why we would consider an unplug timeout, the reason is that we can't know if the kernel is giving up the unplug. Turns out that, sometimes, we can. Consider a failed memory hotunplug attempt where the kernel will error out with the following message: 'pseries-hotplug-mem: Memory indexed-count-remove failed, adding any removed LMBs' This happens when there is a LMB that the kernel gave up in removing, and the LMBs previously marked for removal are now being added back. This happens in the pseries kernel in [1], dlpar_memory_remove_by_ic() into dlpar_add_lmb(), and after that update_lmb_associativity_index(). In this function, the kernel is configuring the LMB DRC connector again. Note that this is a valid usage in LOPAR, as stated in section "ibm,configure-connector RTAS Call": 'A subsequent sequence of calls to ibm,configure-connector with the same entry from the “ibm,drc-indexes” or “ibm,drc-info” property will restart the configuration of devices which were not completely configured.' We can use this kernel behavior in our favor. If a DRC connector reconfiguration for a LMB that we marked as unplug pending happens, this indicates that the kernel changed its mind about the unplug and is reasserting that it will keep using all the LMBs of the DIMM. In this case, it's safe to assume that the whole DIMM device unplug was cancelled. This patch hops into rtas_ibm_configure_connector() and, in the scenario described above, clear the unplug state for the DIMM device. This will not solve all the problems we still have with memory unplug, but it will cover this case where the kernel reconfigures LMBs after a failed unplug. We are a bit more resilient, without using an unreliable timeout, and we didn't make the remaining error cases any worse. [1] arch/powerpc/platforms/pseries/hotplug-memory.c Signed-off-by: Daniel Henrique Barboza Message-Id: <20210222194531.62717-6-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 43 +++++++++++++++++++++++++++++++++++++++++++ hw/ppc/spapr_drc.c | 10 ++++++++++ include/hw/ppc/spapr.h | 2 ++ 3 files changed, 55 insertions(+) (limited to 'hw/ppc') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index ecce8abf14..6eaddb12cb 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3575,6 +3575,49 @@ static SpaprDimmState *spapr_recover_pending_dimm_state(SpaprMachineState *ms, return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm); } +void spapr_clear_pending_dimm_unplug_state(SpaprMachineState *spapr, + DeviceState *dev) +{ + SpaprDimmState *ds; + PCDIMMDevice *dimm; + SpaprDrc *drc; + uint32_t nr_lmbs; + uint64_t size, addr_start, addr; + int i; + + if (!dev) { + return; + } + + dimm = PC_DIMM(dev); + ds = spapr_pending_dimm_unplugs_find(spapr, dimm); + + /* + * 'ds == NULL' would mean that the DIMM doesn't have a pending + * unplug state, but one of its DRC is marked as unplug_requested. + * This is bad and weird enough to g_assert() out. + */ + g_assert(ds); + + spapr_pending_dimm_unplugs_remove(spapr, ds); + + size = memory_device_get_region_size(MEMORY_DEVICE(dimm), &error_abort); + nr_lmbs = size / SPAPR_MEMORY_BLOCK_SIZE; + + addr_start = object_property_get_uint(OBJECT(dimm), PC_DIMM_ADDR_PROP, + &error_abort); + + addr = addr_start; + for (i = 0; i < nr_lmbs; i++) { + drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, + addr / SPAPR_MEMORY_BLOCK_SIZE); + g_assert(drc); + + drc->unplug_requested = false; + addr += SPAPR_MEMORY_BLOCK_SIZE; + } +} + /* Callback to be called during DRC release. */ void spapr_lmb_release(DeviceState *dev) { diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index fd2e45640f..8c4997d795 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -1230,6 +1230,16 @@ static void rtas_ibm_configure_connector(PowerPCCPU *cpu, drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc); + /* + * This indicates that the kernel is reconfiguring a LMB due to + * a failed hotunplug. Clear the pending unplug state for the whole + * DIMM. + */ + if (spapr_drc_type(drc) == SPAPR_DR_CONNECTOR_TYPE_LMB && + drc->unplug_requested) { + spapr_clear_pending_dimm_unplug_state(spapr, drc->dev); + } + if (!drc->fdt) { void *fdt; int fdt_size; diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index ccbeeca1de..d6edeaaaff 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -847,6 +847,8 @@ int spapr_hpt_shift_for_ramsize(uint64_t ramsize); int spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, Error **errp); void spapr_clear_pending_events(SpaprMachineState *spapr); void spapr_clear_pending_hotplug_events(SpaprMachineState *spapr); +void spapr_clear_pending_dimm_unplug_state(SpaprMachineState *spapr, + DeviceState *dev); int spapr_max_server_number(SpaprMachineState *spapr); void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex, uint64_t pte0, uint64_t pte1); -- cgit v1.2.3-55-g7522 From e5943b00d35efc68ca72ed304cfca98a9f3a647c Mon Sep 17 00:00:00 2001 From: Bin Meng Date: Wed, 24 Feb 2021 17:28:39 +0800 Subject: hw/ppc: e500: Add missing in the eTSEC node The eTSEC node should provide an empty property in the eTSEC node, otherwise of_translate_address() in the Linux kernel fails to get the eTSEC register base, reporting: OF: ** translation for device /platform@f00000000/ethernet@0/queue-group ** OF: bus is default (na=1, ns=1) on /platform@f00000000/ethernet@0 OF: translating address: 00000000 OF: parent bus is default (na=1, ns=1) on /platform@f00000000 OF: no ranges; cannot translate Per devicetree spec v0.3 [1] chapter 2.3.8: If the property is not present in a bus node, it is assumed that no mapping exists between children of the node and the parent address space. This is why of_translate_address() aborts the address translation. Apparently U-Boot devicetree parser seems to be tolerant with missing as this was not noticed when testing with U-Boot. The empty property is present in all kernel shipped dtsi files for eTSEC, Let's add it to conform with the spec. [1] https://github.com/devicetree-org/devicetree-specification/releases/download/v0.3/devicetree-specification-v0.3.pdf Fixes: fdfb7f2cdb2d ("e500: Add support for eTSEC in device tree") Signed-off-by: Bin Meng Message-Id: <1614158919-9473-1-git-send-email-bmeng.cn@gmail.com> Signed-off-by: David Gibson --- hw/ppc/e500.c | 1 + 1 file changed, 1 insertion(+) (limited to 'hw/ppc') diff --git a/hw/ppc/e500.c b/hw/ppc/e500.c index 01517a6c6c..1d94485ac8 100644 --- a/hw/ppc/e500.c +++ b/hw/ppc/e500.c @@ -231,6 +231,7 @@ static int create_devtree_etsec(SysBusDevice *sbdev, PlatformDevtreeData *data) assert(irq2 >= 0); qemu_fdt_add_subnode(fdt, node); + qemu_fdt_setprop(fdt, node, "ranges", NULL, 0); qemu_fdt_setprop_string(fdt, node, "device_type", "network"); qemu_fdt_setprop_string(fdt, node, "compatible", "fsl,etsec2"); qemu_fdt_setprop_string(fdt, node, "model", "eTSEC"); -- cgit v1.2.3-55-g7522 From 7420033ec460e7b9906bf05fbe1a0d3830536657 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Fri, 26 Feb 2021 13:32:59 -0300 Subject: spapr.c: add 'unplug already in progress' message for PHB unplug Both CPU hotunplug and PC_DIMM unplug reports an user warning, mentioning that the hotunplug is in progress, if consecutive 'device_del' are issued in quick succession. Do the same for PHBs in spapr_phb_unplug_request(). Signed-off-by: Daniel Henrique Barboza Message-Id: <20210226163301.419727-4-danielhb413@gmail.com> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'hw/ppc') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index 6eaddb12cb..aca3ef9d58 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -4034,6 +4034,10 @@ static void spapr_phb_unplug_request(HotplugHandler *hotplug_dev, if (!spapr_drc_unplug_requested(drc)) { spapr_drc_unplug_request(drc); spapr_hotplug_req_remove_by_index(drc); + } else { + error_setg(errp, + "PCI Host Bridge unplug already in progress for device %s", + dev->id); } } -- cgit v1.2.3-55-g7522 From e35dfbd22780aafbcd4b6da5130a00fc085fd5de Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Fri, 26 Feb 2021 13:33:00 -0300 Subject: spapr_pci.c: add 'unplug already in progress' message for PCI unplug Hotunplug for all other devices are warning the user when the hotunplug is already in progress. Do the same for PCI devices in spapr_pci_unplug_request(). Signed-off-by: Daniel Henrique Barboza Message-Id: <20210226163301.419727-5-danielhb413@gmail.com> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr_pci.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'hw/ppc') diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c index b00e9609ae..feba18cb12 100644 --- a/hw/ppc/spapr_pci.c +++ b/hw/ppc/spapr_pci.c @@ -1743,6 +1743,10 @@ static void spapr_pci_unplug_request(HotplugHandler *plug_handler, } } } + } else { + error_setg(errp, + "PCI device unplug already in progress for device %s", + drc->dev->id); } } -- cgit v1.2.3-55-g7522 From 4515a5f786024fabf0bef4cf3d28adf5647e6e82 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Mon, 1 Mar 2021 09:41:33 -0300 Subject: qemu_timer.c: add timer_deadline_ms() helper The pSeries machine is using QEMUTimer internals to return the timeout in seconds for a timer object, in hw/ppc/spapr.c, function spapr_drc_unplug_timeout_remaining_sec(). Create a helper in qemu-timer.c to retrieve the deadline for a QEMUTimer object, in ms, to avoid exposing timer internals to the PPC code. CC: Paolo Bonzini Acked-by: Paolo Bonzini Signed-off-by: Daniel Henrique Barboza Message-Id: <20210301124133.23800-2-danielhb413@gmail.com> Reviewed-by: Greg Kurz Signed-off-by: David Gibson --- hw/ppc/spapr_drc.c | 5 ++--- include/qemu/timer.h | 8 ++++++++ util/qemu-timer.c | 13 +++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'hw/ppc') diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index 8c4997d795..98b626acf9 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -421,9 +421,8 @@ void spapr_drc_unplug_request(SpaprDrc *drc) int spapr_drc_unplug_timeout_remaining_sec(SpaprDrc *drc) { - if (drc->unplug_requested && timer_pending(drc->unplug_timeout_timer)) { - return (qemu_timeout_ns_to_ms(drc->unplug_timeout_timer->expire_time) - - qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL)) / 1000; + if (drc->unplug_requested) { + return timer_deadline_ms(drc->unplug_timeout_timer) / 1000; } return 0; diff --git a/include/qemu/timer.h b/include/qemu/timer.h index 1678238384..5e76e3f8c2 100644 --- a/include/qemu/timer.h +++ b/include/qemu/timer.h @@ -795,6 +795,14 @@ static inline int64_t get_max_clock_jump(void) return 60 * NANOSECONDS_PER_SECOND; } +/** + * timer_deadline_ms: + * + * Returns the remaining miliseconds for @timer to expire, or zero + * if the timer is no longer pending. + */ +int64_t timer_deadline_ms(QEMUTimer *timer); + /* * Low level clock functions */ diff --git a/util/qemu-timer.c b/util/qemu-timer.c index 81c28af517..02424bc1b6 100644 --- a/util/qemu-timer.c +++ b/util/qemu-timer.c @@ -243,6 +243,19 @@ int64_t timerlist_deadline_ns(QEMUTimerList *timer_list) return delta; } +/* + * Returns the time remaining for the deadline, in ms. + */ +int64_t timer_deadline_ms(QEMUTimer *timer) +{ + if (timer_pending(timer)) { + return qemu_timeout_ns_to_ms(timer->expire_time) - + qemu_clock_get_ms(timer->timer_list->clock->type); + } + + return 0; +} + /* Calculate the soonest deadline across all timerlists attached * to the clock. This is used for the icount timeout so we * ignore whether or not the clock should be used in deadline -- cgit v1.2.3-55-g7522 From 41c8ad3d920d6f1741b34bfdfaa72b43b45209b5 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Tue, 2 Mar 2021 11:10:18 -0300 Subject: spapr.c: remove duplicated assert in spapr_memory_unplug_request() We are asserting the existence of the first DRC LMB after sending unplug requests to all LMBs of the DIMM, where every DRC is being asserted inside the loop. This means that the first DRC is being asserted twice. Remove the duplicated assert. Signed-off-by: Daniel Henrique Barboza Message-Id: <20210302141019.153729-2-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 1 - 1 file changed, 1 deletion(-) (limited to 'hw/ppc') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index aca3ef9d58..b579830832 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -3703,7 +3703,6 @@ static void spapr_memory_unplug_request(HotplugHandler *hotplug_dev, drc = spapr_drc_by_id(TYPE_SPAPR_DRC_LMB, addr_start / SPAPR_MEMORY_BLOCK_SIZE); - g_assert(drc); spapr_hotplug_req_remove_by_count_indexed(SPAPR_DR_CONNECTOR_TYPE_LMB, nr_lmbs, spapr_drc_index(drc)); } -- cgit v1.2.3-55-g7522 From eb7f80fd26d73e7e1af105431da58971b1dba517 Mon Sep 17 00:00:00 2001 From: Daniel Henrique Barboza Date: Tue, 2 Mar 2021 11:10:19 -0300 Subject: spapr.c: send QAPI event when memory hotunplug fails Recent changes allowed the pSeries machine to rollback the hotunplug process for the DIMM when the guest kernel signals, via a reconfiguration of the DR connector, that it's not going to release the LMBs. Let's also warn QAPI listerners about it. One place to do it would be right after the unplug state is cleaned up, spapr_clear_pending_dimm_unplug_state(). This would mean that the function is now doing more than cleaning up the pending dimm state though. This patch does the following changes in spapr.c: - send a QAPI event to inform that we experienced a failure in the hotunplug of the DIMM; - rename spapr_clear_pending_dimm_unplug_state() to spapr_memory_unplug_rollback(). This is a better fit for what the function is now doing, and it makes callers care more about what the function goal is and less about spapr.c internals such as clearing the pending dimm unplug state. Reviewed-by: Greg Kurz Signed-off-by: Daniel Henrique Barboza Message-Id: <20210302141019.153729-3-danielhb413@gmail.com> Signed-off-by: David Gibson --- hw/ppc/spapr.c | 13 +++++++++++-- hw/ppc/spapr_drc.c | 5 ++--- include/hw/ppc/spapr.h | 3 +-- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'hw/ppc') diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c index b579830832..d56418ca29 100644 --- a/hw/ppc/spapr.c +++ b/hw/ppc/spapr.c @@ -28,6 +28,7 @@ #include "qemu-common.h" #include "qemu/datadir.h" #include "qapi/error.h" +#include "qapi/qapi-events-machine.h" #include "qapi/visitor.h" #include "sysemu/sysemu.h" #include "sysemu/hostmem.h" @@ -3575,14 +3576,14 @@ static SpaprDimmState *spapr_recover_pending_dimm_state(SpaprMachineState *ms, return spapr_pending_dimm_unplugs_add(ms, avail_lmbs, dimm); } -void spapr_clear_pending_dimm_unplug_state(SpaprMachineState *spapr, - DeviceState *dev) +void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev) { SpaprDimmState *ds; PCDIMMDevice *dimm; SpaprDrc *drc; uint32_t nr_lmbs; uint64_t size, addr_start, addr; + g_autofree char *qapi_error = NULL; int i; if (!dev) { @@ -3616,6 +3617,14 @@ void spapr_clear_pending_dimm_unplug_state(SpaprMachineState *spapr, drc->unplug_requested = false; addr += SPAPR_MEMORY_BLOCK_SIZE; } + + /* + * Tell QAPI that something happened and the memory + * hotunplug wasn't successful. + */ + qapi_error = g_strdup_printf("Memory hotunplug rejected by the guest " + "for device %s", dev->id); + qapi_event_send_mem_unplug_error(dev->id, qapi_error); } /* Callback to be called during DRC release. */ diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c index 98b626acf9..8a71b03800 100644 --- a/hw/ppc/spapr_drc.c +++ b/hw/ppc/spapr_drc.c @@ -1231,12 +1231,11 @@ static void rtas_ibm_configure_connector(PowerPCCPU *cpu, /* * This indicates that the kernel is reconfiguring a LMB due to - * a failed hotunplug. Clear the pending unplug state for the whole - * DIMM. + * a failed hotunplug. Rollback the DIMM unplug process. */ if (spapr_drc_type(drc) == SPAPR_DR_CONNECTOR_TYPE_LMB && drc->unplug_requested) { - spapr_clear_pending_dimm_unplug_state(spapr, drc->dev); + spapr_memory_unplug_rollback(spapr, drc->dev); } if (!drc->fdt) { diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h index d6edeaaaff..47cebaf3ac 100644 --- a/include/hw/ppc/spapr.h +++ b/include/hw/ppc/spapr.h @@ -847,8 +847,7 @@ int spapr_hpt_shift_for_ramsize(uint64_t ramsize); int spapr_reallocate_hpt(SpaprMachineState *spapr, int shift, Error **errp); void spapr_clear_pending_events(SpaprMachineState *spapr); void spapr_clear_pending_hotplug_events(SpaprMachineState *spapr); -void spapr_clear_pending_dimm_unplug_state(SpaprMachineState *spapr, - DeviceState *dev); +void spapr_memory_unplug_rollback(SpaprMachineState *spapr, DeviceState *dev); int spapr_max_server_number(SpaprMachineState *spapr); void spapr_store_hpte(PowerPCCPU *cpu, hwaddr ptex, uint64_t pte0, uint64_t pte1); -- cgit v1.2.3-55-g7522