On Thu, Jun 26, 2025 at 02:27:50PM -0700, Daniele Ceraolo Spurio wrote:
>
> On 6/26/2025 10:24 AM, Rodrigo Vivi wrote:
> > On Wed, Jun 25, 2025 at 10:30:10PM +0530, Badal Nilawar wrote:
> > > Load late binding firmware
> > >
> > > v2:
> > >  - s/EAGAIN/EBUSY/
> > >  - Flush worker in suspend and driver unload (Daniele)
> > > v3:
> > >  - Use retry interval of 6s, in steps of 200ms, to allow
> > >    other OS components release MEI CL handle (Sasha)
> > > v4:
> > >  - return -ENODEV if component not added (Daniele)
> > >  - parse and print status returned by csc
> > >  - Use xe_pm_get_if_in_active (Daniele)
> > The worker is considered outer bound and it is safe
> > to use xe_pm_runtime_get which takes the reference
> > and resume synchronously.
> >
> > Otherwise, if using get_if_active you need to reschedule
> > the work or you lose your job.
> The issue is that the next patch adds code to re-queue the work from the rpm
> resume path, so if we do a sync resume here the worker will re-queue itself
> immediately when not needed.
Oops, I had forgotten about that case, I'm sorry.

> Also, when the re-queued work runs it might end up doing another sync
> resume and re-queuing itself once more.

I believe it might be worse than that and could even hang. This is indeed
the right case for if_active, but we need to ensure that we always have an
outer bound for that.

> However, in the next patch we do also have a flush of the work in the
> rpm_suspend path, so maybe the worker running when we are rpm suspended is
> not actually a possible case?

That's the kaboom case!

> Also, thinking about this more, that re-queuing on rpm resume only happens
> if d3cold is allowed, so when d3cold is not allowed we do want to proceed
> here; we can actually reach here when rpm suspended.

No, when d3cold is not allowed we don't want to re-flash the fw; we just
skip and move forward. My bad, sorry for the noise, and please keep the
if_active variant here.

> > >
> > > Signed-off-by: Badal Nilawar <badal.nila...@intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/xe_late_bind_fw.c       | 149 ++++++++++++++++++++-
> > >  drivers/gpu/drm/xe/xe_late_bind_fw.h       |   1 +
> > >  drivers/gpu/drm/xe/xe_late_bind_fw_types.h |   7 +
> > >  3 files changed, 156 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.c b/drivers/gpu/drm/xe/xe_late_bind_fw.c
> > > index 32d1436e7191..52243063d98a 100644
> > > --- a/drivers/gpu/drm/xe/xe_late_bind_fw.c
> > > +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.c
> > > @@ -16,6 +16,20 @@
> > >  #include "xe_late_bind_fw.h"
> > >  #include "xe_pcode.h"
> > >  #include "xe_pcode_api.h"
> > > +#include "xe_pm.h"
> > > +
> > > +/*
> > > + * The component should load quite quickly in most cases, but it could take
> > > + * a bit. Using a very big timeout just to cover the worst case scenario
> > > + */
> > > +#define LB_INIT_TIMEOUT_MS 20000
> > > +
> > > +/*
> > > + * Retry interval set to 6 seconds, in steps of 200 ms, to allow time for
> > > + * other OS components to release the MEI CL handle
> > > + */
> > > +#define LB_FW_LOAD_RETRY_MAXCOUNT 30
> > > +#define LB_FW_LOAD_RETRY_PAUSE_MS 200
> > >  static const u32 fw_id_to_type[] = {
> > >          [XE_LB_FW_FAN_CONTROL] = CSC_LATE_BINDING_TYPE_FAN_CONTROL,
> > > @@ -31,6 +45,30 @@ late_bind_to_xe(struct xe_late_bind *late_bind)
> > >          return container_of(late_bind, struct xe_device, late_bind);
> > >  }
> > > +static const char *xe_late_bind_parse_status(uint32_t status)
> > > +{
> > > +        switch (status) {
> > > +        case CSC_LATE_BINDING_STATUS_SUCCESS:
> > > +                return "success";
> > > +        case CSC_LATE_BINDING_STATUS_4ID_MISMATCH:
> > > +                return "4Id Mismatch";
> > > +        case CSC_LATE_BINDING_STATUS_ARB_FAILURE:
> > > +                return "ARB Failure";
> > > +        case CSC_LATE_BINDING_STATUS_GENERAL_ERROR:
> > > +                return "General Error";
> > > +        case CSC_LATE_BINDING_STATUS_INVALID_PARAMS:
> > > +                return "Invalid Params";
> > > +        case CSC_LATE_BINDING_STATUS_INVALID_SIGNATURE:
> > > +                return "Invalid Signature";
> > > +        case CSC_LATE_BINDING_STATUS_INVALID_PAYLOAD:
> > > +                return "Invalid Payload";
> > > +        case CSC_LATE_BINDING_STATUS_TIMEOUT:
> > > +                return "Timeout";
> > > +        default:
> > > +                return "Unknown error";
> > > +        }
> > > +}
> > > +
> > >  static int xe_late_bind_fw_num_fans(struct xe_late_bind *late_bind)
> > >  {
> > >          struct xe_device *xe = late_bind_to_xe(late_bind);
> > > @@ -44,6 +82,93 @@ static int xe_late_bind_fw_num_fans(struct xe_late_bind *late_bind)
> > >          return 0;
> > >  }
> > > +static void xe_late_bind_wait_for_worker_completion(struct xe_late_bind *late_bind)
> > > +{
> > > +        struct xe_device *xe = late_bind_to_xe(late_bind);
> > > +        struct xe_late_bind_fw *lbfw;
> > > +        int fw_id;
> > > +
> > > +        for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
> > > +                lbfw = &late_bind->late_bind_fw[fw_id];
> > > +                if (lbfw->valid && late_bind->wq) {
> > > +                        drm_dbg(&xe->drm, "Flush work: load %s firmware\n",
> > > +                                fw_id_to_name[lbfw->id]);
> > > +                        flush_work(&lbfw->work);
> > > +                }
> > > +        }
> > > +}
> > > +
> > > +static void xe_late_bind_work(struct work_struct *work)
> > > +{
> > > +        struct xe_late_bind_fw *lbfw = container_of(work, struct xe_late_bind_fw, work);
> > > +        struct xe_late_bind *late_bind = container_of(lbfw, struct xe_late_bind,
> > > +                                                      late_bind_fw[lbfw->id]);
> > > +        struct xe_device *xe = late_bind_to_xe(late_bind);
> > > +        int retry = LB_FW_LOAD_RETRY_MAXCOUNT;
> > > +        int ret;
> > > +        int slept;
> > > +
> > > +        /* we can queue this before the component is bound */
> > > +        for (slept = 0; slept < LB_INIT_TIMEOUT_MS; slept += 100) {
> > > +                if (late_bind->component.ops)
> > > +                        break;
> > > +                msleep(100);
> > > +        }
> > > +
> > > +        if (!xe_pm_runtime_get_if_active(xe))
> > > +                return;
> > > +
> > > +        mutex_lock(&late_bind->mutex);
> > > +
> > > +        if (!late_bind->component.ops) {
> > > +                drm_err(&xe->drm, "Late bind component not bound\n");
> > > +                goto out;
> > > +        }
> > > +
> > > +        drm_dbg(&xe->drm, "Load %s firmware\n", fw_id_to_name[lbfw->id]);
> > > +
> > > +        do {
> > > +                ret = late_bind->component.ops->push_config(late_bind->component.mei_dev,
> > > +                                                            lbfw->type, lbfw->flags,
> > > +                                                            lbfw->payload, lbfw->payload_size);
> > > +                if (!ret)
> > > +                        break;
> > > +                msleep(LB_FW_LOAD_RETRY_PAUSE_MS);
> > > +        } while (--retry && ret == -EBUSY);
> > > +
> > > +        if (!ret) {
> > > +                drm_dbg(&xe->drm, "Load %s firmware successful\n",
> > > +                        fw_id_to_name[lbfw->id]);
> > > +                goto out;
> > > +        }
> > > +
> > > +        if (ret > 0)
>
> nit: here you can just do "else if" and drop the goto.
>
> Daniele
>
> > > +                drm_err(&xe->drm, "Load %s firmware failed with err %d, %s\n",
> > > +                        fw_id_to_name[lbfw->id], ret, xe_late_bind_parse_status(ret));
> > > +        else
> > > +                drm_err(&xe->drm, "Load %s firmware failed with err %d",
> > > +                        fw_id_to_name[lbfw->id], ret);
> > > +out:
> > > +        mutex_unlock(&late_bind->mutex);
> > > +        xe_pm_runtime_put(xe);
> > > +}
> > > +
> > > +int xe_late_bind_fw_load(struct xe_late_bind *late_bind)
> > > +{
> > > +        struct xe_late_bind_fw *lbfw;
> > > +        int fw_id;
> > > +
> > > +        if (!late_bind->component_added)
> > > +                return -ENODEV;
> > > +
> > > +        for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
> > > +                lbfw = &late_bind->late_bind_fw[fw_id];
> > > +                if (lbfw->valid)
> > > +                        queue_work(late_bind->wq, &lbfw->work);
> > > +        }
> > > +        return 0;
> > > +}
> > > +
> > >  static int __xe_late_bind_fw_init(struct xe_late_bind *late_bind, u32 fw_id)
> > >  {
> > >          struct xe_device *xe = late_bind_to_xe(late_bind);
> > > @@ -99,6 +224,7 @@ static int __xe_late_bind_fw_init(struct xe_late_bind *late_bind, u32 fw_id)
> > >          memcpy(lb_fw->payload, fw->data, lb_fw->payload_size);
> > >          release_firmware(fw);
> > > +        INIT_WORK(&lb_fw->work, xe_late_bind_work);
> > >          lb_fw->valid = true;
> > >          return 0;
> > > @@ -109,11 +235,16 @@ static int xe_late_bind_fw_init(struct xe_late_bind *late_bind)
> > >          int ret;
> > >          int fw_id;
> > > +        late_bind->wq = alloc_ordered_workqueue("late-bind-ordered-wq", 0);
> > > +        if (!late_bind->wq)
> > > +                return -ENOMEM;
> > > +
> > >          for (fw_id = 0; fw_id < XE_LB_FW_MAX_ID; fw_id++) {
> > >                  ret = __xe_late_bind_fw_init(late_bind, fw_id);
> > >                  if (ret)
> > >                          return ret;
> > >          }
> > > +
> > >          return 0;
> > >  }
> > > @@ -137,6 +268,8 @@ static void xe_late_bind_component_unbind(struct device *xe_kdev,
> > >          struct xe_device *xe = kdev_to_xe_device(xe_kdev);
> > >          struct xe_late_bind *late_bind = &xe->late_bind;
> > > +        xe_late_bind_wait_for_worker_completion(late_bind);
> > > +
> > >          mutex_lock(&late_bind->mutex);
> > >          late_bind->component.ops = NULL;
> > >          mutex_unlock(&late_bind->mutex);
> > > @@ -152,7 +285,15 @@ static void xe_late_bind_remove(void *arg)
> > >          struct xe_late_bind *late_bind = arg;
> > >          struct xe_device *xe = late_bind_to_xe(late_bind);
> > > +        xe_late_bind_wait_for_worker_completion(late_bind);
> > > +
> > > +        late_bind->component_added = false;
> > > +
> > >          component_del(xe->drm.dev, &xe_late_bind_component_ops);
> > > +        if (late_bind->wq) {
> > > +                destroy_workqueue(late_bind->wq);
> > > +                late_bind->wq = NULL;
> > > +        }
> > >          mutex_destroy(&late_bind->mutex);
> > >  }
> > > @@ -183,9 +324,15 @@ int xe_late_bind_init(struct xe_late_bind *late_bind)
> > >                  return err;
> > >          }
> > > +        late_bind->component_added = true;
> > > +
> > >          err = devm_add_action_or_reset(xe->drm.dev, xe_late_bind_remove, late_bind);
> > >          if (err)
> > >                  return err;
> > > -        return xe_late_bind_fw_init(late_bind);
> > > +        err = xe_late_bind_fw_init(late_bind);
> > > +        if (err)
> > > +                return err;
> > > +
> > > +        return xe_late_bind_fw_load(late_bind);
> > >  }
> > > diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw.h b/drivers/gpu/drm/xe/xe_late_bind_fw.h
> > > index 4c73571c3e62..28d56ed2bfdc 100644
> > > --- a/drivers/gpu/drm/xe/xe_late_bind_fw.h
> > > +++ b/drivers/gpu/drm/xe/xe_late_bind_fw.h
> > > @@ -11,5 +11,6 @@
> > >  struct xe_late_bind;
> > >  int xe_late_bind_init(struct xe_late_bind *late_bind);
> > > +int xe_late_bind_fw_load(struct xe_late_bind *late_bind);
> > >  #endif
> > > diff --git a/drivers/gpu/drm/xe/xe_late_bind_fw_types.h b/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
> > > index 93abf4c51789..f119a75f4c9c 100644
> > > --- a/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
> > > +++ b/drivers/gpu/drm/xe/xe_late_bind_fw_types.h
> > > @@ -9,6 +9,7 @@
> > >  #include <linux/iosys-map.h>
> > >  #include <linux/mutex.h>
> > >  #include <linux/types.h>
> > > +#include <linux/workqueue.h>
> > >  #define MAX_PAYLOAD_SIZE SZ_4K
> > > @@ -38,6 +39,8 @@ struct xe_late_bind_fw {
> > >          u8 *payload;
> > >          /** @late_bind_fw.payload_size: late binding blob payload_size */
> > >          size_t payload_size;
> > > +        /** @late_bind_fw.work: worker to upload latebind blob */
> > > +        struct work_struct work;
> > >  };
> > >  /**
> > > @@ -64,6 +67,10 @@ struct xe_late_bind {
> > >          struct mutex mutex;
> > >          /** @late_bind.late_bind_fw: late binding firmware array */
> > >          struct xe_late_bind_fw late_bind_fw[XE_LB_FW_MAX_ID];
> > > +        /** @late_bind.wq: workqueue to submit request to download late bind blob */
> > > +        struct workqueue_struct *wq;
> > > +        /** @late_bind.component_added: whether the component has been added */
> > > +        bool component_added;
> > >  };
> > >  #endif
> > > --
> > > 2.34.1
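
The pattern the thread converges on looks roughly like the sketch below. It
is illustrative only: the worker body reuses names from the patch above,
while example_runtime_resume()/example_runtime_suspend() and the
xe->d3cold.allowed check are assumptions about the follow-up patch referenced
in the discussion, not code from this series.

/* Illustrative sketch only -- not part of the series. */

static void xe_late_bind_work(struct work_struct *work)
{
        struct xe_late_bind_fw *lbfw = container_of(work, struct xe_late_bind_fw, work);
        struct xe_late_bind *late_bind = container_of(lbfw, struct xe_late_bind,
                                                      late_bind_fw[lbfw->id]);
        struct xe_device *xe = late_bind_to_xe(late_bind);

        /*
         * Take a runtime PM reference only if the device is already awake,
         * i.e. someone else holds the "outer bound" reference.  A synchronous
         * xe_pm_runtime_get() here would wake the device, let the rpm resume
         * path re-queue this work when not needed, and risk a deadlock when
         * the work is flushed from the rpm suspend path.
         */
        if (!xe_pm_runtime_get_if_active(xe))
                return;

        /* ... push lbfw->payload through the MEI component, as in the patch ... */

        xe_pm_runtime_put(xe);
}

/*
 * rpm resume side (assumed shape of the follow-up patch): re-queue the load
 * only when d3cold is allowed, since that is the case where the flashed
 * state is lost across suspend.
 */
static void example_runtime_resume(struct xe_device *xe)
{
        if (xe->d3cold.allowed)
                xe_late_bind_fw_load(&xe->late_bind);
}

/*
 * rpm suspend side (assumed shape of the follow-up patch): make sure no
 * worker is still running before the device goes to sleep.
 */
static void example_runtime_suspend(struct xe_device *xe)
{
        xe_late_bind_wait_for_worker_completion(&xe->late_bind);
}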