On Fri, 30 May 2025 16:30:46 +1000
Jonathan Gray <[email protected]> wrote:
> On Fri, May 30, 2025 at 10:14:36AM +0900, YASUOKA Masahiko wrote:
>> On Fri, 30 May 2025 10:08:38 +1000
>> Jonathan Gray <[email protected]> wrote:
>> > On Fri, May 30, 2025 at 08:01:31AM +0900, YASUOKA Masahiko wrote:
>> >> 
>> >> >Synopsis:        inteldrm stop working after {hibernate,suspend}/resume
>> >> >Category:        kernel
>> >> >Environment:
>> >>   System      : OpenBSD 7.7
>> >>   Details     : OpenBSD 7.7-current (GENERIC.MP) #117: Thu May 29 
>> >> 21:18:15 JST 2025
>> >>                    
>> >> yasuoka@xxx:/home/yasuoka/src/sys/arch/amd64/compile/GENERIC.MP
>> >> 
>> >>   Architecture: OpenBSD.amd64
>> >>   Machine     : amd64
>> >> >Description:
>> >>   After hibernate and resume, X11 stops working.  Keyboard and
>> >>   mouse don't work, but Ctrl-Alt-F1 or Ctrl-Alt-Backspace works.
>> >> 
>> >>   errors in dmesg:
>> >>   ****
>> >>   drm:pid97650:__uc_init_hw *ERROR* [drm] *ERROR* GT0: GuC initialization 
>> >> failed 0xfffffffffffffffae
>> >>   drm:pid97650:intel_gt_init_hw *ERROR* [drm] *ERROR* GT0: Enabling uc 
>> >> failed (-5)
>> >>   drm:pid97650:intel_gt_resume *ERROR* [drm] *ERROR* GT0: Failed to 
>> >> initialize GPU, declaring it wedged!
>> >>   ****
>> >> 
>> >>   This happens because guc_wait_ucode() in i915/gt/uc/intel_guc_fw.c
>> >>   fails.
>> >> 
>> >>   The function waits for the GuC to start up by calling the inline
>> >>   function guc_load_done(), which checks two registers.
>> >> 
>> >>        97 static inline bool guc_load_done(struct intel_uncore *uncore, 
>> >> u32 *status, bool *success)
>> >>        98 {
>> >>        99         u32 val = intel_uncore_read(uncore, GUC_STATUS);
>> >>       100         u32 uk_val = REG_FIELD_GET(GS_UKERNEL_MASK, val);
>> >>       101         u32 br_val = REG_FIELD_GET(GS_BOOTROM_MASK, val);
>> >>       102 
>> >>       103         *status = val;
>> >>       104         switch (uk_val) {
>> >>       105         case INTEL_GUC_LOAD_STATUS_READY:
>> >>       106                 *success = true;
>> >>       107                 return true;
>> >>       108 
>> >>       109         case INTEL_GUC_LOAD_STATUS_ERROR_DEVID_BUILD_MISMATCH:
>> >>       110         case INTEL_GUC_LOAD_STATUS_GUC_PREPROD_BUILD_MISMATCH:
>> >>       111         case INTEL_GUC_LOAD_STATUS_ERROR_DEVID_INVALID_GUCTYPE:
>> >> 
>> >>   In my test, the function fails with these register values:
>> >> 
>> >>     ukernel = INTEL_GUC_LOAD_STATUS_INIT_DATA_INVALID(0x71)
>> >>     bootrom = INTEL_BOOTROM_STATUS_JUMP_PASSED(0x76)
>> >> 
>> >>   When I was using 7.6, I didn't see this problem.
>> >> 
>> >> >How-To-Repeat:
>> >>   1. hibernate or suspend
>> >>   2. resume
>> >> 
>> >>   the problem happens always (~10 times)
>> >> 
>> >>   After the workaround diff, not happen always (~3 times)
>> >>   
>> >> >Fix:
>> >>   The diff attached at the end works around the problem.
>> >> 
>> >>   The diff partially backs out the change from Feb 7 and adds a printf().
>> >> 
>> >>   I don't understand it logically, but if the printf() is removed, the
>> >>   problem starts happening again.
>> > 
>> > Thank you for the report.
>> > 
>> > Does this smaller diff still work around the problem?
>> 
>> The smaller diff doesn't fix the problem.  I tried 2 times.
> 
> Here is the other part of your initial diff.
> 
> The non-printf parts are a revert of
> 'drm/i915/guc: Change wa and EU_PERF_CNTL registers to MCR type'
> linux 835e4d9bb3a13879031942ca6692d5a82ec00158

This doesn't fix the problem.

> It would also be helpful if you could try raising the value
> of GUC_LOAD_RETRY_LIMIT in intel_guc_fw.c without other patches,
> to find a value that works.

I also tried GUC_LOAD_RETRY_LIMIT=20 with the diff; it doesn't fix the
problem either.

> 
> Index: sys/dev/pci/drm/i915/gt/uc/intel_guc_ads.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/drm/i915/gt/uc/intel_guc_ads.c,v
> diff -u -p -r1.9 intel_guc_ads.c
> --- sys/dev/pci/drm/i915/gt/uc/intel_guc_ads.c        7 Feb 2025 03:03:30 
> -0000       1.9
> +++ sys/dev/pci/drm/i915/gt/uc/intel_guc_ads.c        30 May 2025 05:58:58 
> -0000
> @@ -408,13 +408,8 @@ static int guc_mmio_regset_init(struct t
>           CCS_MASK(engine->gt))
>               ret |= GUC_MMIO_REG_ADD(gt, regset, GEN12_RCU_MODE, true);
>  
> -     /*
> -      * some of the WA registers are MCR registers. As it is safe to
> -      * use MCR form for non-MCR registers, for code simplicity, all
> -      * WA registers are added with MCR form.
> -      */
>       for (i = 0, wa = wal->list; i < wal->count; i++, wa++)
> -             ret |= GUC_MCR_REG_ADD(gt, regset, wa->mcr_reg, wa->masked_reg);
> +             ret |= GUC_MMIO_REG_ADD(gt, regset, wa->reg, wa->masked_reg);
>  
>       /* Be extra paranoid and include all whitelist registers. */
>       for (i = 0; i < RING_MAX_NONPRIV_SLOTS; i++)
> @@ -430,13 +425,13 @@ static int guc_mmio_regset_init(struct t
>                       ret |= GUC_MMIO_REG_ADD(gt, regset, GEN9_LNCFCMOCS(i), 
> false);
>  
>       if (GRAPHICS_VER(engine->i915) >= 12) {
> -             ret |= GUC_MCR_REG_ADD(gt, regset, 
> MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL0)), false);
> -             ret |= GUC_MCR_REG_ADD(gt, regset, 
> MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL1)), false);
> -             ret |= GUC_MCR_REG_ADD(gt, regset, 
> MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL2)), false);
> -             ret |= GUC_MCR_REG_ADD(gt, regset, 
> MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL3)), false);
> -             ret |= GUC_MCR_REG_ADD(gt, regset, 
> MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL4)), false);
> -             ret |= GUC_MCR_REG_ADD(gt, regset, 
> MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL5)), false);
> -             ret |= GUC_MCR_REG_ADD(gt, regset, 
> MCR_REG(i915_mmio_reg_offset(EU_PERF_CNTL6)), false);
> +             ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL0, false);
> +             ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL1, false);
> +             ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL2, false);
> +             ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL3, false);
> +             ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL4, false);
> +             ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL5, false);
> +             ret |= GUC_MMIO_REG_ADD(gt, regset, EU_PERF_CNTL6, false);
>       }
>  
>       return ret ? -1 : 0;
> Index: sys/dev/pci/drm/i915/gt/uc/intel_guc_fw.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/drm/i915/gt/uc/intel_guc_fw.c,v
> diff -u -p -r1.8 intel_guc_fw.c
> --- sys/dev/pci/drm/i915/gt/uc/intel_guc_fw.c 7 Feb 2025 03:03:30 -0000       
> 1.8
> +++ sys/dev/pci/drm/i915/gt/uc/intel_guc_fw.c 30 May 2025 05:59:52 -0000
> @@ -197,6 +197,7 @@ static int guc_wait_ucode(struct intel_g
>                       REG_FIELD_GET(GS_BOOTROM_MASK, status),
>                       REG_FIELD_GET(GS_UKERNEL_MASK, status));
>       }
> +     printf("%s: count = %d, ret = %d\n", __func__, count, ret);
>       after = ktime_get();
>       delta = ktime_sub(after, before);
>       delta_ms = ktime_to_ms(delta);
> 
> 

Reply via email to