Re: [Intel-gfx] [PATCH v2] drm/i915/vlv: WA for Turbo and RC6 to work together.
On Mon, Mar 03, 2014 at 11:35:50AM +0530, deepa...@intel.com wrote: From: Deepak S deepa...@intel.com With RC6 enabled, BYT has an HW issue in determining the right Gfx busyness. WA for Turbo + RC6: Use SW based Gfx busy-ness detection to decide on increasing/decreasing the freq. This logic will monitor C0 counters of render/media power-wells over EI period and takes necessary action based on these values v2: Refactor duplicate code. (ville) Signed-off-by: Deepak S deepa...@intel.com Did we reach some conclusion about this approach? It seemed to save power in some workloads at least, but there's still the question whether it ramps up the frquency fast enoguh to provide a good user experience. Maybe we should make it optional even on VLV? --- drivers/gpu/drm/i915/i915_drv.h | 19 ++ drivers/gpu/drm/i915/i915_irq.c | 146 ++-- drivers/gpu/drm/i915/i915_reg.h | 15 + drivers/gpu/drm/i915/intel_pm.c | 50 ++ 4 files changed, 213 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 728b9c3..2baeeef 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -957,6 +957,12 @@ struct i915_suspend_saved_registers { u32 savePCH_PORT_HOTPLUG; }; +struct intel_rps_ei_calc { + u32 cz_ts_ei; + u32 render_ei_c0; + u32 media_ei_c0; +}; + struct intel_gen6_power_mgmt { /* work and pm_iir are protected by dev_priv-irq_lock */ struct work_struct work; @@ -969,10 +975,16 @@ struct intel_gen6_power_mgmt { u8 rp1_delay; u8 rp0_delay; u8 hw_max; + u8 hw_min; Some leftover still? bool rp_up_masked; bool rp_down_masked; + u32 cz_freq; This too seems unused. + u32 ei_interrupt_count; + + bool use_RC0_residency_for_turbo; + int last_adj; enum { LOW_POWER, BETWEEN, HIGH_POWER } power; @@ -1531,6 +1543,13 @@ typedef struct drm_i915_private { /* gen6+ rps state */ struct intel_gen6_power_mgmt rps; + /* rps wa up ei calculation */ + struct intel_rps_ei_calc rps_up_ei; + + /* rps wa down ei calculation */ + struct intel_rps_ei_calc rps_down_ei; + + /* ilk-only ips/rps state. Everything in here is protected by the global * mchdev_lock in intel_pm.c */ struct intel_ilk_power_mgmt ips; diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 56edff3..93b6ebf 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1023,6 +1023,120 @@ void gen6_set_pm_mask(struct drm_i915_private *dev_priv, } } +static u32 vlv_c0_residency(struct drm_i915_private *dev_priv, + struct intel_rps_ei_calc *rps_ei) +{ + u32 cz_ts, cz_freq_khz; + u32 render_count, media_count; + u32 elapsed_render, elapsed_media, elapsed_time; + u32 residency = 0; + + cz_ts = vlv_punit_read(dev_priv, PUNIT_REG_CZ_TIMESTAMP); + cz_freq_khz = DIV_ROUND_CLOSEST(dev_priv-mem_freq * 1000, 4); + + render_count = I915_READ(VLV_RENDER_C0_COUNT_REG); + media_count = I915_READ(VLV_MEDIA_C0_COUNT_REG); + + if (rps_ei-cz_ts_ei == 0) { + rps_ei-cz_ts_ei = cz_ts; + rps_ei-render_ei_c0 = render_count; + rps_ei-media_ei_c0 = media_count; + + return dev_priv-rps.cur_delay; + } + + elapsed_time = cz_ts - rps_ei-cz_ts_ei; + rps_ei-cz_ts_ei = cz_ts; + + elapsed_render = render_count - rps_ei-render_ei_c0; + rps_ei-render_ei_c0 = render_count; + + elapsed_media = media_count - rps_ei-media_ei_c0; + rps_ei-media_ei_c0 = media_count; + + /* Convert all the counters into common unit of milli sec */ + elapsed_time /= VLV_CZ_CLOCK_TO_MILLI_SEC; + elapsed_render /= cz_freq_khz; + elapsed_media /= cz_freq_khz; + + /* Calculate overall C0 residency percentage only + * if elapsed time is non zero + */ Badly formatted comment. + if (elapsed_time) { + residency = + ((max(elapsed_render, elapsed_media) * 100) + / elapsed_time); + } + + return residency; +} + + +/** + * vlv_calc_delay_from_C0_counters - Increase/Decrease freq based on GPU + * busy-ness calculated from C0 counters of render media power wells + * @dev_priv: DRM device private + * + */ +static u32 vlv_calc_delay_from_C0_counters(struct drm_i915_private *dev_priv) +{ + u32 residency_C0_up = 0, residency_C0_down = 0; + u8 new_delay; + + dev_priv-rps.ei_interrupt_count++; + + WARN_ON(!mutex_is_locked(dev_priv-rps.hw_lock)); + + + if (dev_priv-rps_up_ei.cz_ts_ei == 0) { + vlv_c0_residency(dev_priv, dev_priv-rps_up_ei); + vlv_c0_residency(dev_priv,
Re: [Intel-gfx] [PATCH v2] drm/i915/vlv: WA for Turbo and RC6 to work together.
On 3/5/2014 5:41 PM, Ville Syrjälä wrote: On Mon, Mar 03, 2014 at 11:35:50AM +0530, deepa...@intel.com wrote: From: Deepak S deepa...@intel.com With RC6 enabled, BYT has an HW issue in determining the right Gfx busyness. WA for Turbo + RC6: Use SW based Gfx busy-ness detection to decide on increasing/decreasing the freq. This logic will monitor C0 counters of render/media power-wells over EI period and takes necessary action based on these values v2: Refactor duplicate code. (ville) Signed-off-by: Deepak S deepa...@intel.com Did we reach some conclusion about this approach? It seemed to save power in some workloads at least, but there's still the question whether it ramps up the frquency fast enoguh to provide a good user experience. Maybe we should make it optional even on VLV? I have made this as a optional for the VLV. The boost logic is enabled by default. If we need power savings then we can turn off the boost and regular turbo logic will be enabled. I will working on other options that Dainel suggested of identifying the libva workload and controlling the boost at run time. I will address the other comments and upload the patch for review. --- drivers/gpu/drm/i915/i915_drv.h | 19 ++ drivers/gpu/drm/i915/i915_irq.c | 146 ++-- drivers/gpu/drm/i915/i915_reg.h | 15 + drivers/gpu/drm/i915/intel_pm.c | 50 ++ 4 files changed, 213 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 728b9c3..2baeeef 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -957,6 +957,12 @@ struct i915_suspend_saved_registers { u32 savePCH_PORT_HOTPLUG; }; +struct intel_rps_ei_calc { + u32 cz_ts_ei; + u32 render_ei_c0; + u32 media_ei_c0; +}; + struct intel_gen6_power_mgmt { /* work and pm_iir are protected by dev_priv-irq_lock */ struct work_struct work; @@ -969,10 +975,16 @@ struct intel_gen6_power_mgmt { u8 rp1_delay; u8 rp0_delay; u8 hw_max; + u8 hw_min; Some leftover still? bool rp_up_masked; bool rp_down_masked; + u32 cz_freq; This too seems unused. + u32 ei_interrupt_count; + + bool use_RC0_residency_for_turbo; + int last_adj; enum { LOW_POWER, BETWEEN, HIGH_POWER } power; @@ -1531,6 +1543,13 @@ typedef struct drm_i915_private { /* gen6+ rps state */ struct intel_gen6_power_mgmt rps; + /* rps wa up ei calculation */ + struct intel_rps_ei_calc rps_up_ei; + + /* rps wa down ei calculation */ + struct intel_rps_ei_calc rps_down_ei; + + /* ilk-only ips/rps state. Everything in here is protected by the global * mchdev_lock in intel_pm.c */ struct intel_ilk_power_mgmt ips; diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 56edff3..93b6ebf 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1023,6 +1023,120 @@ void gen6_set_pm_mask(struct drm_i915_private *dev_priv, } } +static u32 vlv_c0_residency(struct drm_i915_private *dev_priv, + struct intel_rps_ei_calc *rps_ei) +{ + u32 cz_ts, cz_freq_khz; + u32 render_count, media_count; + u32 elapsed_render, elapsed_media, elapsed_time; + u32 residency = 0; + + cz_ts = vlv_punit_read(dev_priv, PUNIT_REG_CZ_TIMESTAMP); + cz_freq_khz = DIV_ROUND_CLOSEST(dev_priv-mem_freq * 1000, 4); + + render_count = I915_READ(VLV_RENDER_C0_COUNT_REG); + media_count = I915_READ(VLV_MEDIA_C0_COUNT_REG); + + if (rps_ei-cz_ts_ei == 0) { + rps_ei-cz_ts_ei = cz_ts; + rps_ei-render_ei_c0 = render_count; + rps_ei-media_ei_c0 = media_count; + + return dev_priv-rps.cur_delay; + } + + elapsed_time = cz_ts - rps_ei-cz_ts_ei; + rps_ei-cz_ts_ei = cz_ts; + + elapsed_render = render_count - rps_ei-render_ei_c0; + rps_ei-render_ei_c0 = render_count; + + elapsed_media = media_count - rps_ei-media_ei_c0; + rps_ei-media_ei_c0 = media_count; + + /* Convert all the counters into common unit of milli sec */ + elapsed_time /= VLV_CZ_CLOCK_TO_MILLI_SEC; + elapsed_render /= cz_freq_khz; + elapsed_media /= cz_freq_khz; + + /* Calculate overall C0 residency percentage only + * if elapsed time is non zero + */ Badly formatted comment. + if (elapsed_time) { + residency = + ((max(elapsed_render, elapsed_media) * 100) + / elapsed_time); + } + + return residency; +} + + +/** + * vlv_calc_delay_from_C0_counters - Increase/Decrease freq based on GPU + * busy-ness calculated from C0 counters of render media power wells + * @dev_priv: DRM device private + * + */
Re: [Intel-gfx] [PATCH v2] drm/i915/vlv: WA for Turbo and RC6 to work together.
Hi Ville, Please review the patch and share the comments Thanks Deepak On 3/3/2014 11:35 AM, deepa...@intel.com wrote: From: Deepak S deepa...@intel.com With RC6 enabled, BYT has an HW issue in determining the right Gfx busyness. WA for Turbo + RC6: Use SW based Gfx busy-ness detection to decide on increasing/decreasing the freq. This logic will monitor C0 counters of render/media power-wells over EI period and takes necessary action based on these values v2: Refactor duplicate code. (ville) Signed-off-by: Deepak S deepa...@intel.com --- drivers/gpu/drm/i915/i915_drv.h | 19 ++ drivers/gpu/drm/i915/i915_irq.c | 146 ++-- drivers/gpu/drm/i915/i915_reg.h | 15 + drivers/gpu/drm/i915/intel_pm.c | 50 ++ 4 files changed, 213 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 728b9c3..2baeeef 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -957,6 +957,12 @@ struct i915_suspend_saved_registers { u32 savePCH_PORT_HOTPLUG; }; +struct intel_rps_ei_calc { + u32 cz_ts_ei; + u32 render_ei_c0; + u32 media_ei_c0; +}; + struct intel_gen6_power_mgmt { /* work and pm_iir are protected by dev_priv-irq_lock */ struct work_struct work; @@ -969,10 +975,16 @@ struct intel_gen6_power_mgmt { u8 rp1_delay; u8 rp0_delay; u8 hw_max; + u8 hw_min; bool rp_up_masked; bool rp_down_masked; + u32 cz_freq; + u32 ei_interrupt_count; + + bool use_RC0_residency_for_turbo; + int last_adj; enum { LOW_POWER, BETWEEN, HIGH_POWER } power; @@ -1531,6 +1543,13 @@ typedef struct drm_i915_private { /* gen6+ rps state */ struct intel_gen6_power_mgmt rps; + /* rps wa up ei calculation */ + struct intel_rps_ei_calc rps_up_ei; + + /* rps wa down ei calculation */ + struct intel_rps_ei_calc rps_down_ei; + + /* ilk-only ips/rps state. Everything in here is protected by the global * mchdev_lock in intel_pm.c */ struct intel_ilk_power_mgmt ips; diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 56edff3..93b6ebf 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1023,6 +1023,120 @@ void gen6_set_pm_mask(struct drm_i915_private *dev_priv, } } +static u32 vlv_c0_residency(struct drm_i915_private *dev_priv, + struct intel_rps_ei_calc *rps_ei) +{ + u32 cz_ts, cz_freq_khz; + u32 render_count, media_count; + u32 elapsed_render, elapsed_media, elapsed_time; + u32 residency = 0; + + cz_ts = vlv_punit_read(dev_priv, PUNIT_REG_CZ_TIMESTAMP); + cz_freq_khz = DIV_ROUND_CLOSEST(dev_priv-mem_freq * 1000, 4); + + render_count = I915_READ(VLV_RENDER_C0_COUNT_REG); + media_count = I915_READ(VLV_MEDIA_C0_COUNT_REG); + + if (rps_ei-cz_ts_ei == 0) { + rps_ei-cz_ts_ei = cz_ts; + rps_ei-render_ei_c0 = render_count; + rps_ei-media_ei_c0 = media_count; + + return dev_priv-rps.cur_delay; + } + + elapsed_time = cz_ts - rps_ei-cz_ts_ei; + rps_ei-cz_ts_ei = cz_ts; + + elapsed_render = render_count - rps_ei-render_ei_c0; + rps_ei-render_ei_c0 = render_count; + + elapsed_media = media_count - rps_ei-media_ei_c0; + rps_ei-media_ei_c0 = media_count; + + /* Convert all the counters into common unit of milli sec */ + elapsed_time /= VLV_CZ_CLOCK_TO_MILLI_SEC; + elapsed_render /= cz_freq_khz; + elapsed_media /= cz_freq_khz; + + /* Calculate overall C0 residency percentage only + * if elapsed time is non zero + */ + if (elapsed_time) { + residency = + ((max(elapsed_render, elapsed_media) * 100) + / elapsed_time); + } + + return residency; +} + + +/** + * vlv_calc_delay_from_C0_counters - Increase/Decrease freq based on GPU + * busy-ness calculated from C0 counters of render media power wells + * @dev_priv: DRM device private + * + */ +static u32 vlv_calc_delay_from_C0_counters(struct drm_i915_private *dev_priv) +{ + u32 residency_C0_up = 0, residency_C0_down = 0; + u8 new_delay; + + dev_priv-rps.ei_interrupt_count++; + + WARN_ON(!mutex_is_locked(dev_priv-rps.hw_lock)); + + + if (dev_priv-rps_up_ei.cz_ts_ei == 0) { + vlv_c0_residency(dev_priv, dev_priv-rps_up_ei); + vlv_c0_residency(dev_priv, dev_priv-rps_down_ei); + return dev_priv-rps.cur_delay; + } + + + /* To down throttle, C0 residency should be less than down threshold + * for continous EI intervals. So calculate down EI counters + * once in VLV_INT_COUNT_FOR_DOWN_EI + */ + if
[Intel-gfx] [PATCH v2] drm/i915/vlv: WA for Turbo and RC6 to work together.
From: Deepak S deepa...@intel.com With RC6 enabled, BYT has an HW issue in determining the right Gfx busyness. WA for Turbo + RC6: Use SW based Gfx busy-ness detection to decide on increasing/decreasing the freq. This logic will monitor C0 counters of render/media power-wells over EI period and takes necessary action based on these values v2: Refactor duplicate code. (ville) Signed-off-by: Deepak S deepa...@intel.com --- drivers/gpu/drm/i915/i915_drv.h | 19 ++ drivers/gpu/drm/i915/i915_irq.c | 146 ++-- drivers/gpu/drm/i915/i915_reg.h | 15 + drivers/gpu/drm/i915/intel_pm.c | 50 ++ 4 files changed, 213 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 728b9c3..2baeeef 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -957,6 +957,12 @@ struct i915_suspend_saved_registers { u32 savePCH_PORT_HOTPLUG; }; +struct intel_rps_ei_calc { + u32 cz_ts_ei; + u32 render_ei_c0; + u32 media_ei_c0; +}; + struct intel_gen6_power_mgmt { /* work and pm_iir are protected by dev_priv-irq_lock */ struct work_struct work; @@ -969,10 +975,16 @@ struct intel_gen6_power_mgmt { u8 rp1_delay; u8 rp0_delay; u8 hw_max; + u8 hw_min; bool rp_up_masked; bool rp_down_masked; + u32 cz_freq; + u32 ei_interrupt_count; + + bool use_RC0_residency_for_turbo; + int last_adj; enum { LOW_POWER, BETWEEN, HIGH_POWER } power; @@ -1531,6 +1543,13 @@ typedef struct drm_i915_private { /* gen6+ rps state */ struct intel_gen6_power_mgmt rps; + /* rps wa up ei calculation */ + struct intel_rps_ei_calc rps_up_ei; + + /* rps wa down ei calculation */ + struct intel_rps_ei_calc rps_down_ei; + + /* ilk-only ips/rps state. Everything in here is protected by the global * mchdev_lock in intel_pm.c */ struct intel_ilk_power_mgmt ips; diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 56edff3..93b6ebf 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1023,6 +1023,120 @@ void gen6_set_pm_mask(struct drm_i915_private *dev_priv, } } +static u32 vlv_c0_residency(struct drm_i915_private *dev_priv, + struct intel_rps_ei_calc *rps_ei) +{ + u32 cz_ts, cz_freq_khz; + u32 render_count, media_count; + u32 elapsed_render, elapsed_media, elapsed_time; + u32 residency = 0; + + cz_ts = vlv_punit_read(dev_priv, PUNIT_REG_CZ_TIMESTAMP); + cz_freq_khz = DIV_ROUND_CLOSEST(dev_priv-mem_freq * 1000, 4); + + render_count = I915_READ(VLV_RENDER_C0_COUNT_REG); + media_count = I915_READ(VLV_MEDIA_C0_COUNT_REG); + + if (rps_ei-cz_ts_ei == 0) { + rps_ei-cz_ts_ei = cz_ts; + rps_ei-render_ei_c0 = render_count; + rps_ei-media_ei_c0 = media_count; + + return dev_priv-rps.cur_delay; + } + + elapsed_time = cz_ts - rps_ei-cz_ts_ei; + rps_ei-cz_ts_ei = cz_ts; + + elapsed_render = render_count - rps_ei-render_ei_c0; + rps_ei-render_ei_c0 = render_count; + + elapsed_media = media_count - rps_ei-media_ei_c0; + rps_ei-media_ei_c0 = media_count; + + /* Convert all the counters into common unit of milli sec */ + elapsed_time /= VLV_CZ_CLOCK_TO_MILLI_SEC; + elapsed_render /= cz_freq_khz; + elapsed_media /= cz_freq_khz; + + /* Calculate overall C0 residency percentage only + * if elapsed time is non zero + */ + if (elapsed_time) { + residency = + ((max(elapsed_render, elapsed_media) * 100) + / elapsed_time); + } + + return residency; +} + + +/** + * vlv_calc_delay_from_C0_counters - Increase/Decrease freq based on GPU + * busy-ness calculated from C0 counters of render media power wells + * @dev_priv: DRM device private + * + */ +static u32 vlv_calc_delay_from_C0_counters(struct drm_i915_private *dev_priv) +{ + u32 residency_C0_up = 0, residency_C0_down = 0; + u8 new_delay; + + dev_priv-rps.ei_interrupt_count++; + + WARN_ON(!mutex_is_locked(dev_priv-rps.hw_lock)); + + + if (dev_priv-rps_up_ei.cz_ts_ei == 0) { + vlv_c0_residency(dev_priv, dev_priv-rps_up_ei); + vlv_c0_residency(dev_priv, dev_priv-rps_down_ei); + return dev_priv-rps.cur_delay; + } + + + /* To down throttle, C0 residency should be less than down threshold + * for continous EI intervals. So calculate down EI counters + * once in VLV_INT_COUNT_FOR_DOWN_EI + */ + if (dev_priv-rps.ei_interrupt_count == VLV_INT_COUNT_FOR_DOWN_EI) { + + dev_priv-rps.ei_interrupt_count = 0; + +