Re: [Intel-gfx] [PATCH v2] drm/i915/vlv: WA for Turbo and RC6 to work together.

2014-03-05 Thread Ville Syrjälä
On Mon, Mar 03, 2014 at 11:35:50AM +0530, deepa...@intel.com wrote:
 From: Deepak S deepa...@intel.com
 
 With RC6 enabled, BYT has an HW issue in determining the right
 Gfx busyness.
 WA for Turbo + RC6: Use SW based Gfx busy-ness detection to decide
 on increasing/decreasing the freq. This logic will monitor C0
 counters of render/media power-wells over EI period and takes
 necessary action based on these values
 
 v2: Refactor duplicate code. (ville)
 
 Signed-off-by: Deepak S deepa...@intel.com

Did we reach some conclusion about this approach? It seemed to save
power in some workloads at least, but there's still the question
whether it ramps up the frquency fast enoguh to provide a good user
experience. Maybe we should make it optional even on VLV?

 
 ---
  drivers/gpu/drm/i915/i915_drv.h |  19 ++
  drivers/gpu/drm/i915/i915_irq.c | 146 
 ++--
  drivers/gpu/drm/i915/i915_reg.h |  15 +
  drivers/gpu/drm/i915/intel_pm.c |  50 ++
  4 files changed, 213 insertions(+), 17 deletions(-)
 
 diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
 index 728b9c3..2baeeef 100644
 --- a/drivers/gpu/drm/i915/i915_drv.h
 +++ b/drivers/gpu/drm/i915/i915_drv.h
 @@ -957,6 +957,12 @@ struct i915_suspend_saved_registers {
   u32 savePCH_PORT_HOTPLUG;
  };
  
 +struct intel_rps_ei_calc {
 + u32 cz_ts_ei;
 + u32 render_ei_c0;
 + u32 media_ei_c0;
 +};
 +
  struct intel_gen6_power_mgmt {
   /* work and pm_iir are protected by dev_priv-irq_lock */
   struct work_struct work;
 @@ -969,10 +975,16 @@ struct intel_gen6_power_mgmt {
   u8 rp1_delay;
   u8 rp0_delay;
   u8 hw_max;
 + u8 hw_min;

Some leftover still?

  
   bool rp_up_masked;
   bool rp_down_masked;
  
 + u32 cz_freq;

This too seems unused.

 + u32 ei_interrupt_count;
 +
 + bool use_RC0_residency_for_turbo;
 +
   int last_adj;
   enum { LOW_POWER, BETWEEN, HIGH_POWER } power;
  
 @@ -1531,6 +1543,13 @@ typedef struct drm_i915_private {
   /* gen6+ rps state */
   struct intel_gen6_power_mgmt rps;
  
 + /* rps wa up ei calculation */
 + struct intel_rps_ei_calc rps_up_ei;
 +
 + /* rps wa down ei calculation */
 + struct intel_rps_ei_calc rps_down_ei;
 +
 +
   /* ilk-only ips/rps state. Everything in here is protected by the global
* mchdev_lock in intel_pm.c */
   struct intel_ilk_power_mgmt ips;
 diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
 index 56edff3..93b6ebf 100644
 --- a/drivers/gpu/drm/i915/i915_irq.c
 +++ b/drivers/gpu/drm/i915/i915_irq.c
 @@ -1023,6 +1023,120 @@ void gen6_set_pm_mask(struct drm_i915_private 
 *dev_priv,
   }
  }
  
 +static u32 vlv_c0_residency(struct drm_i915_private *dev_priv,
 + struct  intel_rps_ei_calc *rps_ei)
 +{
 + u32 cz_ts, cz_freq_khz;
 + u32 render_count, media_count;
 + u32 elapsed_render, elapsed_media, elapsed_time;
 + u32 residency = 0;
 +
 + cz_ts = vlv_punit_read(dev_priv, PUNIT_REG_CZ_TIMESTAMP);
 + cz_freq_khz = DIV_ROUND_CLOSEST(dev_priv-mem_freq * 1000, 4);
 +
 + render_count = I915_READ(VLV_RENDER_C0_COUNT_REG);
 + media_count = I915_READ(VLV_MEDIA_C0_COUNT_REG);
 +
 + if (rps_ei-cz_ts_ei == 0) {
 + rps_ei-cz_ts_ei = cz_ts;
 + rps_ei-render_ei_c0 = render_count;
 + rps_ei-media_ei_c0 = media_count;
 +
 + return dev_priv-rps.cur_delay;
 + }
 +
 + elapsed_time = cz_ts - rps_ei-cz_ts_ei;
 + rps_ei-cz_ts_ei = cz_ts;
 +
 + elapsed_render = render_count - rps_ei-render_ei_c0;
 + rps_ei-render_ei_c0 = render_count;
 +
 + elapsed_media = media_count - rps_ei-media_ei_c0;
 + rps_ei-media_ei_c0 = media_count;
 +
 + /* Convert all the counters into common unit of milli sec */
 + elapsed_time /= VLV_CZ_CLOCK_TO_MILLI_SEC;
 + elapsed_render /=  cz_freq_khz;
 + elapsed_media /= cz_freq_khz;
 +
 + /* Calculate overall C0 residency percentage only
 + * if elapsed time is non zero
 + */

Badly formatted comment.

 + if (elapsed_time) {
 + residency =
 + ((max(elapsed_render, elapsed_media) * 100)
 + / elapsed_time);
 + }
 +
 + return residency;
 +}
 +
 +
 +/**
 + * vlv_calc_delay_from_C0_counters - Increase/Decrease freq based on GPU
 + * busy-ness calculated from C0 counters of render  media power wells
 + * @dev_priv: DRM device private
 + *
 + */
 +static u32 vlv_calc_delay_from_C0_counters(struct drm_i915_private *dev_priv)
 +{
 + u32 residency_C0_up = 0, residency_C0_down = 0;
 + u8 new_delay;
 +
 + dev_priv-rps.ei_interrupt_count++;
 +
 + WARN_ON(!mutex_is_locked(dev_priv-rps.hw_lock));
 +
 +
 + if (dev_priv-rps_up_ei.cz_ts_ei == 0) {
 + vlv_c0_residency(dev_priv, dev_priv-rps_up_ei);
 + vlv_c0_residency(dev_priv, 

Re: [Intel-gfx] [PATCH v2] drm/i915/vlv: WA for Turbo and RC6 to work together.

2014-03-05 Thread S, Deepak



On 3/5/2014 5:41 PM, Ville Syrjälä wrote:

On Mon, Mar 03, 2014 at 11:35:50AM +0530, deepa...@intel.com wrote:

From: Deepak S deepa...@intel.com

With RC6 enabled, BYT has an HW issue in determining the right
Gfx busyness.
WA for Turbo + RC6: Use SW based Gfx busy-ness detection to decide
on increasing/decreasing the freq. This logic will monitor C0
counters of render/media power-wells over EI period and takes
necessary action based on these values

v2: Refactor duplicate code. (ville)

Signed-off-by: Deepak S deepa...@intel.com


Did we reach some conclusion about this approach? It seemed to save
power in some workloads at least, but there's still the question
whether it ramps up the frquency fast enoguh to provide a good user
experience. Maybe we should make it optional even on VLV?


I have made this as a optional for the VLV. The boost logic is enabled 
by default. If we need power savings then we can turn off the boost and 
regular turbo logic will be enabled.


I will working on other options that Dainel suggested of identifying the 
libva workload and controlling the boost at run time.


I will address the other comments and upload the patch for review.



---
  drivers/gpu/drm/i915/i915_drv.h |  19 ++
  drivers/gpu/drm/i915/i915_irq.c | 146 ++--
  drivers/gpu/drm/i915/i915_reg.h |  15 +
  drivers/gpu/drm/i915/intel_pm.c |  50 ++
  4 files changed, 213 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 728b9c3..2baeeef 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -957,6 +957,12 @@ struct i915_suspend_saved_registers {
u32 savePCH_PORT_HOTPLUG;
  };

+struct intel_rps_ei_calc {
+   u32 cz_ts_ei;
+   u32 render_ei_c0;
+   u32 media_ei_c0;
+};
+
  struct intel_gen6_power_mgmt {
/* work and pm_iir are protected by dev_priv-irq_lock */
struct work_struct work;
@@ -969,10 +975,16 @@ struct intel_gen6_power_mgmt {
u8 rp1_delay;
u8 rp0_delay;
u8 hw_max;
+   u8 hw_min;


Some leftover still?



bool rp_up_masked;
bool rp_down_masked;

+   u32 cz_freq;


This too seems unused.


+   u32 ei_interrupt_count;
+
+   bool use_RC0_residency_for_turbo;
+
int last_adj;
enum { LOW_POWER, BETWEEN, HIGH_POWER } power;

@@ -1531,6 +1543,13 @@ typedef struct drm_i915_private {
/* gen6+ rps state */
struct intel_gen6_power_mgmt rps;

+   /* rps wa up ei calculation */
+   struct intel_rps_ei_calc rps_up_ei;
+
+   /* rps wa down ei calculation */
+   struct intel_rps_ei_calc rps_down_ei;
+
+
/* ilk-only ips/rps state. Everything in here is protected by the global
 * mchdev_lock in intel_pm.c */
struct intel_ilk_power_mgmt ips;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 56edff3..93b6ebf 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1023,6 +1023,120 @@ void gen6_set_pm_mask(struct drm_i915_private *dev_priv,
}
  }

+static u32 vlv_c0_residency(struct drm_i915_private *dev_priv,
+   struct  intel_rps_ei_calc *rps_ei)
+{
+   u32 cz_ts, cz_freq_khz;
+   u32 render_count, media_count;
+   u32 elapsed_render, elapsed_media, elapsed_time;
+   u32 residency = 0;
+
+   cz_ts = vlv_punit_read(dev_priv, PUNIT_REG_CZ_TIMESTAMP);
+   cz_freq_khz = DIV_ROUND_CLOSEST(dev_priv-mem_freq * 1000, 4);
+
+   render_count = I915_READ(VLV_RENDER_C0_COUNT_REG);
+   media_count = I915_READ(VLV_MEDIA_C0_COUNT_REG);
+
+   if (rps_ei-cz_ts_ei == 0) {
+   rps_ei-cz_ts_ei = cz_ts;
+   rps_ei-render_ei_c0 = render_count;
+   rps_ei-media_ei_c0 = media_count;
+
+   return dev_priv-rps.cur_delay;
+   }
+
+   elapsed_time = cz_ts - rps_ei-cz_ts_ei;
+   rps_ei-cz_ts_ei = cz_ts;
+
+   elapsed_render = render_count - rps_ei-render_ei_c0;
+   rps_ei-render_ei_c0 = render_count;
+
+   elapsed_media = media_count - rps_ei-media_ei_c0;
+   rps_ei-media_ei_c0 = media_count;
+
+   /* Convert all the counters into common unit of milli sec */
+   elapsed_time /= VLV_CZ_CLOCK_TO_MILLI_SEC;
+   elapsed_render /=  cz_freq_khz;
+   elapsed_media /= cz_freq_khz;
+
+   /* Calculate overall C0 residency percentage only
+   * if elapsed time is non zero
+   */


Badly formatted comment.


+   if (elapsed_time) {
+   residency =
+   ((max(elapsed_render, elapsed_media) * 100)
+   / elapsed_time);
+   }
+
+   return residency;
+}
+
+
+/**
+ * vlv_calc_delay_from_C0_counters - Increase/Decrease freq based on GPU
+ * busy-ness calculated from C0 counters of render  media power wells
+ * @dev_priv: DRM device private
+ *
+ */

Re: [Intel-gfx] [PATCH v2] drm/i915/vlv: WA for Turbo and RC6 to work together.

2014-03-04 Thread S, Deepak

Hi Ville,

Please review the patch and share the comments

Thanks
Deepak

On 3/3/2014 11:35 AM, deepa...@intel.com wrote:

From: Deepak S deepa...@intel.com

With RC6 enabled, BYT has an HW issue in determining the right
Gfx busyness.
WA for Turbo + RC6: Use SW based Gfx busy-ness detection to decide
on increasing/decreasing the freq. This logic will monitor C0
counters of render/media power-wells over EI period and takes
necessary action based on these values

v2: Refactor duplicate code. (ville)

Signed-off-by: Deepak S deepa...@intel.com

---
  drivers/gpu/drm/i915/i915_drv.h |  19 ++
  drivers/gpu/drm/i915/i915_irq.c | 146 ++--
  drivers/gpu/drm/i915/i915_reg.h |  15 +
  drivers/gpu/drm/i915/intel_pm.c |  50 ++
  4 files changed, 213 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 728b9c3..2baeeef 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -957,6 +957,12 @@ struct i915_suspend_saved_registers {
u32 savePCH_PORT_HOTPLUG;
  };

+struct intel_rps_ei_calc {
+   u32 cz_ts_ei;
+   u32 render_ei_c0;
+   u32 media_ei_c0;
+};
+
  struct intel_gen6_power_mgmt {
/* work and pm_iir are protected by dev_priv-irq_lock */
struct work_struct work;
@@ -969,10 +975,16 @@ struct intel_gen6_power_mgmt {
u8 rp1_delay;
u8 rp0_delay;
u8 hw_max;
+   u8 hw_min;

bool rp_up_masked;
bool rp_down_masked;

+   u32 cz_freq;
+   u32 ei_interrupt_count;
+
+   bool use_RC0_residency_for_turbo;
+
int last_adj;
enum { LOW_POWER, BETWEEN, HIGH_POWER } power;

@@ -1531,6 +1543,13 @@ typedef struct drm_i915_private {
/* gen6+ rps state */
struct intel_gen6_power_mgmt rps;

+   /* rps wa up ei calculation */
+   struct intel_rps_ei_calc rps_up_ei;
+
+   /* rps wa down ei calculation */
+   struct intel_rps_ei_calc rps_down_ei;
+
+
/* ilk-only ips/rps state. Everything in here is protected by the global
 * mchdev_lock in intel_pm.c */
struct intel_ilk_power_mgmt ips;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 56edff3..93b6ebf 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1023,6 +1023,120 @@ void gen6_set_pm_mask(struct drm_i915_private *dev_priv,
}
  }

+static u32 vlv_c0_residency(struct drm_i915_private *dev_priv,
+   struct  intel_rps_ei_calc *rps_ei)
+{
+   u32 cz_ts, cz_freq_khz;
+   u32 render_count, media_count;
+   u32 elapsed_render, elapsed_media, elapsed_time;
+   u32 residency = 0;
+
+   cz_ts = vlv_punit_read(dev_priv, PUNIT_REG_CZ_TIMESTAMP);
+   cz_freq_khz = DIV_ROUND_CLOSEST(dev_priv-mem_freq * 1000, 4);
+
+   render_count = I915_READ(VLV_RENDER_C0_COUNT_REG);
+   media_count = I915_READ(VLV_MEDIA_C0_COUNT_REG);
+
+   if (rps_ei-cz_ts_ei == 0) {
+   rps_ei-cz_ts_ei = cz_ts;
+   rps_ei-render_ei_c0 = render_count;
+   rps_ei-media_ei_c0 = media_count;
+
+   return dev_priv-rps.cur_delay;
+   }
+
+   elapsed_time = cz_ts - rps_ei-cz_ts_ei;
+   rps_ei-cz_ts_ei = cz_ts;
+
+   elapsed_render = render_count - rps_ei-render_ei_c0;
+   rps_ei-render_ei_c0 = render_count;
+
+   elapsed_media = media_count - rps_ei-media_ei_c0;
+   rps_ei-media_ei_c0 = media_count;
+
+   /* Convert all the counters into common unit of milli sec */
+   elapsed_time /= VLV_CZ_CLOCK_TO_MILLI_SEC;
+   elapsed_render /=  cz_freq_khz;
+   elapsed_media /= cz_freq_khz;
+
+   /* Calculate overall C0 residency percentage only
+   * if elapsed time is non zero
+   */
+   if (elapsed_time) {
+   residency =
+   ((max(elapsed_render, elapsed_media) * 100)
+   / elapsed_time);
+   }
+
+   return residency;
+}
+
+
+/**
+ * vlv_calc_delay_from_C0_counters - Increase/Decrease freq based on GPU
+ * busy-ness calculated from C0 counters of render  media power wells
+ * @dev_priv: DRM device private
+ *
+ */
+static u32 vlv_calc_delay_from_C0_counters(struct drm_i915_private *dev_priv)
+{
+   u32 residency_C0_up = 0, residency_C0_down = 0;
+   u8 new_delay;
+
+   dev_priv-rps.ei_interrupt_count++;
+
+   WARN_ON(!mutex_is_locked(dev_priv-rps.hw_lock));
+
+
+   if (dev_priv-rps_up_ei.cz_ts_ei == 0) {
+   vlv_c0_residency(dev_priv, dev_priv-rps_up_ei);
+   vlv_c0_residency(dev_priv, dev_priv-rps_down_ei);
+   return dev_priv-rps.cur_delay;
+   }
+
+
+   /* To down throttle, C0 residency should be less than down threshold
+   * for continous EI intervals. So calculate down EI counters
+   * once in VLV_INT_COUNT_FOR_DOWN_EI
+   */
+   if 

[Intel-gfx] [PATCH v2] drm/i915/vlv: WA for Turbo and RC6 to work together.

2014-03-02 Thread deepak . s
From: Deepak S deepa...@intel.com

With RC6 enabled, BYT has an HW issue in determining the right
Gfx busyness.
WA for Turbo + RC6: Use SW based Gfx busy-ness detection to decide
on increasing/decreasing the freq. This logic will monitor C0
counters of render/media power-wells over EI period and takes
necessary action based on these values

v2: Refactor duplicate code. (ville)

Signed-off-by: Deepak S deepa...@intel.com

---
 drivers/gpu/drm/i915/i915_drv.h |  19 ++
 drivers/gpu/drm/i915/i915_irq.c | 146 ++--
 drivers/gpu/drm/i915/i915_reg.h |  15 +
 drivers/gpu/drm/i915/intel_pm.c |  50 ++
 4 files changed, 213 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 728b9c3..2baeeef 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -957,6 +957,12 @@ struct i915_suspend_saved_registers {
u32 savePCH_PORT_HOTPLUG;
 };
 
+struct intel_rps_ei_calc {
+   u32 cz_ts_ei;
+   u32 render_ei_c0;
+   u32 media_ei_c0;
+};
+
 struct intel_gen6_power_mgmt {
/* work and pm_iir are protected by dev_priv-irq_lock */
struct work_struct work;
@@ -969,10 +975,16 @@ struct intel_gen6_power_mgmt {
u8 rp1_delay;
u8 rp0_delay;
u8 hw_max;
+   u8 hw_min;
 
bool rp_up_masked;
bool rp_down_masked;
 
+   u32 cz_freq;
+   u32 ei_interrupt_count;
+
+   bool use_RC0_residency_for_turbo;
+
int last_adj;
enum { LOW_POWER, BETWEEN, HIGH_POWER } power;
 
@@ -1531,6 +1543,13 @@ typedef struct drm_i915_private {
/* gen6+ rps state */
struct intel_gen6_power_mgmt rps;
 
+   /* rps wa up ei calculation */
+   struct intel_rps_ei_calc rps_up_ei;
+
+   /* rps wa down ei calculation */
+   struct intel_rps_ei_calc rps_down_ei;
+
+
/* ilk-only ips/rps state. Everything in here is protected by the global
 * mchdev_lock in intel_pm.c */
struct intel_ilk_power_mgmt ips;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 56edff3..93b6ebf 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1023,6 +1023,120 @@ void gen6_set_pm_mask(struct drm_i915_private *dev_priv,
}
 }
 
+static u32 vlv_c0_residency(struct drm_i915_private *dev_priv,
+   struct  intel_rps_ei_calc *rps_ei)
+{
+   u32 cz_ts, cz_freq_khz;
+   u32 render_count, media_count;
+   u32 elapsed_render, elapsed_media, elapsed_time;
+   u32 residency = 0;
+
+   cz_ts = vlv_punit_read(dev_priv, PUNIT_REG_CZ_TIMESTAMP);
+   cz_freq_khz = DIV_ROUND_CLOSEST(dev_priv-mem_freq * 1000, 4);
+
+   render_count = I915_READ(VLV_RENDER_C0_COUNT_REG);
+   media_count = I915_READ(VLV_MEDIA_C0_COUNT_REG);
+
+   if (rps_ei-cz_ts_ei == 0) {
+   rps_ei-cz_ts_ei = cz_ts;
+   rps_ei-render_ei_c0 = render_count;
+   rps_ei-media_ei_c0 = media_count;
+
+   return dev_priv-rps.cur_delay;
+   }
+
+   elapsed_time = cz_ts - rps_ei-cz_ts_ei;
+   rps_ei-cz_ts_ei = cz_ts;
+
+   elapsed_render = render_count - rps_ei-render_ei_c0;
+   rps_ei-render_ei_c0 = render_count;
+
+   elapsed_media = media_count - rps_ei-media_ei_c0;
+   rps_ei-media_ei_c0 = media_count;
+
+   /* Convert all the counters into common unit of milli sec */
+   elapsed_time /= VLV_CZ_CLOCK_TO_MILLI_SEC;
+   elapsed_render /=  cz_freq_khz;
+   elapsed_media /= cz_freq_khz;
+
+   /* Calculate overall C0 residency percentage only
+   * if elapsed time is non zero
+   */
+   if (elapsed_time) {
+   residency =
+   ((max(elapsed_render, elapsed_media) * 100)
+   / elapsed_time);
+   }
+
+   return residency;
+}
+
+
+/**
+ * vlv_calc_delay_from_C0_counters - Increase/Decrease freq based on GPU
+ * busy-ness calculated from C0 counters of render  media power wells
+ * @dev_priv: DRM device private
+ *
+ */
+static u32 vlv_calc_delay_from_C0_counters(struct drm_i915_private *dev_priv)
+{
+   u32 residency_C0_up = 0, residency_C0_down = 0;
+   u8 new_delay;
+
+   dev_priv-rps.ei_interrupt_count++;
+
+   WARN_ON(!mutex_is_locked(dev_priv-rps.hw_lock));
+
+
+   if (dev_priv-rps_up_ei.cz_ts_ei == 0) {
+   vlv_c0_residency(dev_priv, dev_priv-rps_up_ei);
+   vlv_c0_residency(dev_priv, dev_priv-rps_down_ei);
+   return dev_priv-rps.cur_delay;
+   }
+
+
+   /* To down throttle, C0 residency should be less than down threshold
+   * for continous EI intervals. So calculate down EI counters
+   * once in VLV_INT_COUNT_FOR_DOWN_EI
+   */
+   if (dev_priv-rps.ei_interrupt_count == VLV_INT_COUNT_FOR_DOWN_EI) {
+
+   dev_priv-rps.ei_interrupt_count = 0;
+
+