Re: [Mesa-dev] [PATCH v3 11/42] i965: Implement L3 state atom.

2015-11-26 Thread Francisco Jerez
Matt Turner  writes:

> On Wed, Nov 25, 2015 at 6:26 AM, Francisco Jerez  
> wrote:
>> The L3 state atom calculates the target L3 partition weights when the
>> program bound to some shader stage is modified, and in case they are
>> far enough from the current partitioning it makes sure that the L3
>> state is re-emitted.
>>
>> v3: Fix for inconsistent units the context URB size is expressed in.
>> Clamp URB size to 1008 KB on SKL due to FF hardware limitation.
>> ---
>>  src/mesa/drivers/dri/i965/brw_context.h   |  6 +++
>>  src/mesa/drivers/dri/i965/brw_state.h |  1 +
>>  src/mesa/drivers/dri/i965/gen7_l3_state.c | 81 
>> +++
>>  3 files changed, 88 insertions(+)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
>> b/src/mesa/drivers/dri/i965/brw_context.h
>> index 9983454..b4f8de6 100644
>> --- a/src/mesa/drivers/dri/i965/brw_context.h
>> +++ b/src/mesa/drivers/dri/i965/brw_context.h
>> @@ -678,6 +678,8 @@ enum brw_predicate_state {
>>
>>  struct shader_times;
>>
>> +struct brw_l3_config;
>> +
>>  /**
>>   * brw_context is derived from gl_context.
>>   */
>> @@ -1220,6 +1222,10 @@ struct brw_context
>> int basevertex;
>>
>> struct {
>> +  const struct brw_l3_config *config;
>> +   } l3;
>> +
>> +   struct {
>>drm_intel_bo *bo;
>>const char **names;
>>int *ids;
>> diff --git a/src/mesa/drivers/dri/i965/brw_state.h 
>> b/src/mesa/drivers/dri/i965/brw_state.h
>> index 94734ba..49f301a 100644
>> --- a/src/mesa/drivers/dri/i965/brw_state.h
>> +++ b/src/mesa/drivers/dri/i965/brw_state.h
>> @@ -129,6 +129,7 @@ extern const struct brw_tracked_state gen7_depthbuffer;
>>  extern const struct brw_tracked_state gen7_clip_state;
>>  extern const struct brw_tracked_state gen7_disable_stages;
>>  extern const struct brw_tracked_state gen7_gs_state;
>> +extern const struct brw_tracked_state gen7_l3_state;
>>  extern const struct brw_tracked_state gen7_ps_state;
>>  extern const struct brw_tracked_state gen7_push_constant_space;
>>  extern const struct brw_tracked_state gen7_sbe_state;
>> diff --git a/src/mesa/drivers/dri/i965/gen7_l3_state.c 
>> b/src/mesa/drivers/dri/i965/gen7_l3_state.c
>> index 70a61ae..4c314f5 100644
>> --- a/src/mesa/drivers/dri/i965/gen7_l3_state.c
>> +++ b/src/mesa/drivers/dri/i965/gen7_l3_state.c
>> @@ -418,3 +418,84 @@ setup_l3_config(struct brw_context *brw, const struct 
>> brw_l3_config *cfg)
>>}
>> }
>>  }
>> +
>> +/**
>> + * Return the unit brw_context::urb::size is expressed in, in KB.  \sa
>> + * brw_device_info::urb::size.
>> + */
>> +static unsigned
>> +get_urb_size_scale(const struct brw_device_info *devinfo)
>> +{
>> +   return (devinfo->gen >= 8 ? devinfo->num_slices : 1);
>> +}
>> +
>> +/**
>> + * Update the URB size in the context state for the specified L3
>> + * configuration.
>> + */
>> +static void
>> +update_urb_size(struct brw_context *brw, const struct brw_l3_config *cfg)
>> +{
>> +   const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
>> +   /* From the SKL "L3 Allocation and Programming" documentation:
>> +*
>> +* "URB is limited to 1008KB due to programming restrictions.  This is 
>> not
>> +* a restriction of the L3 implementation, but of the FF and other 
>> clients.
>> +* Therefore, in a GT4 implementation it is possible for the programmed
>> +* allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
>> +* only 1008KB of this will be used."
>> +*/
>> +   const unsigned max = (devinfo->gen == 9 ? 1008 : ~0);
>> +   const unsigned sz =
>> +  MIN2(max, cfg->n[L3P_URB] * get_l3_way_size(devinfo)) /
>> +  get_urb_size_scale(devinfo);
>> +
>> +   if (brw->urb.size != sz) {
>> +  brw->urb.size = sz;
>> +  brw->ctx.NewDriverState |= BRW_NEW_URB_SIZE;
>> +   }
>> +}
>> +
>> +static void
>> +emit_l3_state(struct brw_context *brw)
>> +{
>> +   const struct brw_l3_weights w = get_pipeline_state_l3_weights(brw);
>> +   const float dw = diff_l3_weights(w, 
>> get_config_l3_weights(brw->l3.config));
>> +   /* The distance between any two compatible weight vectors cannot exceed 
>> two
>> +* due to the triangle inequality.
>> +*/
>> +   const float large_dw_threshold = 2.0;
>> +   /* Somewhat arbitrary, simply makes sure that there will be no repeated
>> +* transitions to the same L3 configuration, could probably do better 
>> here.
>> +*/
>> +   const float small_dw_threshold = 0.5;
>> +   /* If we're emitting a new batch the caches should already be clean and 
>> the
>> +* transition should be relatively cheap, so it shouldn't hurt much to 
>> use
>> +* the smaller threshold.  Otherwise use the larger threshold so that we
>> +* only reprogram the L3 mid-batch if the most recently programmed
>> +* configuration is incompatible with the current pipeline state.
>> +*/
>> +   const float dw_threshold = (brw->ctx.NewDriverState & BRW_NEW_BATCH ?
>> +   small_

Re: [Mesa-dev] [PATCH v3 11/42] i965: Implement L3 state atom.

2015-11-25 Thread Matt Turner
On Wed, Nov 25, 2015 at 6:26 AM, Francisco Jerez  wrote:
> The L3 state atom calculates the target L3 partition weights when the
> program bound to some shader stage is modified, and in case they are
> far enough from the current partitioning it makes sure that the L3
> state is re-emitted.
>
> v3: Fix for inconsistent units the context URB size is expressed in.
> Clamp URB size to 1008 KB on SKL due to FF hardware limitation.
> ---
>  src/mesa/drivers/dri/i965/brw_context.h   |  6 +++
>  src/mesa/drivers/dri/i965/brw_state.h |  1 +
>  src/mesa/drivers/dri/i965/gen7_l3_state.c | 81 
> +++
>  3 files changed, 88 insertions(+)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_context.h 
> b/src/mesa/drivers/dri/i965/brw_context.h
> index 9983454..b4f8de6 100644
> --- a/src/mesa/drivers/dri/i965/brw_context.h
> +++ b/src/mesa/drivers/dri/i965/brw_context.h
> @@ -678,6 +678,8 @@ enum brw_predicate_state {
>
>  struct shader_times;
>
> +struct brw_l3_config;
> +
>  /**
>   * brw_context is derived from gl_context.
>   */
> @@ -1220,6 +1222,10 @@ struct brw_context
> int basevertex;
>
> struct {
> +  const struct brw_l3_config *config;
> +   } l3;
> +
> +   struct {
>drm_intel_bo *bo;
>const char **names;
>int *ids;
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h 
> b/src/mesa/drivers/dri/i965/brw_state.h
> index 94734ba..49f301a 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -129,6 +129,7 @@ extern const struct brw_tracked_state gen7_depthbuffer;
>  extern const struct brw_tracked_state gen7_clip_state;
>  extern const struct brw_tracked_state gen7_disable_stages;
>  extern const struct brw_tracked_state gen7_gs_state;
> +extern const struct brw_tracked_state gen7_l3_state;
>  extern const struct brw_tracked_state gen7_ps_state;
>  extern const struct brw_tracked_state gen7_push_constant_space;
>  extern const struct brw_tracked_state gen7_sbe_state;
> diff --git a/src/mesa/drivers/dri/i965/gen7_l3_state.c 
> b/src/mesa/drivers/dri/i965/gen7_l3_state.c
> index 70a61ae..4c314f5 100644
> --- a/src/mesa/drivers/dri/i965/gen7_l3_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_l3_state.c
> @@ -418,3 +418,84 @@ setup_l3_config(struct brw_context *brw, const struct 
> brw_l3_config *cfg)
>}
> }
>  }
> +
> +/**
> + * Return the unit brw_context::urb::size is expressed in, in KB.  \sa
> + * brw_device_info::urb::size.
> + */
> +static unsigned
> +get_urb_size_scale(const struct brw_device_info *devinfo)
> +{
> +   return (devinfo->gen >= 8 ? devinfo->num_slices : 1);
> +}
> +
> +/**
> + * Update the URB size in the context state for the specified L3
> + * configuration.
> + */
> +static void
> +update_urb_size(struct brw_context *brw, const struct brw_l3_config *cfg)
> +{
> +   const struct brw_device_info *devinfo = brw->intelScreen->devinfo;
> +   /* From the SKL "L3 Allocation and Programming" documentation:
> +*
> +* "URB is limited to 1008KB due to programming restrictions.  This is not
> +* a restriction of the L3 implementation, but of the FF and other 
> clients.
> +* Therefore, in a GT4 implementation it is possible for the programmed
> +* allocation of the L3 data array to provide 3*384KB=1152KB for URB, but
> +* only 1008KB of this will be used."
> +*/
> +   const unsigned max = (devinfo->gen == 9 ? 1008 : ~0);
> +   const unsigned sz =
> +  MIN2(max, cfg->n[L3P_URB] * get_l3_way_size(devinfo)) /
> +  get_urb_size_scale(devinfo);
> +
> +   if (brw->urb.size != sz) {
> +  brw->urb.size = sz;
> +  brw->ctx.NewDriverState |= BRW_NEW_URB_SIZE;
> +   }
> +}
> +
> +static void
> +emit_l3_state(struct brw_context *brw)
> +{
> +   const struct brw_l3_weights w = get_pipeline_state_l3_weights(brw);
> +   const float dw = diff_l3_weights(w, 
> get_config_l3_weights(brw->l3.config));
> +   /* The distance between any two compatible weight vectors cannot exceed 
> two
> +* due to the triangle inequality.
> +*/
> +   const float large_dw_threshold = 2.0;
> +   /* Somewhat arbitrary, simply makes sure that there will be no repeated
> +* transitions to the same L3 configuration, could probably do better 
> here.
> +*/
> +   const float small_dw_threshold = 0.5;
> +   /* If we're emitting a new batch the caches should already be clean and 
> the
> +* transition should be relatively cheap, so it shouldn't hurt much to use
> +* the smaller threshold.  Otherwise use the larger threshold so that we
> +* only reprogram the L3 mid-batch if the most recently programmed
> +* configuration is incompatible with the current pipeline state.
> +*/
> +   const float dw_threshold = (brw->ctx.NewDriverState & BRW_NEW_BATCH ?
> +   small_dw_threshold : large_dw_threshold);
> +
> +   if (dw > dw_threshold && brw->can_do_pipelined_register_writes) {
> +  const struct brw_l3_config *const cfg =