On Mon, Apr 28, 2014 at 08:42:56AM -0700, Daniel Vetter wrote:
> On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.vol...@intel.com wrote:
> > From: Brad Volkin <bradley.d.vol...@intel.com>
> > 
> > For clients that submit large batch buffers the command parser has
> > a substantial impact on performance. On my HSW ULT system performance
> > drops as much as ~20% on some tests. Most of the time is spent in the
> > command lookup code. Converting that from the current naive search to
> > a hash table lookup reduces the performance impact by as much as ~10%.
> > 
> > The choice of value for I915_CMD_HASH_ORDER allows all commands
> > currently used in the parser tables to hash to their own bucket (except
> > for one collision on the render ring). The tradeoff is that it wastes
> > memory. Because the opcodes for the commands in the tables are not
> > particularly well distributed, reducing the order still leaves many
> > buckets empty. The increased collisions don't seem to have a huge
> > impact on the performance gain, but for now anyhow, the parser trades
> > memory for performance.
> > 
> > Signed-off-by: Brad Volkin <bradley.d.vol...@intel.com>
> 
> Nice. One idea on top which could be worth a shot is a bloomfilter to
> handle all the non-special cases without a (likely) cache miss in the
> hashtable. The per-ring bloomfilter would be only loaded once (and if we
> place it nearby other stuff the cmdparser needs anyway even that is
> amortized).

Good suggestion. Noted.

> 
> Also Chris mentioned that blitter loads under X are about the worst case
> wrt impact of the cmdparser. Benchmarking x11perf might be a useful
> extreme testcase for optimizing this. I guess Chris will jump in with more
> ideas?

Ok, I'll see how x11perf looks with and without this patch as a starting
point.

Brad

> 
> Thanks, Daniel
> 
> > ---
> >  drivers/gpu/drm/i915/i915_cmd_parser.c  | 136 
> > ++++++++++++++++++++++++--------
> >  drivers/gpu/drm/i915/i915_drv.h         |   1 +
> >  drivers/gpu/drm/i915/intel_ringbuffer.c |   2 +
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |  11 ++-
> >  4 files changed, 116 insertions(+), 34 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c 
> > b/drivers/gpu/drm/i915/i915_cmd_parser.c
> > index 9bac097..9dca899 100644
> > --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> > +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> > @@ -498,16 +498,18 @@ static u32 gen7_blt_get_cmd_length_mask(u32 
> > cmd_header)
> >     return 0;
> >  }
> >  
> > -static bool validate_cmds_sorted(struct intel_ring_buffer *ring)
> > +static bool validate_cmds_sorted(struct intel_ring_buffer *ring,
> > +                            const struct drm_i915_cmd_table *cmd_tables,
> > +                            int cmd_table_count)
> >  {
> >     int i;
> >     bool ret = true;
> >  
> > -   if (!ring->cmd_tables || ring->cmd_table_count == 0)
> > +   if (!cmd_tables || cmd_table_count == 0)
> >             return true;
> >  
> > -   for (i = 0; i < ring->cmd_table_count; i++) {
> > -           const struct drm_i915_cmd_table *table = &ring->cmd_tables[i];
> > +   for (i = 0; i < cmd_table_count; i++) {
> > +           const struct drm_i915_cmd_table *table = &cmd_tables[i];
> >             u32 previous = 0;
> >             int j;
> >  
> > @@ -557,6 +559,60 @@ static bool validate_regs_sorted(struct 
> > intel_ring_buffer *ring)
> >                          ring->master_reg_count);
> >  }
> >  
> > +struct cmd_node {
> > +   const struct drm_i915_cmd_descriptor *desc;
> > +   struct hlist_node node;
> > +};
> > +
> > +/*
> > + * Different command ranges have different numbers of bits for the opcode.
> > + * In order to use the opcode bits, and only the opcode bits, for the hash 
> > key
> > + * we should use the MI_* command opcode mask (since those commands use the
> > + * fewest bits for the opcode.)
> > + */
> > +#define CMD_HASH_MASK STD_MI_OPCODE_MASK
> > +
> > +static int init_hash_table(struct intel_ring_buffer *ring,
> > +                      const struct drm_i915_cmd_table *cmd_tables,
> > +                      int cmd_table_count)
> > +{
> > +   int i, j;
> > +
> > +   hash_init(ring->cmd_hash);
> > +
> > +   for (i = 0; i < cmd_table_count; i++) {
> > +           const struct drm_i915_cmd_table *table = &cmd_tables[i];
> > +
> > +           for (j = 0; j < table->count; j++) {
> > +                   const struct drm_i915_cmd_descriptor *desc =
> > +                           &table->table[j];
> > +                   struct cmd_node *desc_node =
> > +                           kmalloc(sizeof(*desc_node), GFP_KERNEL);
> > +
> > +                   if (!desc_node)
> > +                           return -ENOMEM;
> > +
> > +                   desc_node->desc = desc;
> > +                   hash_add(ring->cmd_hash, &desc_node->node,
> > +                            desc->cmd.value & CMD_HASH_MASK);
> > +           }
> > +   }
> > +
> > +   return 0;
> > +}
> > +
> > +static void fini_hash_table(struct intel_ring_buffer *ring)
> > +{
> > +   struct hlist_node *tmp;
> > +   struct cmd_node *desc_node;
> > +   int i;
> > +
> > +   hash_for_each_safe(ring->cmd_hash, i, tmp, desc_node, node) {
> > +           hash_del(&desc_node->node);
> > +           kfree(desc_node);
> > +   }
> > +}
> > +
> >  /**
> >   * i915_cmd_parser_init_ring() - set cmd parser related fields for a 
> > ringbuffer
> >   * @ring: the ringbuffer to initialize
> > @@ -567,18 +623,21 @@ static bool validate_regs_sorted(struct 
> > intel_ring_buffer *ring)
> >   */
> >  void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
> >  {
> > +   const struct drm_i915_cmd_table *cmd_tables;
> > +   int cmd_table_count;
> > +
> >     if (!IS_GEN7(ring->dev))
> >             return;
> >  
> >     switch (ring->id) {
> >     case RCS:
> >             if (IS_HASWELL(ring->dev)) {
> > -                   ring->cmd_tables = hsw_render_ring_cmds;
> > -                   ring->cmd_table_count =
> > +                   cmd_tables = hsw_render_ring_cmds;
> > +                   cmd_table_count =
> >                             ARRAY_SIZE(hsw_render_ring_cmds);
> >             } else {
> > -                   ring->cmd_tables = gen7_render_cmds;
> > -                   ring->cmd_table_count = ARRAY_SIZE(gen7_render_cmds);
> > +                   cmd_tables = gen7_render_cmds;
> > +                   cmd_table_count = ARRAY_SIZE(gen7_render_cmds);
> >             }
> >  
> >             ring->reg_table = gen7_render_regs;
> > @@ -595,17 +654,17 @@ void i915_cmd_parser_init_ring(struct 
> > intel_ring_buffer *ring)
> >             ring->get_cmd_length_mask = gen7_render_get_cmd_length_mask;
> >             break;
> >     case VCS:
> > -           ring->cmd_tables = gen7_video_cmds;
> > -           ring->cmd_table_count = ARRAY_SIZE(gen7_video_cmds);
> > +           cmd_tables = gen7_video_cmds;
> > +           cmd_table_count = ARRAY_SIZE(gen7_video_cmds);
> >             ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
> >             break;
> >     case BCS:
> >             if (IS_HASWELL(ring->dev)) {
> > -                   ring->cmd_tables = hsw_blt_ring_cmds;
> > -                   ring->cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds);
> > +                   cmd_tables = hsw_blt_ring_cmds;
> > +                   cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds);
> >             } else {
> > -                   ring->cmd_tables = gen7_blt_cmds;
> > -                   ring->cmd_table_count = ARRAY_SIZE(gen7_blt_cmds);
> > +                   cmd_tables = gen7_blt_cmds;
> > +                   cmd_table_count = ARRAY_SIZE(gen7_blt_cmds);
> >             }
> >  
> >             ring->reg_table = gen7_blt_regs;
> > @@ -622,8 +681,8 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer 
> > *ring)
> >             ring->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
> >             break;
> >     case VECS:
> > -           ring->cmd_tables = hsw_vebox_cmds;
> > -           ring->cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds);
> > +           cmd_tables = hsw_vebox_cmds;
> > +           cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds);
> >             /* VECS can use the same length_mask function as VCS */
> >             ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
> >             break;
> > @@ -633,18 +692,38 @@ void i915_cmd_parser_init_ring(struct 
> > intel_ring_buffer *ring)
> >             BUG();
> >     }
> >  
> > -   BUG_ON(!validate_cmds_sorted(ring));
> > +   BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count));
> >     BUG_ON(!validate_regs_sorted(ring));
> > +
> > +   BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count));
> > +
> > +   ring->needs_cmd_parser = true;
> > +}
> > +
> > +/**
> > + * i915_cmd_parser_fini_ring() - clean up cmd parser related fields
> > + * @ring: the ringbuffer to clean up
> > + *
> > + * Releases any resources related to command parsing that may have been
> > + * initialized for the specified ring.
> > + */
> > +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring)
> > +{
> > +   if (!ring->needs_cmd_parser)
> > +           return;
> > +
> > +   fini_hash_table(ring);
> >  }
> >  
> >  static const struct drm_i915_cmd_descriptor*
> > -find_cmd_in_table(const struct drm_i915_cmd_table *table,
> > +find_cmd_in_table(struct intel_ring_buffer *ring,
> >               u32 cmd_header)
> >  {
> > -   int i;
> > +   struct cmd_node *desc_node;
> >  
> > -   for (i = 0; i < table->count; i++) {
> > -           const struct drm_i915_cmd_descriptor *desc = &table->table[i];
> > +   hash_for_each_possible(ring->cmd_hash, desc_node, node,
> > +                          cmd_header & CMD_HASH_MASK) {
> > +           const struct drm_i915_cmd_descriptor *desc = desc_node->desc;
> >             u32 masked_cmd = desc->cmd.mask & cmd_header;
> >             u32 masked_value = desc->cmd.value & desc->cmd.mask;
> >  
> > @@ -668,16 +747,12 @@ find_cmd(struct intel_ring_buffer *ring,
> >      u32 cmd_header,
> >      struct drm_i915_cmd_descriptor *default_desc)
> >  {
> > +   const struct drm_i915_cmd_descriptor *desc;
> >     u32 mask;
> > -   int i;
> > -
> > -   for (i = 0; i < ring->cmd_table_count; i++) {
> > -           const struct drm_i915_cmd_descriptor *desc;
> >  
> > -           desc = find_cmd_in_table(&ring->cmd_tables[i], cmd_header);
> > -           if (desc)
> > -                   return desc;
> > -   }
> > +   desc = find_cmd_in_table(ring, cmd_header);
> > +   if (desc)
> > +           return desc;
> >  
> >     mask = ring->get_cmd_length_mask(cmd_header);
> >     if (!mask)
> > @@ -748,8 +823,7 @@ bool i915_needs_cmd_parser(struct intel_ring_buffer 
> > *ring)
> >  {
> >     struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >  
> > -   /* No command tables indicates a platform without parsing */
> > -   if (!ring->cmd_tables)
> > +   if (!ring->needs_cmd_parser)
> >             return false;
> >  
> >     /*
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h 
> > b/drivers/gpu/drm/i915/i915_drv.h
> > index 7d6acb4..8c8388f 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2414,6 +2414,7 @@ const char *i915_cache_level_str(int type);
> >  /* i915_cmd_parser.c */
> >  int i915_cmd_parser_get_version(void);
> >  void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring);
> > +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring);
> >  bool i915_needs_cmd_parser(struct intel_ring_buffer *ring);
> >  int i915_parse_cmds(struct intel_ring_buffer *ring,
> >                 struct drm_i915_gem_object *batch_obj,
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c 
> > b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index eb3dd26..06fa9a3 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -1469,6 +1469,8 @@ void intel_cleanup_ring_buffer(struct 
> > intel_ring_buffer *ring)
> >             ring->cleanup(ring);
> >  
> >     cleanup_status_page(ring);
> > +
> > +   i915_cmd_parser_fini_ring(ring);
> >  }
> >  
> >  static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h 
> > b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > index 413cdc7..48daa91 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > @@ -1,6 +1,10 @@
> >  #ifndef _INTEL_RINGBUFFER_H_
> >  #define _INTEL_RINGBUFFER_H_
> >  
> > +#include <linux/hashtable.h>
> > +
> > +#define I915_CMD_HASH_ORDER 9
> > +
> >  /*
> >   * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use"
> >   * Gen3 BSpec "vol1c Memory Interface Functions" / 2.3.4.5 "Ring Buffer 
> > Use"
> > @@ -164,12 +168,13 @@ struct  intel_ring_buffer {
> >             volatile u32 *cpu_page;
> >     } scratch;
> >  
> > +   bool needs_cmd_parser;
> > +
> >     /*
> > -    * Tables of commands the command parser needs to know about
> > +    * Table of commands the command parser needs to know about
> >      * for this ring.
> >      */
> > -   const struct drm_i915_cmd_table *cmd_tables;
> > -   int cmd_table_count;
> > +   DECLARE_HASHTABLE(cmd_hash, I915_CMD_HASH_ORDER);
> >  
> >     /*
> >      * Table of registers allowed in commands that read/write registers.
> > -- 
> > 1.8.3.2
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to