On Mon, Apr 28, 2014 at 08:42:56AM -0700, Daniel Vetter wrote:
> On Mon, Apr 28, 2014 at 08:22:08AM -0700, bradley.d.vol...@intel.com wrote:
> > From: Brad Volkin <bradley.d.vol...@intel.com>
> > 
> > For clients that submit large batch buffers the command parser has
> > a substantial impact on performance. On my HSW ULT system performance
> > drops as much as ~20% on some tests. Most of the time is spent in the
> > command lookup code. Converting that from the current naive search to
> > a hash table lookup reduces the performance impact by as much as ~10%.
> > 
> > The choice of value for I915_CMD_HASH_ORDER allows all commands
> > currently used in the parser tables to hash to their own bucket (except
> > for one collision on the render ring). The tradeoff is that it wastes
> > memory. Because the opcodes for the commands in the tables are not
> > particularly well distributed, reducing the order still leaves many
> > buckets empty. The increased collisions don't seem to have a huge
> > impact on the performance gain, but for now anyhow, the parser trades
> > memory for performance.
> > 
> > Signed-off-by: Brad Volkin <bradley.d.vol...@intel.com>
> 
> Nice. One idea on top which could be worth a shot is a bloomfilter to
> handle all the non-special cases without a (likely) cache miss in the
> hashtable. The per-ring bloomfilter would be only loaded once (and if we
> place it nearby other stuff the cmdparser needs anyway even that is
> amortized).

Good suggestion. Noted.
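
For illustration, here is a rough userspace-only sketch of what such a
pre-filter could look like; the prefilter_* names, the mask value, and the
example opcodes are invented for the sketch and are not part of this patch:

/*
 * Sketch of a bloom-filter pre-check in front of the hash-table lookup:
 * build a small per-ring bit array from the masked opcodes at init time,
 * then only fall through to the hash table when the filter says the
 * opcode might be in one of the command tables.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PREFILTER_BITS	256		/* small enough to stay cache-hot */
#define OPCODE_MASK	0xFF800000u	/* assumed MI-style client+opcode bits */

struct prefilter {
	uint64_t bits[PREFILTER_BITS / 64];
};

/* Two cheap, independent hashes over the masked opcode. */
static unsigned int hash1(uint32_t key) { return (key >> 23) % PREFILTER_BITS; }
static unsigned int hash2(uint32_t key) { return ((key >> 23) * 2654435761u) % PREFILTER_BITS; }

static void prefilter_add(struct prefilter *f, uint32_t cmd_header)
{
	uint32_t key = cmd_header & OPCODE_MASK;

	f->bits[hash1(key) / 64] |= 1ull << (hash1(key) % 64);
	f->bits[hash2(key) / 64] |= 1ull << (hash2(key) % 64);
}

/* May report false positives, never false negatives. */
static bool prefilter_maybe_contains(const struct prefilter *f, uint32_t cmd_header)
{
	uint32_t key = cmd_header & OPCODE_MASK;

	return (f->bits[hash1(key) / 64] & (1ull << (hash1(key) % 64))) &&
	       (f->bits[hash2(key) / 64] & (1ull << (hash2(key) % 64)));
}

int main(void)
{
	/* Stand-ins for opcodes that would come from the per-ring cmd tables. */
	static const uint32_t special[] = { 0x11800000, 0x12000000, 0x13800000 };
	struct prefilter f = { { 0 } };
	unsigned int i;

	for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
		prefilter_add(&f, special[i]);

	/* Headers that miss the filter can take the default length-mask path
	 * immediately, without touching the hash table. */
	printf("0x11800000 -> %s\n",
	       prefilter_maybe_contains(&f, 0x11800000) ? "check hash table" : "default path");
	printf("0x7a000000 -> %s\n",
	       prefilter_maybe_contains(&f, 0x7a000000) ? "check hash table" : "default path");
	return 0;
}

False positives only cost an occasional extra hash lookup; false negatives
can't happen, so every special command still reaches the full descriptor
check.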

> 
> Also Chris mentioned that blitter loads under X are about the worst case
> wrt impact of the cmdparser. Benchmarking x11perf might be a useful
> extreme testcase for optimizing this. I guess Chris will jump in with more
> ideas?

Ok, I'll see how x11perf looks with and without this patch as a starting
point.

Brad

> 
> Thanks, Daniel
> 
> > ---
> >  drivers/gpu/drm/i915/i915_cmd_parser.c  | 136 ++++++++++++++++++++++++--------
> >  drivers/gpu/drm/i915/i915_drv.h         |   1 +
> >  drivers/gpu/drm/i915/intel_ringbuffer.c |   2 +
> >  drivers/gpu/drm/i915/intel_ringbuffer.h |  11 ++-
> >  4 files changed, 116 insertions(+), 34 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> > index 9bac097..9dca899 100644
> > --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> > +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> > @@ -498,16 +498,18 @@ static u32 gen7_blt_get_cmd_length_mask(u32 cmd_header)
> >  	return 0;
> >  }
> >  
> > -static bool validate_cmds_sorted(struct intel_ring_buffer *ring)
> > +static bool validate_cmds_sorted(struct intel_ring_buffer *ring,
> > +				 const struct drm_i915_cmd_table *cmd_tables,
> > +				 int cmd_table_count)
> >  {
> >  	int i;
> >  	bool ret = true;
> >  
> > -	if (!ring->cmd_tables || ring->cmd_table_count == 0)
> > +	if (!cmd_tables || cmd_table_count == 0)
> >  		return true;
> >  
> > -	for (i = 0; i < ring->cmd_table_count; i++) {
> > -		const struct drm_i915_cmd_table *table = &ring->cmd_tables[i];
> > +	for (i = 0; i < cmd_table_count; i++) {
> > +		const struct drm_i915_cmd_table *table = &cmd_tables[i];
> >  		u32 previous = 0;
> >  		int j;
> >  
> > @@ -557,6 +559,60 @@ static bool validate_regs_sorted(struct intel_ring_buffer *ring)
> >  				   ring->master_reg_count);
> >  }
> >  
> > +struct cmd_node {
> > +	const struct drm_i915_cmd_descriptor *desc;
> > +	struct hlist_node node;
> > +};
> > +
> > +/*
> > + * Different command ranges have different numbers of bits for the opcode.
> > + * In order to use the opcode bits, and only the opcode bits, for the hash key
> > + * we should use the MI_* command opcode mask (since those commands use the
> > + * fewest bits for the opcode.)
> > + */
> > +#define CMD_HASH_MASK STD_MI_OPCODE_MASK
> > +
> > +static int init_hash_table(struct intel_ring_buffer *ring,
> > +			   const struct drm_i915_cmd_table *cmd_tables,
> > +			   int cmd_table_count)
> > +{
> > +	int i, j;
> > +
> > +	hash_init(ring->cmd_hash);
> > +
> > +	for (i = 0; i < cmd_table_count; i++) {
> > +		const struct drm_i915_cmd_table *table = &cmd_tables[i];
> > +
> > +		for (j = 0; j < table->count; j++) {
> > +			const struct drm_i915_cmd_descriptor *desc =
> > +				&table->table[j];
> > +			struct cmd_node *desc_node =
> > +				kmalloc(sizeof(*desc_node), GFP_KERNEL);
> > +
> > +			if (!desc_node)
> > +				return -ENOMEM;
> > +
> > +			desc_node->desc = desc;
> > +			hash_add(ring->cmd_hash, &desc_node->node,
> > +				 desc->cmd.value & CMD_HASH_MASK);
> > +		}
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static void fini_hash_table(struct intel_ring_buffer *ring)
> > +{
> > +	struct hlist_node *tmp;
> > +	struct cmd_node *desc_node;
> > +	int i;
> > +
> > +	hash_for_each_safe(ring->cmd_hash, i, tmp, desc_node, node) {
> > +		hash_del(&desc_node->node);
> > +		kfree(desc_node);
> > +	}
> > +}
> > +
> >  /**
> >   * i915_cmd_parser_init_ring() - set cmd parser related fields for a ringbuffer
> >   * @ring: the ringbuffer to initialize
> > @@ -567,18 +623,21 @@ static bool validate_regs_sorted(struct intel_ring_buffer *ring)
> >   */
> >  void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
> >  {
> > +	const struct drm_i915_cmd_table *cmd_tables;
> > +	int cmd_table_count;
> > +
> >  	if (!IS_GEN7(ring->dev))
> >  		return;
> >  
> >  	switch (ring->id) {
> >  	case RCS:
> >  		if (IS_HASWELL(ring->dev)) {
> > -			ring->cmd_tables = hsw_render_ring_cmds;
> > -			ring->cmd_table_count =
> > +			cmd_tables = hsw_render_ring_cmds;
> > +			cmd_table_count =
> >  				ARRAY_SIZE(hsw_render_ring_cmds);
> >  		} else {
> > -			ring->cmd_tables = gen7_render_cmds;
> > -			ring->cmd_table_count = ARRAY_SIZE(gen7_render_cmds);
> > +			cmd_tables = gen7_render_cmds;
> > +			cmd_table_count = ARRAY_SIZE(gen7_render_cmds);
> >  		}
> >  
> >  		ring->reg_table = gen7_render_regs;
> > @@ -595,17 +654,17 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
> >  		ring->get_cmd_length_mask = gen7_render_get_cmd_length_mask;
> >  		break;
> >  	case VCS:
> > -		ring->cmd_tables = gen7_video_cmds;
> > -		ring->cmd_table_count = ARRAY_SIZE(gen7_video_cmds);
> > +		cmd_tables = gen7_video_cmds;
> > +		cmd_table_count = ARRAY_SIZE(gen7_video_cmds);
> >  		ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
> >  		break;
> >  	case BCS:
> >  		if (IS_HASWELL(ring->dev)) {
> > -			ring->cmd_tables = hsw_blt_ring_cmds;
> > -			ring->cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds);
> > +			cmd_tables = hsw_blt_ring_cmds;
> > +			cmd_table_count = ARRAY_SIZE(hsw_blt_ring_cmds);
> >  		} else {
> > -			ring->cmd_tables = gen7_blt_cmds;
> > -			ring->cmd_table_count = ARRAY_SIZE(gen7_blt_cmds);
> > +			cmd_tables = gen7_blt_cmds;
> > +			cmd_table_count = ARRAY_SIZE(gen7_blt_cmds);
> >  		}
> >  
> >  		ring->reg_table = gen7_blt_regs;
> > @@ -622,8 +681,8 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
> >  		ring->get_cmd_length_mask = gen7_blt_get_cmd_length_mask;
> >  		break;
> >  	case VECS:
> > -		ring->cmd_tables = hsw_vebox_cmds;
> > -		ring->cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds);
> > +		cmd_tables = hsw_vebox_cmds;
> > +		cmd_table_count = ARRAY_SIZE(hsw_vebox_cmds);
> >  		/* VECS can use the same length_mask function as VCS */
> >  		ring->get_cmd_length_mask = gen7_bsd_get_cmd_length_mask;
> >  		break;
> > @@ -633,18 +692,38 @@ void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring)
> >  		BUG();
> >  	}
> >  
> > -	BUG_ON(!validate_cmds_sorted(ring));
> > +	BUG_ON(!validate_cmds_sorted(ring, cmd_tables, cmd_table_count));
> >  	BUG_ON(!validate_regs_sorted(ring));
> > +
> > +	BUG_ON(init_hash_table(ring, cmd_tables, cmd_table_count));
> > +
> > +	ring->needs_cmd_parser = true;
> > +}
> > +
> > +/**
> > + * i915_cmd_parser_fini_ring() - clean up cmd parser related fields
> > + * @ring: the ringbuffer to clean up
> > + *
> > + * Releases any resources related to command parsing that may have been
> > + * initialized for the specified ring.
> > + */
> > +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring)
> > +{
> > +	if (!ring->needs_cmd_parser)
> > +		return;
> > +
> > +	fini_hash_table(ring);
> >  }
> >  
> >  static const struct drm_i915_cmd_descriptor*
> > -find_cmd_in_table(const struct drm_i915_cmd_table *table,
> > +find_cmd_in_table(struct intel_ring_buffer *ring,
> >  		  u32 cmd_header)
> >  {
> > -	int i;
> > +	struct cmd_node *desc_node;
> >  
> > -	for (i = 0; i < table->count; i++) {
> > -		const struct drm_i915_cmd_descriptor *desc = &table->table[i];
> > +	hash_for_each_possible(ring->cmd_hash, desc_node, node,
> > +			       cmd_header & CMD_HASH_MASK) {
> > +		const struct drm_i915_cmd_descriptor *desc = desc_node->desc;
> >  		u32 masked_cmd = desc->cmd.mask & cmd_header;
> >  		u32 masked_value = desc->cmd.value & desc->cmd.mask;
> >  
> > @@ -668,16 +747,12 @@ find_cmd(struct intel_ring_buffer *ring,
> >  	 u32 cmd_header,
> >  	 struct drm_i915_cmd_descriptor *default_desc)
> >  {
> > +	const struct drm_i915_cmd_descriptor *desc;
> >  	u32 mask;
> > -	int i;
> > -
> > -	for (i = 0; i < ring->cmd_table_count; i++) {
> > -		const struct drm_i915_cmd_descriptor *desc;
> >  
> > -		desc = find_cmd_in_table(&ring->cmd_tables[i], cmd_header);
> > -		if (desc)
> > -			return desc;
> > -	}
> > +	desc = find_cmd_in_table(ring, cmd_header);
> > +	if (desc)
> > +		return desc;
> >  
> >  	mask = ring->get_cmd_length_mask(cmd_header);
> >  	if (!mask)
> > @@ -748,8 +823,7 @@ bool i915_needs_cmd_parser(struct intel_ring_buffer *ring)
> >  {
> >  	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> >  
> > -	/* No command tables indicates a platform without parsing */
> > -	if (!ring->cmd_tables)
> > +	if (!ring->needs_cmd_parser)
> >  		return false;
> >  
> >  	/*
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 7d6acb4..8c8388f 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -2414,6 +2414,7 @@ const char *i915_cache_level_str(int type);
> >  /* i915_cmd_parser.c */
> >  int i915_cmd_parser_get_version(void);
> >  void i915_cmd_parser_init_ring(struct intel_ring_buffer *ring);
> > +void i915_cmd_parser_fini_ring(struct intel_ring_buffer *ring);
> >  bool i915_needs_cmd_parser(struct intel_ring_buffer *ring);
> >  int i915_parse_cmds(struct intel_ring_buffer *ring,
> >  		    struct drm_i915_gem_object *batch_obj,
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > index eb3dd26..06fa9a3 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> > @@ -1469,6 +1469,8 @@ void intel_cleanup_ring_buffer(struct intel_ring_buffer *ring)
> >  		ring->cleanup(ring);
> >  
> >  	cleanup_status_page(ring);
> > +
> > +	i915_cmd_parser_fini_ring(ring);
> >  }
> >  
> >  static int intel_ring_wait_request(struct intel_ring_buffer *ring, int n)
> > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > index 413cdc7..48daa91 100644
> > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> > @@ -1,6 +1,10 @@
> >  #ifndef _INTEL_RINGBUFFER_H_
> >  #define _INTEL_RINGBUFFER_H_
> >  
> > +#include <linux/hashtable.h>
> > +
> > +#define I915_CMD_HASH_ORDER 9
> > +
> >  /*
> >   * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 "Ring Buffer Use"
> >   * Gen3 BSpec "vol1c Memory Interface Functions" / 2.3.4.5 "Ring Buffer Use"
> > @@ -164,12 +168,13 @@ struct intel_ring_buffer {
> >  		volatile u32 *cpu_page;
> >  	} scratch;
> >  
> > +	bool needs_cmd_parser;
> > +
> >  	/*
> > -	 * Tables of commands the command parser needs to know about
> > +	 * Table of commands the command parser needs to know about
> >  	 * for this ring.
> >  	 */
> > -	const struct drm_i915_cmd_table *cmd_tables;
> > -	int cmd_table_count;
> > +	DECLARE_HASHTABLE(cmd_hash, I915_CMD_HASH_ORDER);
> >  
> >  	/*
> >  	 * Table of registers allowed in commands that read/write registers.
> > -- 
> > 1.8.3.2
> > 
> > _______________________________________________
> > Intel-gfx mailing list
> > Intel-gfx@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/intel-gfx
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> +41 (0) 79 365 57 48 - http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx