Re: [Mesa-dev] [PATCH v2 4/6] i965: dump scheduling cycle estimates
On Fri, Oct 16, 2015 at 8:03 PM, Connor Abbott wrote: > The heuristic we're using is rather lame, since it assumes everything is > non-uniform and loops execute 10 times, but it should be enough for > measuring improvements in the scheduler that don't result in a change in > the number of instructions. > > v2: > - Switch loops and cycle counts to be compatible with older shader-db. > - Make loop heuristic 10x to match with spilling code. > > Signed-off-by: Connor Abbott > --- > src/mesa/drivers/dri/i965/brw_cfg.h | 4 > src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 11 ++- > .../drivers/dri/i965/brw_schedule_instructions.cpp | 20 > > src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 9 + > 4 files changed, 35 insertions(+), 9 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h > b/src/mesa/drivers/dri/i965/brw_cfg.h > index a094917..d0bdb00 100644 > --- a/src/mesa/drivers/dri/i965/brw_cfg.h > +++ b/src/mesa/drivers/dri/i965/brw_cfg.h > @@ -90,6 +90,8 @@ struct bblock_t { > struct exec_list parents; > struct exec_list children; > int num; > + > + unsigned cycle_count; > }; > > static inline struct backend_instruction * > @@ -285,6 +287,8 @@ struct cfg_t { > int num_blocks; > > bool idom_dirty; > + > + unsigned cycle_count; > }; > > /* Note that this is implemented with a double for loop -- break will > diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > index 17e19cf..3bb0e7d 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp > @@ -2180,9 +2180,9 @@ fs_generator::generate_code(const cfg_t *cfg, int > dispatch_width) > > if (unlikely(debug_flag)) { >fprintf(stderr, "Native code for %s\n" > - "SIMD%d shader: %d instructions. %d loops. %d:%d spills:fills. > Promoted %u constants. Compacted %d to %d" > + "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d > spills:fills. Promoted %u constants. 
Compacted %d to %d" >" bytes (%.0f%%)\n", > - shader_name, dispatch_width, before_size / 16, loop_count, > + shader_name, dispatch_width, before_size / 16, loop_count, > cfg->cycle_count, >spill_count, fill_count, promoted_constants, before_size, > after_size, >100.0f * (before_size - after_size) / before_size); > > @@ -2192,12 +2192,13 @@ fs_generator::generate_code(const cfg_t *cfg, int > dispatch_width) > } > > compiler->shader_debug_log(log_data, > - "%s SIMD%d shader: %d inst, %d loops, " > + "%s SIMD%d shader: %d inst, %d loops, %u > cycles, " >"%d:%d spills:fills, Promoted %u constants, " >"compacted %d to %d bytes.\n", >stage_abbrev, dispatch_width, before_size / 16, > - loop_count, spill_count, fill_count, > - promoted_constants, before_size, after_size); > + loop_count, cfg->cycle_count, spill_count, > + fill_count, promoted_constants, before_size, > + after_size); > > return start_offset; > } > diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > index 1652261..e14d041 100644 > --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp > @@ -1467,6 +1467,24 @@ instruction_scheduler::schedule_instructions(bblock_t > *block) > if (block->end()->opcode == BRW_OPCODE_NOP) >block->end()->remove(block); > assert(instructions_to_schedule == 0); > + > + block->cycle_count = time; > +} > + > +static unsigned get_cycle_count(cfg_t *cfg) > +{ > + unsigned count = 0, multiplier = 1; > + foreach_block(block, cfg) { > + if (block->start()->opcode == BRW_OPCODE_DO) > + multiplier *= 10; /* assume that loops execute ~10 times */ > + > + count += block->cycle_count * multiplier; Unfortunately, I don't think this properly handles "if (...) { tex } else { tex };" and similar things where the latency isn't necessarily additive. However, it's a good first-order approximation. 
Reviewed-by: Jason Ekstrand > + > + if (block->end()->opcode == BRW_OPCODE_WHILE) > + multiplier /= 10; > + } > + > + return count; > } > > void > @@ -1507,6 +1525,8 @@ instruction_scheduler::run(cfg_t *cfg) >post_reg_alloc); >bs->dump_instructions(); > } > + > + cfg->cycle_count = get_cycle_count(cfg); > } > > void > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp >
[Mesa-dev] [PATCH v2 4/6] i965: dump scheduling cycle estimates
The heuristic we're using is rather lame, since it assumes everything is non-uniform and loops execute 10 times, but it should be enough for measuring improvements in the scheduler that don't result in a change in the number of instructions. v2: - Switch loops and cycle counts to be compatible with older shader-db. - Make loop heuristic 10x to match with spilling code. Signed-off-by: Connor Abbott--- src/mesa/drivers/dri/i965/brw_cfg.h | 4 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 11 ++- .../drivers/dri/i965/brw_schedule_instructions.cpp | 20 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 9 + 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h index a094917..d0bdb00 100644 --- a/src/mesa/drivers/dri/i965/brw_cfg.h +++ b/src/mesa/drivers/dri/i965/brw_cfg.h @@ -90,6 +90,8 @@ struct bblock_t { struct exec_list parents; struct exec_list children; int num; + + unsigned cycle_count; }; static inline struct backend_instruction * @@ -285,6 +287,8 @@ struct cfg_t { int num_blocks; bool idom_dirty; + + unsigned cycle_count; }; /* Note that this is implemented with a double for loop -- break will diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 17e19cf..3bb0e7d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -2180,9 +2180,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) if (unlikely(debug_flag)) { fprintf(stderr, "Native code for %s\n" - "SIMD%d shader: %d instructions. %d loops. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d" + "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. 
Compacted %d to %d" " bytes (%.0f%%)\n", - shader_name, dispatch_width, before_size / 16, loop_count, + shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count, spill_count, fill_count, promoted_constants, before_size, after_size, 100.0f * (before_size - after_size) / before_size); @@ -2192,12 +2192,13 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) } compiler->shader_debug_log(log_data, - "%s SIMD%d shader: %d inst, %d loops, " + "%s SIMD%d shader: %d inst, %d loops, %u cycles, " "%d:%d spills:fills, Promoted %u constants, " "compacted %d to %d bytes.\n", stage_abbrev, dispatch_width, before_size / 16, - loop_count, spill_count, fill_count, - promoted_constants, before_size, after_size); + loop_count, cfg->cycle_count, spill_count, + fill_count, promoted_constants, before_size, + after_size); return start_offset; } diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 1652261..e14d041 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -1467,6 +1467,24 @@ instruction_scheduler::schedule_instructions(bblock_t *block) if (block->end()->opcode == BRW_OPCODE_NOP) block->end()->remove(block); assert(instructions_to_schedule == 0); + + block->cycle_count = time; +} + +static unsigned get_cycle_count(cfg_t *cfg) +{ + unsigned count = 0, multiplier = 1; + foreach_block(block, cfg) { + if (block->start()->opcode == BRW_OPCODE_DO) + multiplier *= 10; /* assume that loops execute ~10 times */ + + count += block->cycle_count * multiplier; + + if (block->end()->opcode == BRW_OPCODE_WHILE) + multiplier /= 10; + } + + return count; } void @@ -1507,6 +1525,8 @@ instruction_scheduler::run(cfg_t *cfg) post_reg_alloc); bs->dump_instructions(); } + + cfg->cycle_count = get_cycle_count(cfg); } void diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index dcacc90..8c926ec 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -1656,10 +1656,10 @@ vec4_generator::generate_code(const cfg_t *cfg) fprintf(stderr, "Native code for %s program %d:\n", stage_name, prog->Id); } - fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. Compacted %d to %d" + fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. Compacted