Re: [Mesa-dev] [PATCH v2 4/6] i965: dump scheduling cycle estimates

2015-10-21 Thread Jason Ekstrand
On Fri, Oct 16, 2015 at 8:03 PM, Connor Abbott  wrote:
> The heuristic we're using is rather lame, since it assumes everything is
> non-uniform and loops execute 10 times, but it should be enough for
> measuring improvements in the scheduler that don't result in a change in
> the number of instructions.
>
> v2:
> - Switch loops and cycle counts to be compatible with older shader-db.
> - Make loop heuristic 10x to match with spilling code.
>
> Signed-off-by: Connor Abbott 
> ---
>  src/mesa/drivers/dri/i965/brw_cfg.h  |  4 
>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp   | 11 ++-
>  .../drivers/dri/i965/brw_schedule_instructions.cpp   | 20 
> 
>  src/mesa/drivers/dri/i965/brw_vec4_generator.cpp |  9 +
>  4 files changed, 35 insertions(+), 9 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h 
> b/src/mesa/drivers/dri/i965/brw_cfg.h
> index a094917..d0bdb00 100644
> --- a/src/mesa/drivers/dri/i965/brw_cfg.h
> +++ b/src/mesa/drivers/dri/i965/brw_cfg.h
> @@ -90,6 +90,8 @@ struct bblock_t {
> struct exec_list parents;
> struct exec_list children;
> int num;
> +
> +   unsigned cycle_count;
>  };
>
>  static inline struct backend_instruction *
> @@ -285,6 +287,8 @@ struct cfg_t {
> int num_blocks;
>
> bool idom_dirty;
> +
> +   unsigned cycle_count;
>  };
>
>  /* Note that this is implemented with a double for loop -- break will
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> index 17e19cf..3bb0e7d 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
> @@ -2180,9 +2180,9 @@ fs_generator::generate_code(const cfg_t *cfg, int 
> dispatch_width)
>
> if (unlikely(debug_flag)) {
>fprintf(stderr, "Native code for %s\n"
> -  "SIMD%d shader: %d instructions. %d loops. %d:%d spills:fills. 
> Promoted %u constants. Compacted %d to %d"
> +  "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d 
> spills:fills. Promoted %u constants. Compacted %d to %d"
>" bytes (%.0f%%)\n",
> -  shader_name, dispatch_width, before_size / 16, loop_count,
> +  shader_name, dispatch_width, before_size / 16, loop_count, 
> cfg->cycle_count,
>spill_count, fill_count, promoted_constants, before_size, 
> after_size,
>100.0f * (before_size - after_size) / before_size);
>
> @@ -2192,12 +2192,13 @@ fs_generator::generate_code(const cfg_t *cfg, int 
> dispatch_width)
> }
>
> compiler->shader_debug_log(log_data,
> -  "%s SIMD%d shader: %d inst, %d loops, "
> +  "%s SIMD%d shader: %d inst, %d loops, %u 
> cycles, "
>"%d:%d spills:fills, Promoted %u constants, "
>"compacted %d to %d bytes.\n",
>stage_abbrev, dispatch_width, before_size / 16,
> -  loop_count, spill_count, fill_count,
> -  promoted_constants, before_size, after_size);
> +  loop_count, cfg->cycle_count, spill_count,
> +  fill_count, promoted_constants, before_size,
> +  after_size);
>
> return start_offset;
>  }
> diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp 
> b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> index 1652261..e14d041 100644
> --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
> @@ -1467,6 +1467,24 @@ instruction_scheduler::schedule_instructions(bblock_t 
> *block)
> if (block->end()->opcode == BRW_OPCODE_NOP)
>block->end()->remove(block);
> assert(instructions_to_schedule == 0);
> +
> +   block->cycle_count = time;
> +}
> +
> +static unsigned get_cycle_count(cfg_t *cfg)
> +{
> +   unsigned count = 0, multiplier = 1;
> +   foreach_block(block, cfg) {
> +  if (block->start()->opcode == BRW_OPCODE_DO)
> + multiplier *= 10; /* assume that loops execute ~10 times */
> +
> +  count += block->cycle_count * multiplier;

Unfortunately, I don't think this properly handles "if (...) { tex }
else { tex };" and similar cases where the latency isn't necessarily
additive.  However, it's a good first-order approximation.

Reviewed-by: Jason Ekstrand 

> +
> +  if (block->end()->opcode == BRW_OPCODE_WHILE)
> + multiplier /= 10;
> +   }
> +
> +   return count;
>  }
>
>  void
> @@ -1507,6 +1525,8 @@ instruction_scheduler::run(cfg_t *cfg)
>post_reg_alloc);
>bs->dump_instructions();
> }
> +
> +   cfg->cycle_count = get_cycle_count(cfg);
>  }
>
>  void
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp 
> 

[Mesa-dev] [PATCH v2 4/6] i965: dump scheduling cycle estimates

2015-10-16 Thread Connor Abbott
The heuristic we're using is rather lame, since it assumes everything is
non-uniform and loops execute 10 times, but it should be enough for
measuring improvements in the scheduler that don't result in a change in
the number of instructions.

v2:
- Switch loops and cycle counts to be compatible with older shader-db.
- Make loop heuristic 10x to match with spilling code.

Signed-off-by: Connor Abbott 
---
 src/mesa/drivers/dri/i965/brw_cfg.h  |  4 
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp   | 11 ++-
 .../drivers/dri/i965/brw_schedule_instructions.cpp   | 20 
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp |  9 +
 4 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h
index a094917..d0bdb00 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.h
+++ b/src/mesa/drivers/dri/i965/brw_cfg.h
@@ -90,6 +90,8 @@ struct bblock_t {
struct exec_list parents;
struct exec_list children;
int num;
+
+   unsigned cycle_count;
 };
 
 static inline struct backend_instruction *
@@ -285,6 +287,8 @@ struct cfg_t {
int num_blocks;
 
bool idom_dirty;
+
+   unsigned cycle_count;
 };
 
 /* Note that this is implemented with a double for loop -- break will
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 17e19cf..3bb0e7d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -2180,9 +2180,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 
if (unlikely(debug_flag)) {
   fprintf(stderr, "Native code for %s\n"
-  "SIMD%d shader: %d instructions. %d loops. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
+  "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
   " bytes (%.0f%%)\n",
-  shader_name, dispatch_width, before_size / 16, loop_count,
+  shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
   spill_count, fill_count, promoted_constants, before_size, after_size,
   100.0f * (before_size - after_size) / before_size);
 
@@ -2192,12 +2192,13 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
}
 
compiler->shader_debug_log(log_data,
-  "%s SIMD%d shader: %d inst, %d loops, "
+  "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
   "%d:%d spills:fills, Promoted %u constants, "
   "compacted %d to %d bytes.\n",
   stage_abbrev, dispatch_width, before_size / 16,
-  loop_count, spill_count, fill_count,
-  promoted_constants, before_size, after_size);
+  loop_count, cfg->cycle_count, spill_count,
+  fill_count, promoted_constants, before_size,
+  after_size);
 
return start_offset;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 1652261..e14d041 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -1467,6 +1467,24 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
if (block->end()->opcode == BRW_OPCODE_NOP)
   block->end()->remove(block);
assert(instructions_to_schedule == 0);
+
+   block->cycle_count = time;
+}
+
+static unsigned get_cycle_count(cfg_t *cfg)
+{
+   unsigned count = 0, multiplier = 1;
+   foreach_block(block, cfg) {
+  if (block->start()->opcode == BRW_OPCODE_DO)
+ multiplier *= 10; /* assume that loops execute ~10 times */
+
+  count += block->cycle_count * multiplier;
+
+  if (block->end()->opcode == BRW_OPCODE_WHILE)
+ multiplier /= 10;
+   }
+
+   return count;
 }
 
 void
@@ -1507,6 +1525,8 @@ instruction_scheduler::run(cfg_t *cfg)
   post_reg_alloc);
   bs->dump_instructions();
}
+
+   cfg->cycle_count = get_cycle_count(cfg);
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index dcacc90..8c926ec 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1656,10 +1656,10 @@ vec4_generator::generate_code(const cfg_t *cfg)
  fprintf(stderr, "Native code for %s program %d:\n", stage_name,
  prog->Id);
   }
-  fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. Compacted %d 
to %d"
+  fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. 
Compacted