How well tested is this? I suspect nothing hits the 2MB limit, but do we have any tests or apps that even hit the 12kB limit?
On 06/16/2016 07:58 PM, Kenneth Graunke wrote: > Ivybridge and Baytrail have a pretty harsh limit of 12kB scratch space > per thread. However, we can exceed this limit with some clever tricks > and gain effectively unlimited scratch space. > > Later platforms have a 2MB limit, which is much more reasonable, but > we may as well apply the same trick there. > > We can probably extend this trick to other stages, but would need to > adjust the shader code for the different thread payload layouts. > > Cc: "12.0" <mesa-sta...@lists.freedesktop.org> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=96505 > Signed-off-by: Kenneth Graunke <kenn...@whitecape.org> > --- > src/mesa/drivers/dri/i965/brw_fs.cpp | 50 > +++++++++++++++++++++++-------- > src/mesa/drivers/dri/i965/gen7_cs_state.c | 6 ++-- > 2 files changed, 41 insertions(+), 15 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index cadf905..9075918 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -5993,19 +5993,45 @@ fs_visitor::allocate_registers(bool allow_spilling) > prog_data->total_scratch = ALIGN(last_scratch, 1024); > max_scratch_size = 12 * 1024; > } > - } > > - /* We currently only support up to 2MB of scratch space. If we > - * need to support more eventually, the documentation suggests > - * that we could allocate a larger buffer, and partition it out > - * ourselves. We'd just have to undo the hardware's address > - * calculation by subtracting (FFTID * Per Thread Scratch Space) > - * and then add FFTID * (Larger Per Thread Scratch Space). > - * > - * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline > > - * Thread Group Tracking > Local Memory/Scratch Space. > - */ > - assert(prog_data->total_scratch < max_scratch_size); > + if (prog_data->total_scratch >= max_scratch_size) { > + /* Normally, the hardware computes a pointer to the scratch > + * space region for our thread for us. 
This comes with a > + * limitation - each thread can only use a limited amount > + * of scratch space. To support more than that limit, we > + * can subtract the hardware's offset and add our own: > + * > + * Scratch Space Pointer += > + * FFTID * (prog_data->total_scratch - max_scratch_size); > + * > + * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline > > + * Thread Group Tracking > Local Memory/Scratch Space. > + */ > + const fs_builder bld = > + fs_builder(this, cfg->blocks[0], (fs_inst *) > + cfg->blocks[0]->start()).exec_all(); > + struct brw_reg g0_5 = > + retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_D); > + struct brw_reg g127 = > + retype(brw_vec1_grf(127, 0), BRW_REGISTER_TYPE_D); > + > + /* Multiply FFTID by max_scratch_size and shift the result > + * left by 10. Then subtract that from the scratch pointer. > + * > + * We need a register for temporary storage, but can't allocate > + * one post-register-allocation. However, since our code will > + * be emitted at the top of the program, we can safely use g127, > + * as it isn't part of the thread payload and can't be in use. > + */ > + bld.AND(g127, g0_5, brw_imm_d(0x3ff)); > + bld.MUL(g127, g127, > + brw_imm_d(prog_data->total_scratch - max_scratch_size)); > + bld.ADD(g0_5, g0_5, g127); > + } > + } else { > + /* We should extend the above hack for other stages someday. 
*/ > + assert(prog_data->total_scratch < max_scratch_size); > + } > } > } > > diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c > b/src/mesa/drivers/dri/i965/gen7_cs_state.c > index 5fb8829..b89a186 100644 > --- a/src/mesa/drivers/dri/i965/gen7_cs_state.c > +++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c > @@ -70,21 +70,21 @@ brw_upload_cs_state(struct brw_context *brw) > */ > OUT_RELOC64(stage_state->scratch_bo, > I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, > - ffs(stage_state->per_thread_scratch) - 11); > + MIN2(ffs(stage_state->per_thread_scratch) - 11, 11)); > } else if (brw->is_haswell) { > /* Haswell's Per Thread Scratch Space is in the range [0, 10] > * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M. > */ > OUT_RELOC(stage_state->scratch_bo, > I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, > - ffs(stage_state->per_thread_scratch) - 12); > + MIN2(ffs(stage_state->per_thread_scratch) - 12, 10)); > } else { > /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB] > * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB. > */ > OUT_RELOC(stage_state->scratch_bo, > I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, > - stage_state->per_thread_scratch / 1024 - 1); > + MIN2(stage_state->per_thread_scratch / 1024, 12) - 1); > } > } else { > OUT_BATCH(0); > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev