On Mon, Jul 25, 2016 at 5:42 PM, Rob Clark <robdcl...@gmail.com> wrote:
> On Mon, Jul 25, 2016 at 11:16 AM, Brian Paul <bri...@vmware.com> wrote:
>> On 07/18/2016 07:11 AM, Marek Olšák wrote:
>>>
>>> From: Marek Olšák <marek.ol...@amd.com>
>>>
>>> The goal is to do this in st_validate_state:
>>>     while (dirty)
>>>        atoms[u_bit_scan(&dirty)]->update(st);
>>>
>>> That implies that atoms can't specify which flags they consume.
>>> There is exactly one ST_NEW_* flag for each atom. (58 flags in total)
>>>
>>> There are macros that combine multiple flags into one for easier use.
>>>
>>> All _NEW_* flags are translated into ST_NEW_* flags in
>>> st_invalidate_state.
>>> st/mesa doesn't keep the _NEW_* flags after that.
>>>
>>> torcs is 2% faster between the previous patch and the end of this series.
>>> ---
>>>   src/mesa/state_tracker/st_atom.c               | 153 +++++-------------
>>>   src/mesa/state_tracker/st_atom.h               | 210
>>> +++++++++++++++++--------
>>>   src/mesa/state_tracker/st_atom_array.c         |   4 -
>>>   src/mesa/state_tracker/st_atom_atomicbuf.c     |  24 ---
>>>   src/mesa/state_tracker/st_atom_blend.c         |   4 -
>>>   src/mesa/state_tracker/st_atom_clip.c          |   4 -
>>>   src/mesa/state_tracker/st_atom_constbuf.c      |  48 ------
>>>   src/mesa/state_tracker/st_atom_depth.c         |   4 -
>>>   src/mesa/state_tracker/st_atom_framebuffer.c   |   4 -
>>>   src/mesa/state_tracker/st_atom_image.c         |  24 ---
>>>   src/mesa/state_tracker/st_atom_list.h          |  75 +++++++++
>>>   src/mesa/state_tracker/st_atom_msaa.c          |   8 -
>>>   src/mesa/state_tracker/st_atom_pixeltransfer.c |   4 -
>>>   src/mesa/state_tracker/st_atom_rasterizer.c    |  16 --
>>>   src/mesa/state_tracker/st_atom_sampler.c       |   4 -
>>>   src/mesa/state_tracker/st_atom_scissor.c       |   8 -
>>>   src/mesa/state_tracker/st_atom_shader.c        |  24 ---
>>>   src/mesa/state_tracker/st_atom_stipple.c       |   5 -
>>>   src/mesa/state_tracker/st_atom_storagebuf.c    |  24 ---
>>>   src/mesa/state_tracker/st_atom_tess.c          |   4 -
>>>   src/mesa/state_tracker/st_atom_texture.c       |  24 ---
>>>   src/mesa/state_tracker/st_atom_viewport.c      |   4 -
>>>   src/mesa/state_tracker/st_cb_bitmap.c          |  10 +-
>>>   src/mesa/state_tracker/st_cb_bufferobjects.c   |  10 +-
>>>   src/mesa/state_tracker/st_cb_compute.c         |   2 +-
>>>   src/mesa/state_tracker/st_cb_feedback.c        |   2 +-
>>>   src/mesa/state_tracker/st_cb_program.c         |  38 ++---
>>>   src/mesa/state_tracker/st_cb_texture.c         |   2 +-
>>>   src/mesa/state_tracker/st_context.c            | 100 ++++++++++--
>>>   src/mesa/state_tracker/st_context.h            |  42 +----
>>>   src/mesa/state_tracker/st_draw.c               |   4 +-
>>>   src/mesa/state_tracker/st_manager.c            |   4 +-
>>>   32 files changed, 377 insertions(+), 516 deletions(-)
>>>   create mode 100644 src/mesa/state_tracker/st_atom_list.h
>>>
>>> diff --git a/src/mesa/state_tracker/st_atom.c
>>> b/src/mesa/state_tracker/st_atom.c
>>> index 9d5cc0f..5843d2a 100644
>>> --- a/src/mesa/state_tracker/st_atom.c
>>> +++ b/src/mesa/state_tracker/st_atom.c
>>> @@ -37,87 +37,18 @@
>>>   #include "st_manager.h"
>>>
>>>
>>> -/**
>>> - * This is used to initialize st->render_atoms[].
>>> - */
>>> -static const struct st_tracked_state *render_atoms[] =
>>> -{
>>> -   &st_update_depth_stencil_alpha,
>>> -   &st_update_clip,
>>> -
>>> -   &st_update_fp,
>>> -   &st_update_gp,
>>> -   &st_update_tep,
>>> -   &st_update_tcp,
>>> -   &st_update_vp,
>>> -
>>> -   &st_update_rasterizer,
>>> -   &st_update_polygon_stipple,
>>> -   &st_update_viewport,
>>> -   &st_update_scissor,
>>> -   &st_update_window_rectangles,
>>> -   &st_update_blend,
>>> -   &st_update_vertex_texture,
>>> -   &st_update_fragment_texture,
>>> -   &st_update_geometry_texture,
>>> -   &st_update_tessctrl_texture,
>>> -   &st_update_tesseval_texture,
>>> -   &st_update_sampler, /* depends on update_*_texture for swizzle */
>>> -   &st_bind_vs_images,
>>> -   &st_bind_tcs_images,
>>> -   &st_bind_tes_images,
>>> -   &st_bind_gs_images,
>>> -   &st_bind_fs_images,
>>> -   &st_update_framebuffer, /* depends on update_*_texture and
>>> bind_*_images */
>>> -   &st_update_msaa,
>>> -   &st_update_sample_shading,
>>> -   &st_update_vs_constants,
>>> -   &st_update_tcs_constants,
>>> -   &st_update_tes_constants,
>>> -   &st_update_gs_constants,
>>> -   &st_update_fs_constants,
>>> -   &st_bind_vs_ubos,
>>> -   &st_bind_tcs_ubos,
>>> -   &st_bind_tes_ubos,
>>> -   &st_bind_fs_ubos,
>>> -   &st_bind_gs_ubos,
>>> -   &st_bind_vs_atomics,
>>> -   &st_bind_tcs_atomics,
>>> -   &st_bind_tes_atomics,
>>> -   &st_bind_fs_atomics,
>>> -   &st_bind_gs_atomics,
>>> -   &st_bind_vs_ssbos,
>>> -   &st_bind_tcs_ssbos,
>>> -   &st_bind_tes_ssbos,
>>> -   &st_bind_fs_ssbos,
>>> -   &st_bind_gs_ssbos,
>>> -   &st_update_pixel_transfer,
>>> -   &st_update_tess,
>>> -
>>> -   /* this must be done after the vertex program update */
>>> -   &st_update_array
>>> -};
>>> -
>>> -
>>> -/**
>>> - * This is used to initialize st->compute_atoms[].
>>> - */
>>> -static const struct st_tracked_state *compute_atoms[] =
>>> +/* The list state update functions. */
>>> +static const struct st_tracked_state *atoms[] =
>>>   {
>>> -   &st_update_cp,
>>> -   &st_update_compute_texture,
>>> -   &st_update_sampler, /* depends on update_compute_texture for swizzle
>>> */
>>> -   &st_update_cs_constants,
>>> -   &st_bind_cs_ubos,
>>> -   &st_bind_cs_atomics,
>>> -   &st_bind_cs_ssbos,
>>> -   &st_bind_cs_images,
>>> +#define ST_STATE(FLAG, st_update) &st_update,
>>> +#include "st_atom_list.h"
>>> +#undef ST_STATE
>>>   };
>>>
>>>
>>>   void st_init_atoms( struct st_context *st )
>>>   {
>>> -   /* no-op */
>>> +   STATIC_ASSERT(ARRAY_SIZE(atoms) <= 64);
>>>   }
>>>
>>>
>>> @@ -127,13 +58,6 @@ void st_destroy_atoms( struct st_context *st )
>>>   }
>>>
>>>
>>> -
>>> -static bool
>>> -check_state(const struct st_state_flags *a, const struct st_state_flags
>>> *b)
>>> -{
>>> -   return (a->mesa & b->mesa) || (a->st & b->st);
>>> -}
>>> -
>>>   /* Too complex to figure out, just check every time:
>>>    */
>>>   static void check_program_state( struct st_context *st )
>>> @@ -141,13 +65,13 @@ static void check_program_state( struct st_context
>>> *st )
>>>      struct gl_context *ctx = st->ctx;
>>>
>>>      if (ctx->VertexProgram._Current != &st->vp->Base)
>>> -      st->dirty.st |= ST_NEW_VERTEX_PROGRAM;
>>> +      st->dirty |= ST_NEW_VERTEX_PROGRAM;
>>>
>>>      if (ctx->FragmentProgram._Current != &st->fp->Base)
>>> -      st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
>>> +      st->dirty |= ST_NEW_FRAGMENT_PROGRAM;
>>>
>>>      if (ctx->GeometryProgram._Current != &st->gp->Base)
>>> -      st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM;
>>> +      st->dirty |= ST_NEW_GEOMETRY_PROGRAM;
>>>   }
>>>
>>>   static void check_attrib_edgeflag(struct st_context *st)
>>> @@ -165,14 +89,14 @@ static void check_attrib_edgeflag(struct st_context
>>> *st)
>>>                           arrays[VERT_ATTRIB_EDGEFLAG]->StrideB != 0;
>>>      if (vertdata_edgeflags != st->vertdata_edgeflags) {
>>>         st->vertdata_edgeflags = vertdata_edgeflags;
>>> -      st->dirty.st |= ST_NEW_VERTEX_PROGRAM;
>>> +      st->dirty |= ST_NEW_VERTEX_PROGRAM;
>>>      }
>>>
>>>      edgeflag_culls_prims = edgeflags_enabled && !vertdata_edgeflags &&
>>>
>>> !st->ctx->Current.Attrib[VERT_ATTRIB_EDGEFLAG][0];
>>>      if (edgeflag_culls_prims != st->edgeflag_culls_prims) {
>>>         st->edgeflag_culls_prims = edgeflag_culls_prims;
>>> -      st->dirty.st |= ST_NEW_RASTERIZER;
>>> +      st->dirty |= ST_NEW_RASTERIZER;
>>>      }
>>>   }
>>>
>>> @@ -183,49 +107,42 @@ static void check_attrib_edgeflag(struct st_context
>>> *st)
>>>
>>>   void st_validate_state( struct st_context *st, enum st_pipeline pipeline
>>> )
>>>   {
>>> -   const struct st_tracked_state **atoms;
>>> -   struct st_state_flags *state;
>>> -   GLuint num_atoms;
>>> -   GLuint i;
>>> +   uint64_t dirty, pipeline_mask;
>>> +   uint32_t dirty_lo, dirty_hi;
>>> +
>>> +   /* Get Mesa driver state. */
>>> +   st->dirty |= st->ctx->NewDriverState & ST_ALL_STATES_MASK;
>>> +   st->ctx->NewDriverState = 0;
>>>
>>>      /* Get pipeline state. */
>>>      switch (pipeline) {
>>> -    case ST_PIPELINE_RENDER:
>>> -      atoms     = render_atoms;
>>> -      num_atoms = ARRAY_SIZE(render_atoms);
>>> -      state     = &st->dirty;
>>> +   case ST_PIPELINE_RENDER:
>>> +      check_attrib_edgeflag(st);
>>> +      check_program_state(st);
>>> +      st_manager_validate_framebuffers(st);
>>> +
>>> +      pipeline_mask = ST_PIPELINE_RENDER_STATE_MASK;
>>>         break;
>>>      case ST_PIPELINE_COMPUTE:
>>> -      atoms     = compute_atoms;
>>> -      num_atoms = ARRAY_SIZE(compute_atoms);
>>> -      state     = &st->dirty_cp;
>>> +      pipeline_mask = ST_PIPELINE_COMPUTE_STATE_MASK;
>>>         break;
>>>      default:
>>>         unreachable("Invalid pipeline specified");
>>>      }
>>>
>>> -   /* Get Mesa driver state. */
>>> -   st->dirty.st |= st->ctx->NewDriverState;
>>> -   st->dirty_cp.st |= st->ctx->NewDriverState;
>>> -   st->ctx->NewDriverState = 0;
>>> -
>>> -   if (pipeline == ST_PIPELINE_RENDER) {
>>> -      check_attrib_edgeflag(st);
>>> -
>>> -      check_program_state(st);
>>> -
>>> -      st_manager_validate_framebuffers(st);
>>> -   }
>>> -
>>> -   if (state->st == 0 && state->mesa == 0)
>>> +   dirty = st->dirty & pipeline_mask;
>>> +   if (!dirty)
>>>         return;
>>>
>>> -   /*printf("%s %x/%x\n", __func__, state->mesa, state->st);*/
>>> +   dirty_lo = dirty;
>>> +   dirty_hi = dirty >> 32;
>>>
>>> -   for (i = 0; i < num_atoms; i++) {
>>> -      if (check_state(state, &atoms[i]->dirty))
>>> -         atoms[i]->update( st );
>>> -   }
>>> +   /* Update states. */
>>> +   while (dirty_lo)
>>> +      atoms[u_bit_scan(&dirty_lo)]->update(st);
>>> +   while (dirty_hi)
>>> +      atoms[32 + u_bit_scan(&dirty_hi)]->update(st);
>>>
>>
>> Could we just use the u_bit_scan64() function and void the hi/lo split?
>
> fwiw, we actually did discuss that on irc, but I guess no one
> summarized on email thread..
>
> Marek's concern was that would generate worse code on 32b since it
> would pull the right-shift into the loop.
>
> I'm not entirely sure if gcc would be clever enough in this case or
> not.  I guess someone needs to compare generated asm in both cases.
> And either use u_bit_scan64() if the compiler is clever enough, or add
> a comment explaining the reason.

Yeah, I added this comment before the loops:
"Don't use u_bit_scan64, it may be slower on 32-bit."

On 32-bit, ffsll is an if-then-else expression with some arithmetic
and shifting one bit to the left is another if-then-else expression.

Marek
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to