Reviewed-by: Marek Olšák <marek.ol...@amd.com>

Marek
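
For anyone following along, the driver-side contract this patch introduces is fairly small: the driver supplies a create_fence callback that pre-creates a not-yet-ready fence from the calling thread, and its pipe_context::flush implementation fills that fence in and marks it ready when it sees TC_FLUSH_ASYNC. A minimal sketch of that wiring, assuming a driver that mirrors the si_fence.c changes quoted below (the "my_*" names are made up for illustration and are not part of this patch):

    #include "util/u_inlines.h"
    #include "util/u_memory.h"
    #include "util/u_queue.h"
    #include "util/u_threaded_context.h"

    struct my_fence {
       struct pipe_reference reference;
       struct util_queue_fence ready;              /* signalled once the flush has run */
       struct tc_unflushed_batch_token *tc_token;  /* set while the batch is unflushed */
       /* ... winsys fence handles ... */
    };

    /* tc_create_fence_func: called by the threaded context from the
     * state-tracker thread, possibly before the flush itself has been
     * submitted, so the fence starts out "not ready". */
    static struct pipe_fence_handle *
    my_create_fence(struct pipe_context *ctx,
                    struct tc_unflushed_batch_token *tc_token)
    {
       struct my_fence *fence = CALLOC_STRUCT(my_fence);
       if (!fence)
          return NULL;

       pipe_reference_init(&fence->reference, 1);
       util_queue_fence_init(&fence->ready);
       util_queue_fence_reset(&fence->ready);
       tc_unflushed_batch_token_reference(&fence->tc_token, tc_token);
       return (struct pipe_fence_handle *)fence;
    }

    /* pipe_context::flush, executed in the driver thread. With TC_FLUSH_ASYNC,
     * *fence already points at the pre-created fence, so the driver fills it
     * in and marks it ready instead of allocating a new one. */
    static void
    my_flush(struct pipe_context *ctx, struct pipe_fence_handle **fence,
             unsigned flags)
    {
       /* ... flush the command stream(s) and obtain winsys fences ... */

       if (fence && (flags & TC_FLUSH_ASYNC)) {
          struct my_fence *mfence = (struct my_fence *)*fence;

          /* ... store the winsys fences in mfence ... */
          util_queue_fence_signal(&mfence->ready);
          tc_unflushed_batch_token_reference(&mfence->tc_token, NULL);
       }
    }

fence_finish and fence_server_sync then wait on the fence's "ready" counterpart before touching the winsys fences, and a same-context wait on a still-unflushed fence calls threaded_context_flush() with the stored token first. The callback is hooked up through the new threaded_context_create() parameter.
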
On Mon, Nov 6, 2017 at 11:23 AM, Nicolai Hähnle <nhaeh...@gmail.com> wrote: > From: Nicolai Hähnle <nicolai.haeh...@amd.com> > > This requires out-of-band creation of fences, and will be signaled to > the pipe_context::flush implementation by a special TC_FLUSH_ASYNC flag. > > v2: > - remove an incorrect assertion > - handle fence_server_sync for unsubmitted fences by > relying on the improved cs_add_fence_dependency > - only implement asynchronous flushes on amdgpu > --- > src/gallium/auxiliary/util/u_threaded_context.c | 96 ++++++++++++++++++- > src/gallium/auxiliary/util/u_threaded_context.h | 59 ++++++++++++ > .../auxiliary/util/u_threaded_context_calls.h | 1 + > src/gallium/drivers/radeonsi/si_fence.c | 104 > ++++++++++++++++----- > src/gallium/drivers/radeonsi/si_pipe.c | 3 + > src/gallium/drivers/radeonsi/si_pipe.h | 2 + > 6 files changed, 238 insertions(+), 27 deletions(-) > > diff --git a/src/gallium/auxiliary/util/u_threaded_context.c > b/src/gallium/auxiliary/util/u_threaded_context.c > index 24fab7f5cb6..0bb645e8522 100644 > --- a/src/gallium/auxiliary/util/u_threaded_context.c > +++ b/src/gallium/auxiliary/util/u_threaded_context.c > @@ -81,40 +81,47 @@ tc_debug_check(struct threaded_context *tc) > > static void > tc_batch_execute(void *job, int thread_index) > { > struct tc_batch *batch = job; > struct pipe_context *pipe = batch->pipe; > struct tc_call *last = &batch->call[batch->num_total_call_slots]; > > tc_batch_check(batch); > > + assert(!batch->token); > + > for (struct tc_call *iter = batch->call; iter != last; > iter += iter->num_call_slots) { > tc_assert(iter->sentinel == TC_SENTINEL); > execute_func[iter->call_id](pipe, &iter->payload); > } > > tc_batch_check(batch); > batch->num_total_call_slots = 0; > } > > static void > tc_batch_flush(struct threaded_context *tc) > { > struct tc_batch *next = &tc->batch_slots[tc->next]; > > tc_assert(next->num_total_call_slots != 0); > tc_batch_check(next); > tc_debug_check(tc); > p_atomic_add(&tc->num_offloaded_slots, next->num_total_call_slots); > > + if (next->token) { > + next->token->tc = NULL; > + tc_unflushed_batch_token_reference(&next->token, NULL); > + } > + > util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute, > NULL); > tc->last = tc->next; > tc->next = (tc->next + 1) % TC_MAX_BATCHES; > } > > /* This is the function that adds variable-sized calls into the current > * batch. It also flushes the batch if there is not enough space there. > * All other higher-level "add" functions use it. > */ > @@ -172,40 +179,63 @@ _tc_sync(struct threaded_context *tc, const char *info, > const char *func) > tc_debug_check(tc); > > /* Only wait for queued calls... */ > if (!util_queue_fence_is_signalled(&last->fence)) { > util_queue_fence_wait(&last->fence); > synced = true; > } > > tc_debug_check(tc); > > + if (next->token) { > + next->token->tc = NULL; > + tc_unflushed_batch_token_reference(&next->token, NULL); > + } > + > /* .. and execute unflushed calls directly. 
*/ > if (next->num_total_call_slots) { > p_atomic_add(&tc->num_direct_slots, next->num_total_call_slots); > tc_batch_execute(next, 0); > synced = true; > } > > if (synced) { > p_atomic_inc(&tc->num_syncs); > > if (tc_strcmp(func, "tc_destroy") != 0) > tc_printf("sync %s %s\n", func, info); > } > > tc_debug_check(tc); > } > > #define tc_sync(tc) _tc_sync(tc, "", __func__) > #define tc_sync_msg(tc, info) _tc_sync(tc, info, __func__) > > +/** > + * Call this from fence_finish for same-context fence waits of deferred > fences > + * that haven't been flushed yet. > + * > + * The passed pipe_context must be the one passed to > pipe_screen::fence_finish, > + * i.e., the wrapped one. > + */ > +void > +threaded_context_flush(struct pipe_context *_pipe, > + struct tc_unflushed_batch_token *token) > +{ > + struct threaded_context *tc = threaded_context(_pipe); > + > + /* This is called from the state-tracker / application thread. */ > + if (token->tc && token->tc == tc) > + tc_sync(token->tc); > +} > + > static void > tc_set_resource_reference(struct pipe_resource **dst, struct pipe_resource > *src) > { > *dst = NULL; > pipe_resource_reference(dst, src); > } > > void > threaded_resource_init(struct pipe_resource *res) > { > @@ -1775,36 +1805,94 @@ tc_create_video_buffer(struct pipe_context *_pipe, > { > unreachable("Threaded context should not be enabled for video APIs"); > return NULL; > } > > > /******************************************************************** > * draw, launch, clear, blit, copy, flush > */ > > +struct tc_flush_payload { > + struct pipe_fence_handle *fence; > + unsigned flags; > +}; > + > +static void > +tc_call_flush(struct pipe_context *pipe, union tc_payload *payload) > +{ > + struct tc_flush_payload *p = (struct tc_flush_payload *)payload; > + struct pipe_screen *screen = pipe->screen; > + > + pipe->flush(pipe, p->fence ? &p->fence : NULL, p->flags); > + screen->fence_reference(screen, &p->fence, NULL); > +} > + > static void > tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence, > unsigned flags) > { > struct threaded_context *tc = threaded_context(_pipe); > struct pipe_context *pipe = tc->pipe; > + struct pipe_screen *screen = pipe->screen; > struct threaded_query *tq, *tmp; > + bool async = flags & PIPE_FLUSH_DEFERRED; > + > + if (flags & PIPE_FLUSH_ASYNC) { > + struct tc_batch *last = &tc->batch_slots[tc->last]; > + > + /* Prefer to do the flush in the driver thread, but avoid the > inter-thread > + * communication overhead if the driver thread is currently idle and > the > + * caller is going to wait for the fence immediately anyway. > + */ > + if (!(util_queue_fence_is_signalled(&last->fence) && > + (flags & PIPE_FLUSH_HINT_FINISH))) > + async = true; > + } > + > + if (async && tc->create_fence) { > + if (fence) { > + struct tc_unflushed_batch_token *token = NULL; > + struct tc_batch *next = &tc->batch_slots[tc->next]; > + > + if (!next->token) { > + next->token = malloc(sizeof(*next->token)); > + if (!next->token) > + goto out_of_memory; > > + pipe_reference_init(&next->token->ref, 1); > + next->token->tc = tc; > + } > + > + screen->fence_reference(screen, fence, tc->create_fence(pipe, > token)); > + if (!*fence) > + goto out_of_memory; > + } > + > + struct tc_flush_payload *p = > + tc_add_struct_typed_call(tc, TC_CALL_flush, tc_flush_payload); > + p->fence = fence ? 
*fence : NULL; > + p->flags = flags | TC_FLUSH_ASYNC; > + > + if (!(flags & PIPE_FLUSH_DEFERRED)) > + tc_batch_flush(tc); > + return; > + } > + > +out_of_memory: > if (!(flags & PIPE_FLUSH_DEFERRED)) { > LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries, > head_unflushed) { > tq->flushed = true; > LIST_DEL(&tq->head_unflushed); > } > } > > - /* TODO: deferred flushes? */ > tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" : > flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : > "normal"); > pipe->flush(pipe, fence, flags); > } > > /* This is actually variable-sized, because indirect isn't allocated if it's > * not needed. */ > struct tc_full_draw_info { > struct pipe_draw_info draw; > struct pipe_draw_indirect_info indirect; > @@ -2240,22 +2328,24 @@ tc_destroy(struct pipe_context *_pipe) > u_upload_destroy(tc->base.const_uploader); > > if (tc->base.stream_uploader) > u_upload_destroy(tc->base.stream_uploader); > > tc_sync(tc); > > if (util_queue_is_initialized(&tc->queue)) { > util_queue_destroy(&tc->queue); > > - for (unsigned i = 0; i < TC_MAX_BATCHES; i++) > + for (unsigned i = 0; i < TC_MAX_BATCHES; i++) { > util_queue_fence_destroy(&tc->batch_slots[i].fence); > + assert(!tc->batch_slots[i].token); > + } > } > > slab_destroy_child(&tc->pool_transfers); > assert(tc->batch_slots[tc->next].num_total_call_slots == 0); > pipe->destroy(pipe); > os_free_aligned(tc); > } > > static const tc_execute execute_func[TC_NUM_CALLS] = { > #define CALL(name) tc_call_##name, > @@ -2272,20 +2362,21 @@ static const tc_execute execute_func[TC_NUM_CALLS] = { > * in pipe_screen. > * \param replace_buffer callback for replacing a pipe_resource's storage > * with another pipe_resource's storage. > * \param out if successful, the threaded_context will be returned here in > * addition to the return value if "out" != NULL > */ > struct pipe_context * > threaded_context_create(struct pipe_context *pipe, > struct slab_parent_pool *parent_transfer_pool, > tc_replace_buffer_storage_func replace_buffer, > + tc_create_fence_func create_fence, > struct threaded_context **out) > { > struct threaded_context *tc; > > STATIC_ASSERT(sizeof(union tc_payload) <= 8); > STATIC_ASSERT(sizeof(struct tc_call) <= 16); > > if (!pipe) > return NULL; > > @@ -2306,20 +2397,21 @@ threaded_context_create(struct pipe_context *pipe, > assert(offsetof(struct threaded_context, batch_slots) % 16 == 0); > assert(offsetof(struct threaded_context, batch_slots[0].call) % 16 == 0); > assert(offsetof(struct threaded_context, batch_slots[0].call[1]) % 16 == > 0); > assert(offsetof(struct threaded_context, batch_slots[1].call) % 16 == 0); > > /* The driver context isn't wrapped, so set its "priv" to NULL. 
*/ > pipe->priv = NULL; > > tc->pipe = pipe; > tc->replace_buffer_storage = replace_buffer; > + tc->create_fence = create_fence; > tc->map_buffer_alignment = > pipe->screen->get_param(pipe->screen, > PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT); > tc->base.priv = pipe; /* priv points to the wrapped driver context */ > tc->base.screen = pipe->screen; > tc->base.destroy = tc_destroy; > > tc->base.stream_uploader = u_upload_clone(&tc->base, > pipe->stream_uploader); > if (pipe->stream_uploader == pipe->const_uploader) > tc->base.const_uploader = tc->base.stream_uploader; > else > diff --git a/src/gallium/auxiliary/util/u_threaded_context.h > b/src/gallium/auxiliary/util/u_threaded_context.h > index 57805ee4a1e..7642a39dc3a 100644 > --- a/src/gallium/auxiliary/util/u_threaded_context.h > +++ b/src/gallium/auxiliary/util/u_threaded_context.h > @@ -101,20 +101,43 @@ > * 3) The driver isn't allowed to do buffer invalidations by itself under any > * circumstances. This is necessary for unsychronized maps to map the > latest > * version of the buffer. (because invalidations can be queued, while > * unsychronized maps are not queued and they should return the latest > * storage after invalidation). The threaded context always sends > * TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to > * indicate this. Ignoring the flag will lead to failures. > * The threaded context uses its own buffer invalidation mechanism. > * > * > + * Rules for fences > + * ---------------- > + * > + * Flushes will be executed asynchronously in the driver thread if a > + * create_fence callback is provided. This affects fence semantics as > follows. > + * > + * When the threaded context wants to perform an asynchronous flush, it will > + * use the create_fence callback to pre-create the fence from the calling > + * thread. This pre-created fence will be passed to pipe_context::flush > + * together with the TC_FLUSH_ASYNC flag. > + * > + * The callback receives the unwrapped context as a parameter, but must use > it > + * in a thread-safe way because it is called from a non-driver thread. > + * > + * If the threaded_context does not immediately flush the current batch, the > + * callback also receives a tc_unflushed_batch_token. If fence_finish is > called > + * on the returned fence in the context that created the fence, > + * threaded_context_flush must be called. > + * > + * The driver must implement pipe_context::fence_server_sync properly, since > + * the threaded context handles PIPE_FLUSH_ASYNC. > + * > + * > * Additional requirements > * ----------------------- > * > * get_query_result: > * If threaded_query::flushed == true, get_query_result should assume that > * it's called from a non-driver thread, in which case the driver > shouldn't > * use the context in an unsafe way. > * > * replace_buffer_storage: > * The driver has to implement this callback, which will be called when > @@ -153,32 +176,40 @@ > * The batches are ordered in a ring and reused once they are idle again. > * The batching is necessary for low queue/mutex overhead. > * > */ > > #ifndef U_THREADED_CONTEXT_H > #define U_THREADED_CONTEXT_H > > #include "pipe/p_context.h" > #include "pipe/p_state.h" > +#include "util/u_inlines.h" > #include "util/u_queue.h" > #include "util/u_range.h" > #include "util/slab.h" > > +struct threaded_context; > +struct tc_unflushed_batch_token; > + > /* These are transfer flags sent to drivers. 
*/ > /* Never infer whether it's safe to use unsychronized mappings: */ > #define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29) > /* Don't invalidate buffers: */ > #define TC_TRANSFER_MAP_NO_INVALIDATE (1u << 30) > /* transfer_map is called from a non-driver thread: */ > #define TC_TRANSFER_MAP_THREADED_UNSYNC (1u << 31) > > +/* Custom flush flags sent to drivers. */ > +/* fence is pre-populated with a fence created by the create_fence callback > */ > +#define TC_FLUSH_ASYNC (1u << 31) > + > /* Size of the queue = number of batch slots in memory. > * - 1 batch is always idle and records new commands > * - 1 batch is being executed > * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches. > * > * Use a size as small as possible for low CPU L2 cache usage but large > enough > * so that the queue isn't stalled too often for not having enough idle batch > * slots. > */ > #define TC_MAX_BATCHES 10 > @@ -197,20 +228,22 @@ > /* Threshold for when to enqueue buffer/texture_subdata as-is. > * If the upload size is greater than this, it will do instead: > * - for buffers: DISCARD_RANGE is done by the threaded context > * - for textures: sync and call the driver directly > */ > #define TC_MAX_SUBDATA_BYTES 320 > > typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx, > struct pipe_resource *dst, > struct pipe_resource *src); > +typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct > pipe_context *ctx, > + struct > tc_unflushed_batch_token *token); > > struct threaded_resource { > struct pipe_resource b; > const struct u_resource_vtbl *vtbl; > > /* Since buffer invalidations are queued, we can't use the base resource > * for unsychronized mappings. This points to the latest version of > * the buffer after the latest invalidation. It's only used for unsychro- > * nized mappings in the non-driver thread. Initially it's set to &b. > */ > @@ -280,33 +313,45 @@ union tc_payload { > #endif > > /* Each call slot should be aligned to its own size for optimal cache usage. > */ > struct ALIGN16 tc_call { > unsigned sentinel; > ushort num_call_slots; > ushort call_id; > union tc_payload payload; > }; > > +/** > + * A token representing an unflushed batch. > + * > + * See the general rules for fences for an explanation. > + */ > +struct tc_unflushed_batch_token { > + struct pipe_reference ref; > + struct threaded_context *tc; > +}; > + > struct tc_batch { > struct pipe_context *pipe; > unsigned sentinel; > unsigned num_total_call_slots; > + struct tc_unflushed_batch_token *token; > struct util_queue_fence fence; > struct tc_call call[TC_CALLS_PER_BATCH]; > }; > > struct threaded_context { > struct pipe_context base; > struct pipe_context *pipe; > struct slab_child_pool pool_transfers; > tc_replace_buffer_storage_func replace_buffer_storage; > + tc_create_fence_func create_fence; > unsigned map_buffer_alignment; > > struct list_head unflushed_queries; > > /* Counters for the HUD. 
*/ > unsigned num_offloaded_slots; > unsigned num_direct_slots; > unsigned num_syncs; > > struct util_queue queue; > @@ -317,22 +362,27 @@ struct threaded_context { > }; > > void threaded_resource_init(struct pipe_resource *res); > void threaded_resource_deinit(struct pipe_resource *res); > struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe); > > struct pipe_context * > threaded_context_create(struct pipe_context *pipe, > struct slab_parent_pool *parent_transfer_pool, > tc_replace_buffer_storage_func replace_buffer, > + tc_create_fence_func create_fence, > struct threaded_context **out); > > +void > +threaded_context_flush(struct pipe_context *_pipe, > + struct tc_unflushed_batch_token *token); > + > static inline struct threaded_context * > threaded_context(struct pipe_context *pipe) > { > return (struct threaded_context*)pipe; > } > > static inline struct threaded_resource * > threaded_resource(struct pipe_resource *res) > { > return (struct threaded_resource*)res; > @@ -343,11 +393,20 @@ threaded_query(struct pipe_query *q) > { > return (struct threaded_query*)q; > } > > static inline struct threaded_transfer * > threaded_transfer(struct pipe_transfer *transfer) > { > return (struct threaded_transfer*)transfer; > } > > +static inline void > +tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst, > + struct tc_unflushed_batch_token *src) > +{ > + if (pipe_reference((struct pipe_reference *)*dst, (struct pipe_reference > *)src)) > + free(*dst); > + *dst = src; > +} > + > #endif > diff --git a/src/gallium/auxiliary/util/u_threaded_context_calls.h > b/src/gallium/auxiliary/util/u_threaded_context_calls.h > index 546819a2580..1356c54baf2 100644 > --- a/src/gallium/auxiliary/util/u_threaded_context_calls.h > +++ b/src/gallium/auxiliary/util/u_threaded_context_calls.h > @@ -1,10 +1,11 @@ > +CALL(flush) > CALL(destroy_query) > CALL(begin_query) > CALL(end_query) > CALL(get_query_result_resource) > CALL(render_condition) > CALL(bind_sampler_states) > CALL(set_framebuffer_state) > CALL(set_tess_state) > CALL(set_constant_buffer) > CALL(set_scissor_states) > diff --git a/src/gallium/drivers/radeonsi/si_fence.c > b/src/gallium/drivers/radeonsi/si_fence.c > index b416c47aa30..701e8df9cfc 100644 > --- a/src/gallium/drivers/radeonsi/si_fence.c > +++ b/src/gallium/drivers/radeonsi/si_fence.c > @@ -19,27 +19,30 @@ > * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > FROM, > * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > THE > * SOFTWARE. > * > */ > > #include <libsync.h> > > #include "util/os_time.h" > #include "util/u_memory.h" > +#include "util/u_queue.h" > > #include "si_pipe.h" > > struct si_multi_fence { > struct pipe_reference reference; > struct pipe_fence_handle *gfx; > struct pipe_fence_handle *sdma; > + struct tc_unflushed_batch_token *tc_token; > + struct util_queue_fence ready; > > /* If the context wasn't flushed at fence creation, this is non-NULL. 
> */ > struct { > struct r600_common_context *ctx; > unsigned ib_index; > } gfx_unflushed; > }; > > static void si_add_fence_dependency(struct r600_common_context *rctx, > struct pipe_fence_handle *fence) > @@ -55,46 +58,62 @@ static void si_fence_reference(struct pipe_screen *screen, > struct pipe_fence_handle **dst, > struct pipe_fence_handle *src) > { > struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws; > struct si_multi_fence **rdst = (struct si_multi_fence **)dst; > struct si_multi_fence *rsrc = (struct si_multi_fence *)src; > > if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { > ws->fence_reference(&(*rdst)->gfx, NULL); > ws->fence_reference(&(*rdst)->sdma, NULL); > + tc_unflushed_batch_token_reference(&(*rdst)->tc_token, NULL); > FREE(*rdst); > } > *rdst = rsrc; > } > > +static struct si_multi_fence *si_create_multi_fence() > +{ > + struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence); > + if (!fence) > + return NULL; > + > + pipe_reference_init(&fence->reference, 1); > + util_queue_fence_init(&fence->ready); > + > + return fence; > +} > + > +struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx, > + struct tc_unflushed_batch_token > *tc_token) > +{ > + struct si_multi_fence *fence = si_create_multi_fence(); > + if (!fence) > + return NULL; > + > + util_queue_fence_reset(&fence->ready); > + tc_unflushed_batch_token_reference(&fence->tc_token, tc_token); > + > + return (struct pipe_fence_handle *)fence; > +} > + > static void si_fence_server_sync(struct pipe_context *ctx, > struct pipe_fence_handle *fence) > { > struct r600_common_context *rctx = (struct r600_common_context *)ctx; > struct si_multi_fence *rfence = (struct si_multi_fence *)fence; > > - /* Only amdgpu needs to handle fence dependencies (for fence imports). > - * radeon synchronizes all rings by default and will not implement > - * fence imports. > - */ > - if (rctx->screen->info.drm_major == 2) > - return; > + util_queue_fence_wait(&rfence->ready); > > - /* Only imported fences need to be handled by fence_server_sync, > - * because the winsys handles synchronizations automatically for BOs > - * within the process. > - * > - * Simply skip unflushed fences here, and the winsys will drop no-op > - * dependencies (i.e. dependencies within the same ring). > - */ > - if (rfence->gfx_unflushed.ctx) > + /* Unflushed fences from the same context are no-ops. */ > + if (rfence->gfx_unflushed.ctx && > + rfence->gfx_unflushed.ctx == rctx) > return; > > /* All unflushed commands will not start execution before > * this fence dependency is signalled. > * > * Should we flush the context to allow more GPU parallelism? > */ > if (rfence->sdma) > si_add_fence_dependency(rctx, rfence->sdma); > if (rfence->gfx) > @@ -107,20 +126,44 @@ static boolean si_fence_finish(struct pipe_screen > *screen, > uint64_t timeout) > { > struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; > struct si_multi_fence *rfence = (struct si_multi_fence *)fence; > struct r600_common_context *rctx; > int64_t abs_timeout = os_time_get_absolute_timeout(timeout); > > ctx = threaded_context_unwrap_sync(ctx); > rctx = ctx ? (struct r600_common_context*)ctx : NULL; > > + if (!util_queue_fence_is_signalled(&rfence->ready)) { > + if (!timeout) > + return false; > + > + if (rfence->tc_token) { > + /* Ensure that si_flush_from_st will be called for > + * this fence, but only if we're in the API thread > + * where the context is current. 
> + * > + * Note that the batch containing the flush may > already > + * be in flight in the driver thread, so the fence > + * may not be ready yet when this call returns. > + */ > + threaded_context_flush(ctx, rfence->tc_token); > + } > + > + if (timeout == PIPE_TIMEOUT_INFINITE) { > + util_queue_fence_wait(&rfence->ready); > + } else { > + if (!util_queue_fence_wait_timeout(&rfence->ready, > abs_timeout)) > + return false; > + } > + } > + > if (rfence->sdma) { > if (!rws->fence_wait(rws, rfence->sdma, timeout)) > return false; > > /* Recompute the timeout after waiting. */ > if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { > int64_t time = os_time_get_nano(); > timeout = abs_timeout > time ? abs_timeout - time : 0; > } > } > @@ -153,45 +196,46 @@ static void si_create_fence_fd(struct pipe_context *ctx, > { > struct r600_common_screen *rscreen = (struct > r600_common_screen*)ctx->screen; > struct radeon_winsys *ws = rscreen->ws; > struct si_multi_fence *rfence; > > *pfence = NULL; > > if (!rscreen->info.has_sync_file) > return; > > - rfence = CALLOC_STRUCT(si_multi_fence); > + rfence = si_create_multi_fence(); > if (!rfence) > return; > > - pipe_reference_init(&rfence->reference, 1); > rfence->gfx = ws->fence_import_sync_file(ws, fd); > if (!rfence->gfx) { > FREE(rfence); > return; > } > > *pfence = (struct pipe_fence_handle*)rfence; > } > > static int si_fence_get_fd(struct pipe_screen *screen, > struct pipe_fence_handle *fence) > { > struct r600_common_screen *rscreen = (struct > r600_common_screen*)screen; > struct radeon_winsys *ws = rscreen->ws; > struct si_multi_fence *rfence = (struct si_multi_fence *)fence; > int gfx_fd = -1, sdma_fd = -1; > > if (!rscreen->info.has_sync_file) > return -1; > > + util_queue_fence_wait(&rfence->ready); > + > /* Deferred fences aren't supported. */ > assert(!rfence->gfx_unflushed.ctx); > if (rfence->gfx_unflushed.ctx) > return -1; > > if (rfence->sdma) { > sdma_fd = ws->fence_export_sync_file(ws, rfence->sdma); > if (sdma_fd == -1) > return -1; > } > @@ -253,40 +297,50 @@ static void si_flush_from_st(struct pipe_context *ctx, > fence) { > gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs); > deferred_fence = true; > } else { > rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : > NULL); > } > } > > /* Both engines can signal out of order, so we need to keep both > fences. */ > if (fence) { > - struct si_multi_fence *multi_fence = > - CALLOC_STRUCT(si_multi_fence); > - if (!multi_fence) { > - ws->fence_reference(&sdma_fence, NULL); > - ws->fence_reference(&gfx_fence, NULL); > - goto finish; > + struct si_multi_fence *multi_fence; > + > + if (flags & TC_FLUSH_ASYNC) { > + multi_fence = (struct si_multi_fence *)*fence; > + assert(multi_fence); > + } else { > + multi_fence = si_create_multi_fence(); > + if (!multi_fence) { > + ws->fence_reference(&sdma_fence, NULL); > + ws->fence_reference(&gfx_fence, NULL); > + goto finish; > + } > + > + screen->fence_reference(screen, fence, NULL); > + *fence = (struct pipe_fence_handle*)multi_fence; > } > > - multi_fence->reference.count = 1; > /* If both fences are NULL, fence_finish will always return > true. 
*/ > multi_fence->gfx = gfx_fence; > multi_fence->sdma = sdma_fence; > > if (deferred_fence) { > multi_fence->gfx_unflushed.ctx = rctx; > multi_fence->gfx_unflushed.ib_index = > rctx->num_gfx_cs_flushes; > } > > - screen->fence_reference(screen, fence, NULL); > - *fence = (struct pipe_fence_handle*)multi_fence; > + if (flags & TC_FLUSH_ASYNC) { > + util_queue_fence_signal(&multi_fence->ready); > + > tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL); > + } > } > finish: > if (!(flags & PIPE_FLUSH_DEFERRED)) { > if (rctx->dma.cs) > ws->cs_sync_flush(rctx->dma.cs); > ws->cs_sync_flush(rctx->gfx.cs); > } > } > > void si_init_fence_functions(struct si_context *ctx) > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c > b/src/gallium/drivers/radeonsi/si_pipe.c > index 8d7fb52350f..10225353907 100644 > --- a/src/gallium/drivers/radeonsi/si_pipe.c > +++ b/src/gallium/drivers/radeonsi/si_pipe.c > @@ -398,22 +398,25 @@ static struct pipe_context > *si_pipe_create_context(struct pipe_screen *screen, > * those. > */ > if (flags & (PIPE_CONTEXT_COMPUTE_ONLY | PIPE_CONTEXT_DEBUG)) > return ctx; > > /* When shaders are logged to stderr, asynchronous compilation is > * disabled too. */ > if (sscreen->b.debug_flags & DBG_ALL_SHADERS) > return ctx; > > + /* Use asynchronous flushes only on amdgpu, since the radeon > + * implementation for fence_server_sync is incomplete. */ > return threaded_context_create(ctx, &sscreen->b.pool_transfers, > si_replace_buffer_storage, > + sscreen->b.info.drm_major >= 3 ? > si_create_fence : NULL, > &((struct si_context*)ctx)->b.tc); > } > > /* > * pipe_screen > */ > static bool si_have_tgsi_compute(struct si_screen *sscreen) > { > /* Old kernels disallowed some register writes for SI > * that are used for indirect dispatches. */ > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h > b/src/gallium/drivers/radeonsi/si_pipe.h > index 5253dbc43ea..fc6197ab886 100644 > --- a/src/gallium/drivers/radeonsi/si_pipe.h > +++ b/src/gallium/drivers/radeonsi/si_pipe.h > @@ -596,20 +596,22 @@ void si_init_debug_functions(struct si_context *sctx); > void si_check_vm_faults(struct r600_common_context *ctx, > struct radeon_saved_cs *saved, enum ring_type ring); > bool si_replace_shader(unsigned num, struct ac_shader_binary *binary); > > /* si_dma.c */ > void si_init_dma_functions(struct si_context *sctx); > > /* si_fence.c */ > void si_init_fence_functions(struct si_context *ctx); > void si_init_screen_fence_functions(struct si_screen *screen); > +struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx, > + struct tc_unflushed_batch_token > *tc_token); > > /* si_hw_context.c */ > void si_destroy_saved_cs(struct si_saved_cs *scs); > void si_context_gfx_flush(void *context, unsigned flags, > struct pipe_fence_handle **fence); > void si_begin_new_cs(struct si_context *ctx); > void si_need_cs_space(struct si_context *ctx); > > /* si_compute.c */ > void si_init_compute_functions(struct si_context *sctx); > -- > 2.11.0 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev