Re: [Mesa-dev] [PATCH 1/3] radeonsi: allow si_cp_dma_clear_buffer to clear GDS from any IB

Koenig, Christian Thu, 29 Nov 2018 08:24:01 -0800

Hi Marek,

you stumbled over a pretty fundamental bug in the memory management here. 
Essentially we where leaking BOs when we ran into an OOM situation. Patch to 
fix this is on the mailing list.


A second problem is that eviction doesn't seem to work when GDS BOs aren't 
idle. In other words when two applications try to use GDS at the same time they 
don't wait for each other, but rather one looses with an OOM message.

Currently investigating why this is happening,
Christian.

Am 29.11.18 um 02:29 schrieb Marek Olšák:
Hi Christian,

I just pushed the commits.

The best way to reproduce the out-of-memory errors is to run 2 instances of the 
test simultaneously:

R600_DEBUG=testgdsmm glxgears &
R600_DEBUG=testgdsmm glxgears &

It takes about 10 seconds to finish and you'll get a lot of errors.

If you run it again, all GDS allocations will fail:

R600_DEBUG=testgdsmm glxgears
amdgpu: Failed to allocate a buffer:
amdgpu:    size      : 32768 bytes
amdgpu:    alignment : 4 bytes
amdgpu:    domains   : 8

Marek

On Wed, Nov 28, 2018 at 2:16 PM Christian König 
<ckoenig.leichtzumer...@gmail.com<mailto:ckoenig.leichtzumer...@gmail.com>> 
wrote:
Are those committed yet? They don't seem to apply cleanly on master.

Christian.

Am 27.11.18 um 02:56 schrieb Marek Olšák:
> From: Marek Olšák <marek.ol...@amd.com<mailto:marek.ol...@amd.com>>
>
> ---
>   .../drivers/radeonsi/si_compute_blit.c        |  4 +-
>   src/gallium/drivers/radeonsi/si_cp_dma.c      | 49 ++++++++++---------
>   src/gallium/drivers/radeonsi/si_pipe.h        |  8 +--
>   .../drivers/radeonsi/si_test_dma_perf.c       |  3 +-
>   4 files changed, 33 insertions(+), 31 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c 
> b/src/gallium/drivers/radeonsi/si_compute_blit.c
> index 20e4f591fbb..086793637f0 100644
> --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
> @@ -212,22 +212,22 @@ void si_clear_buffer(struct si_context *sctx, struct 
> pipe_resource *dst,
>                */
>               if (clear_value_size > 4 ||
>                   (clear_value_size == 4 &&
>                    offset % 4 == 0 &&
>                    (size > 32*1024 || sctx->chip_class <= VI))) {
>                       si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
>                                                   aligned_size, clear_value,
>                                                   clear_value_size, coher);
>               } else {
>                       assert(clear_value_size == 4);
> -                     si_cp_dma_clear_buffer(sctx, dst, offset,
> -                                            aligned_size, *clear_value, 
> coher,
> +                     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
> +                                            aligned_size, *clear_value, 0, 
> coher,
>                                              get_cache_policy(sctx, coher, 
> size));
>               }
>
>               offset += aligned_size;
>               size -= aligned_size;
>       }
>
>       /* Handle non-dword alignment. */
>       if (size) {
>               assert(dst);
> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c 
> b/src/gallium/drivers/radeonsi/si_cp_dma.c
> index 839b31b7fdf..33220d9f0fa 100644
> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
> @@ -47,25 +47,24 @@ static inline unsigned cp_dma_max_byte_count(struct 
> si_context *sctx)
>
>       /* make it aligned for optimal performance */
>       return max & ~(SI_CPDMA_ALIGNMENT - 1);
>   }
>
>
>   /* Emit a CP DMA packet to do a copy from one buffer to another, or to clear
>    * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, 
> src_va is a 32-bit
>    * clear value.
>    */
> -static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va,
> -                        uint64_t src_va, unsigned size, unsigned flags,
> -                        enum si_cache_policy cache_policy)
> +static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs,
> +                        uint64_t dst_va, uint64_t src_va, unsigned size,
> +                        unsigned flags, enum si_cache_policy cache_policy)
>   {
> -     struct radeon_cmdbuf *cs = sctx->gfx_cs;
>       uint32_t header = 0, command = 0;
>
>       assert(size <= cp_dma_max_byte_count(sctx));
>       assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
>
>       if (sctx->chip_class >= GFX9)
>               command |= S_414_BYTE_COUNT_GFX9(size);
>       else
>               command |= S_414_BYTE_COUNT_GFX6(size);
>
> @@ -139,21 +138,21 @@ static void si_emit_cp_dma(struct si_context *sctx, 
> uint64_t dst_va,
>   }
>
>   void si_cp_dma_wait_for_idle(struct si_context *sctx)
>   {
>       /* Issue a dummy DMA that copies zero bytes.
>        *
>        * The DMA engine will see that there's no work to do and skip this
>        * DMA request, however, the CP will see the sync flag and still wait
>        * for all DMAs to complete.
>        */
> -     si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
> +     si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
>   }
>
>   static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource 
> *dst,
>                             struct pipe_resource *src, unsigned byte_count,
>                             uint64_t remaining_size, unsigned user_flags,
>                             enum si_coherency coher, bool *is_first,
>                             unsigned *packet_flags)
>   {
>       /* Fast exit for a CPDMA prefetch. */
>       if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
> @@ -200,51 +199,53 @@ static void si_cp_dma_prepare(struct si_context *sctx, 
> struct pipe_resource *dst
>        */
>       if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) &&
>           byte_count == remaining_size) {
>               *packet_flags |= CP_DMA_SYNC;
>
>               if (coher == SI_COHERENCY_SHADER)
>                       *packet_flags |= CP_DMA_PFP_SYNC_ME;
>       }
>   }
>
> -void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource 
> *dst,
> -                         uint64_t offset, uint64_t size, unsigned value,
> -                         enum si_coherency coher,
> -                         enum si_cache_policy cache_policy)
> +void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf 
> *cs,
> +                         struct pipe_resource *dst, uint64_t offset,
> +                         uint64_t size, unsigned value, unsigned user_flags,
> +                         enum si_coherency coher, enum si_cache_policy 
> cache_policy)
>   {
>       struct r600_resource *rdst = r600_resource(dst);
>       uint64_t va = (rdst ? rdst->gpu_address : 0) + offset;
>       bool is_first = true;
>
>       assert(size && size % 4 == 0);
>
>       /* Mark the buffer range of destination as valid (initialized),
>        * so that transfer_map knows it should wait for the GPU when mapping
>        * that range. */
>       if (rdst)
>               util_range_add(&rdst->valid_buffer_range, offset, offset + 
> size);
>
>       /* Flush the caches. */
> -     sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> -                    SI_CONTEXT_CS_PARTIAL_FLUSH |
> -                    si_get_flush_flags(sctx, coher, cache_policy);
> +     if (rdst && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
> +             sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
> +                            SI_CONTEXT_CS_PARTIAL_FLUSH |
> +                            si_get_flush_flags(sctx, coher, cache_policy);
> +     }
>
>       while (size) {
>               unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
>               unsigned dma_flags = CP_DMA_CLEAR | (rdst ? 0 : 
> CP_DMA_DST_IS_GDS);
>
> -             si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, 0, coher,
> -                               &is_first, &dma_flags);
> +             si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, user_flags,
> +                               coher, &is_first, &dma_flags);
>
>               /* Emit the clear packet. */
> -             si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, 
> cache_policy);
> +             si_emit_cp_dma(sctx, cs, va, value, byte_count, dma_flags, 
> cache_policy);
>
>               size -= byte_count;
>               va += byte_count;
>       }
>
>       if (rdst && cache_policy != L2_BYPASS)
>               rdst->TC_L2_dirty = true;
>
>       /* If it's not a framebuffer fast clear... */
>       if (coher == SI_COHERENCY_SHADER)
> @@ -283,21 +284,21 @@ static void si_cp_dma_realign_engine(struct si_context 
> *sctx, unsigned size,
>                       return;
>
>               si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state);
>       }
>
>       si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
>                         &sctx->scratch_buffer->b.b, size, size, user_flags,
>                         coher, is_first, &dma_flags);
>
>       va = sctx->scratch_buffer->gpu_address;
> -     si_emit_cp_dma(sctx, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags,
> +     si_emit_cp_dma(sctx, sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, size, 
> dma_flags,
>                      cache_policy);
>   }
>
>   /**
>    * Do memcpy between buffers using CP DMA.
>    * If src or dst is NULL, it means read or write GDS, respectively.
>    *
>    * \param user_flags        bitmask of SI_CPDMA_*
>    */
>   void si_cp_dma_copy_buffer(struct si_context *sctx,
> @@ -366,37 +367,37 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
>       main_src_offset = src_offset + skipped_size;
>
>       while (size) {
>               unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
>               unsigned dma_flags = gds_flags;
>
>               si_cp_dma_prepare(sctx, dst, src, byte_count,
>                                 size + skipped_size + realign_size,
>                                 user_flags, coher, &is_first, &dma_flags);
>
> -             si_emit_cp_dma(sctx, main_dst_offset, main_src_offset,
> +             si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, 
> main_src_offset,
>                              byte_count, dma_flags, cache_policy);
>
>               size -= byte_count;
>               main_src_offset += byte_count;
>               main_dst_offset += byte_count;
>       }
>
>       /* Copy the part we skipped because src wasn't aligned. */
>       if (skipped_size) {
>               unsigned dma_flags = gds_flags;
>
>               si_cp_dma_prepare(sctx, dst, src, skipped_size,
>                                 skipped_size + realign_size, user_flags,
>                                 coher, &is_first, &dma_flags);
>
> -             si_emit_cp_dma(sctx, dst_offset, src_offset, skipped_size,
> +             si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, src_offset, 
> skipped_size,
>                              dma_flags, cache_policy);
>       }
>
>       /* Finally, realign the engine if the size wasn't aligned. */
>       if (realign_size) {
>               si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher,
>                                        cache_policy, &is_first);
>       }
>
>       if (dst && cache_policy != L2_BYPASS)
> @@ -546,35 +547,35 @@ void cik_emit_prefetch_L2(struct si_context *sctx, bool 
> vertex_stage_only)
>
>   void si_test_gds(struct si_context *sctx)
>   {
>       struct pipe_context *ctx = &sctx->b;
>       struct pipe_resource *src, *dst;
>       unsigned r[4] = {};
>       unsigned offset = debug_get_num_option("OFFSET", 16);
>
>       src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
>       dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16);
> -     si_cp_dma_clear_buffer(sctx, src, 0, 4, 0xabcdef01, 
> SI_COHERENCY_SHADER, L2_BYPASS);
> -     si_cp_dma_clear_buffer(sctx, src, 4, 4, 0x23456789, 
> SI_COHERENCY_SHADER, L2_BYPASS);
> -     si_cp_dma_clear_buffer(sctx, src, 8, 4, 0x87654321, 
> SI_COHERENCY_SHADER, L2_BYPASS);
> -     si_cp_dma_clear_buffer(sctx, src, 12, 4, 0xfedcba98, 
> SI_COHERENCY_SHADER, L2_BYPASS);
> -     si_cp_dma_clear_buffer(sctx, dst, 0, 16, 0xdeadbeef, 
> SI_COHERENCY_SHADER, L2_BYPASS);
> +     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 0, 4, 0xabcdef01, 0, 
> SI_COHERENCY_SHADER, L2_BYPASS);
> +     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 4, 4, 0x23456789, 0, 
> SI_COHERENCY_SHADER, L2_BYPASS);
> +     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 8, 4, 0x87654321, 0, 
> SI_COHERENCY_SHADER, L2_BYPASS);
> +     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 12, 4, 0xfedcba98, 0, 
> SI_COHERENCY_SHADER, L2_BYPASS);
> +     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, 16, 0xdeadbeef, 0, 
> SI_COHERENCY_SHADER, L2_BYPASS);
>
>       si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, 0, 
> SI_COHERENCY_NONE, L2_BYPASS);
>       si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, 
> SI_COHERENCY_NONE, L2_BYPASS);
>
>       pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
>       printf("GDS copy  = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], 
> r[3],
>                       r[0] == 0xabcdef01 && r[1] == 0x23456789 &&
>                       r[2] == 0x87654321 && r[3] == 0xfedcba98 ? "pass" : 
> "fail");
>
> -     si_cp_dma_clear_buffer(sctx, NULL, offset, 16, 0xc1ea4146, 
> SI_COHERENCY_NONE, L2_BYPASS);
> +     si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, NULL, offset, 16, 
> 0xc1ea4146, 0, SI_COHERENCY_NONE, L2_BYPASS);
>       si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, 
> SI_COHERENCY_NONE, L2_BYPASS);
>
>       pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
>       printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], 
> r[3],
>                       r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 &&
>                       r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146 ? "pass" : 
> "fail");
>
>       pipe_resource_reference(&src, NULL);
>       pipe_resource_reference(&dst, NULL);
>       exit(0);
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h 
> b/src/gallium/drivers/radeonsi/si_pipe.h
> index 023e0f0a0f9..3ec645f9c71 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -1152,24 +1152,24 @@ void si_init_compute_blit_functions(struct si_context 
> *sctx);
>   #define SI_CPDMA_SKIP_SYNC_BEFORE   (1 << 2) /* don't wait for DMA before 
> the copy (RAW hazards) */
>   #define SI_CPDMA_SKIP_GFX_SYNC              (1 << 3) /* don't flush caches 
> and don't wait for PS/CS */
>   #define SI_CPDMA_SKIP_BO_LIST_UPDATE        (1 << 4) /* don't update the BO 
> list */
>   #define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \
>                          SI_CPDMA_SKIP_SYNC_AFTER | \
>                          SI_CPDMA_SKIP_SYNC_BEFORE | \
>                          SI_CPDMA_SKIP_GFX_SYNC | \
>                          SI_CPDMA_SKIP_BO_LIST_UPDATE)
>
>   void si_cp_dma_wait_for_idle(struct si_context *sctx);
> -void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource 
> *dst,
> -                         uint64_t offset, uint64_t size, unsigned value,
> -                         enum si_coherency coher,
> -                         enum si_cache_policy cache_policy);
> +void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf 
> *cs,
> +                         struct pipe_resource *dst, uint64_t offset,
> +                         uint64_t size, unsigned value, unsigned user_flags,
> +                         enum si_coherency coher, enum si_cache_policy 
> cache_policy);
>   void si_cp_dma_copy_buffer(struct si_context *sctx,
>                          struct pipe_resource *dst, struct pipe_resource *src,
>                          uint64_t dst_offset, uint64_t src_offset, unsigned 
> size,
>                          unsigned user_flags, enum si_coherency coher,
>                          enum si_cache_policy cache_policy);
>   void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource 
> *buf,
>                             uint64_t offset, unsigned size);
>   void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
>   void si_test_gds(struct si_context *sctx);
>
> diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c 
> b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
> index 6c04720e963..657c4ebeff8 100644
> --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
> +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
> @@ -174,21 +174,22 @@ void si_test_dma_perf(struct si_screen *sscreen)
>                               for (unsigned iter = 0; iter < NUM_RUNS; 
> iter++) {
>                                       q[iter] = ctx->create_query(ctx, 
> query_type, 0);
>                                       ctx->begin_query(ctx, q[iter]);
>
>                                       if (test_cp) {
>                                               /* CP DMA */
>                                               if (is_copy) {
>                                                       
> si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0,
>                                                                             
> SI_COHERENCY_NONE, cache_policy);
>                                               } else {
> -                                                     
> si_cp_dma_clear_buffer(sctx, dst, 0, size, clear_value,
> +                                                     
> si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size,
> +                                                                            
> clear_value, 0,
>                                                                              
> SI_COHERENCY_NONE, cache_policy);
>                                               }
>                                       } else if (test_sdma) {
>                                               /* SDMA */
>                                               if (is_copy) {
>                                                       struct pipe_box box;
>                                                       u_box_1d(0, size, &box);
>                                                       sctx->dma_copy(ctx, 
> dst, 0, 0, 0, 0, src, 0, &box);
>                                               } else {
>                                                       
> si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 1/3] radeonsi: allow si_cp_dma_clear_buffer to clear GDS from any IB

Reply via email to