> -----Original Message----- > From: Tuikov, Luben <luben.tui...@amd.com> > Sent: Wednesday, December 18, 2019 6:17 PM > To: amd-gfx@lists.freedesktop.org > Cc: Deucher, Alexander <alexander.deuc...@amd.com>; Koenig, Christian > <christian.koe...@amd.com> > Subject: Re: [PATCH] drm/amdgpu: simplify padding calculations (v3) > > This patch seems to have been dropped after posting and reposting. I'm not > sure why it keeps being dropped.
Dropped from what? Once it's been reviewed, add the RBs and go ahead and push it to amd-staging-drm-next. Alex > > In v3, I added an explanation in the commit text below the original commit > text of the single sentence of "Simplify padding calculations." Identical > diffstat as v2. > > Regards, > Luben > > On 2019-12-18 6:12 p.m., Luben Tuikov wrote: > > Simplify padding calculations. > > > > 1. Taking the remainder of a binary value when the divisor is a power > > of two, such as, a % 2^n is identical to, a & (2^n - 1) in base-2 > > arithmetic, and so expressions like this: > > > > (12 - (lower_32_bits(ring->wptr) & 7)) % 8 > > > > are superflous. And can be reduced to a single remainder-taking > > operation. > > > > 2. Subtracting the remainder modulo 8 out of 12 and then again taking > > the remainder modulo 8, yields the same result as simply subtracting > > the value out of 4 and then taking the remainder. > > This is true because 4 belongs to the congruence > > (residue) class {4 + 8*k}, k in Z. (So using, {..., -12, -4, 4, 12, > > 20, ...}, would yield the same final result. > > > > So the above expression, becomes, > > > > (4 - lower_32_bits(ring->wptr)) & 7 > > > > 3. Now since 8 is part of the conguence class > > {0 + 8*k}, k in Z, and from 1) and 2) above, then, > > > > (8 - (ib->length_dw & 0x7)) % 8 > > > > becomes, simply, > > > > (-ib->length_dw) & 7. > > > > This patch implements all of the above in this code. > > > > v2: Comment update and spacing. > > v3: Add 1, 2, 3, above, in this commit message. > > > > Signed-off-by: Luben Tuikov <luben.tui...@amd.com> > > --- > > drivers/gpu/drm/amd/amdgpu/cik_sdma.c | 4 ++-- > > drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c | 4 ++-- > > drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 4 ++-- > > drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 4 ++-- > > drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 17 ++++++++++++----- > > 5 files changed, 20 insertions(+), 13 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c > > b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c > > index 1f22a8d0f7f3..4274ccf765de 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c > > +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c > > @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct > amdgpu_ring *ring, > > u32 extra_bits = vmid & 0xf; > > > > /* IB packet must end on a 8 DW boundary */ > > - cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & > 7)) % 8); > > + cik_sdma_ring_insert_nop(ring, (4 - lower_32_bits(ring->wptr)) & 7); > > > > amdgpu_ring_write(ring, > SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits)); > > amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must > be > > 32 byte aligned */ @@ -811,7 +811,7 @@ static void > cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib) > > u32 pad_count; > > int i; > > > > - pad_count = (8 - (ib->length_dw & 0x7)) % 8; > > + pad_count = (-ib->length_dw) & 7; > > for (i = 0; i < pad_count; i++) > > if (sdma && sdma->burst_nop && (i == 0)) > > ib->ptr[ib->length_dw++] = > > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c > > b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c > > index 606b621145a1..fd7fa6082563 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c > > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c > > @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct > amdgpu_ring *ring, > > unsigned vmid = AMDGPU_JOB_GET_VMID(job); > > > > /* IB packet must end on a 8 DW boundary */ > > - sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & > 7)) % 8); > > + sdma_v2_4_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & > > +7); > > > > amdgpu_ring_write(ring, > SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) | > > SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf)); > @@ -750,7 +750,7 @@ > > static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct > amdgpu_ib *ib > > u32 pad_count; > > int i; > > > > - pad_count = (8 - (ib->length_dw & 0x7)) % 8; > > + pad_count = (-ib->length_dw) & 7; > > for (i = 0; i < pad_count; i++) > > if (sdma && sdma->burst_nop && (i == 0)) > > ib->ptr[ib->length_dw++] = > > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c > > b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c > > index a559573ec8fd..4a8a7f0f3a9c 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c > > @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct > amdgpu_ring *ring, > > unsigned vmid = AMDGPU_JOB_GET_VMID(job); > > > > /* IB packet must end on a 8 DW boundary */ > > - sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & > 7)) % 8); > > + sdma_v3_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & > > +7); > > > > amdgpu_ring_write(ring, > SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) | > > SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf)); > @@ -1021,7 +1021,7 > > @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct > amdgpu_ib *ib > > u32 pad_count; > > int i; > > > > - pad_count = (8 - (ib->length_dw & 0x7)) % 8; > > + pad_count = (-ib->length_dw) & 7; > > for (i = 0; i < pad_count; i++) > > if (sdma && sdma->burst_nop && (i == 0)) > > ib->ptr[ib->length_dw++] = > > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c > > b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c > > index bd9ed33bab43..c69df0cb21ec 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c > > @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct > amdgpu_ring *ring, > > unsigned vmid = AMDGPU_JOB_GET_VMID(job); > > > > /* IB packet must end on a 8 DW boundary */ > > - sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & > 7)) % 8); > > + sdma_v4_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & > > +7); > > > > amdgpu_ring_write(ring, > SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) | > > SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf)); > @@ -1579,7 +1579,7 > > @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct > amdgpu_ib *ib > > u32 pad_count; > > int i; > > > > - pad_count = (8 - (ib->length_dw & 0x7)) % 8; > > + pad_count = (-ib->length_dw) & 7; > > for (i = 0; i < pad_count; i++) > > if (sdma && sdma->burst_nop && (i == 0)) > > ib->ptr[ib->length_dw++] = > > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c > > b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c > > index 119364293cec..3912937f878f 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c > > @@ -382,8 +382,15 @@ static void sdma_v5_0_ring_emit_ib(struct > amdgpu_ring *ring, > > unsigned vmid = AMDGPU_JOB_GET_VMID(job); > > uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, > vmid); > > > > - /* IB packet must end on a 8 DW boundary */ > > - sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & > 7)) % 8); > > + /* An IB packet must end on a 8 DW boundary--the next dword > > + * must be on a 8-dword boundary. Our IB packet below is 6 > > + * dwords long, thus add x number of NOPs, such that, in > > + * modular arithmetic, > > + * wptr + 6 + x = 8k, k >= 0, which in C is, > > + * (wptr + 6 + x) % 8 = 0. > > + * The expression below, is a solution of x. > > + */ > > + sdma_v5_0_ring_insert_nop(ring, (2 - lower_32_bits(ring->wptr)) & > > +7); > > > > amdgpu_ring_write(ring, > SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) | > > SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf)); > @@ -1076,10 +1083,10 > > @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib, } > > > > /** > > - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw > > - * > > + * sdma_v5_0_ring_pad_ib - pad the IB > > * @ib: indirect buffer to fill with padding > > * > > + * Pad the IB with NOPs to a boundary multiple of 8. > > */ > > static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct > > amdgpu_ib *ib) { @@ -1087,7 +1094,7 @@ static void > > sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib > > u32 pad_count; > > int i; > > > > - pad_count = (8 - (ib->length_dw & 0x7)) % 8; > > + pad_count = (-ib->length_dw) & 0x7; > > for (i = 0; i < pad_count; i++) > > if (sdma && sdma->burst_nop && (i == 0)) > > ib->ptr[ib->length_dw++] = > > _______________________________________________ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx