[dpdk-dev] [dpdk-dev,v2] Clean up rte_memcpy.h file
> -----Original Message----- > From: Ravi Kerur [mailto:rkerur at gmail.com] > Sent: Saturday, February 27, 2016 10:06 PM > To: Wang, Zhihong > Cc: dev at dpdk.org > Subject: Re: [dpdk-dev,v2] Clean up rte_memcpy.h file > > > > On Wed, Jan 27, 2016 at 8:18 PM, Zhihong Wang > wrote: > > Remove unnecessary type casting in functions. > > > > Tested on Ubuntu (14.04 x86_64) with "make test". > > "make test" results match the results with baseline. > > "Memcpy perf" results match the results with baseline. > > > > Signed-off-by: Ravi Kerur > > Acked-by: Stephen Hemminger > > > > --- > > .../common/include/arch/x86/rte_memcpy.h | 340 +++--- > --- > > 1 file changed, 175 insertions(+), 165 deletions(-) > > > > diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > > index 6a57426..839d4ec 100644 > > --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > > +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > > [...] > > > /** > > @@ -150,13 +150,16 @@ rte_mov64blocks(uint8_t *dst, const uint8_t *src, > size_t n) > > __m256i ymm0, ymm1; > > > > while (n >= 64) { > > - ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t > *)src + 0 * 32)); > > + > > + ymm0 = _mm256_loadu_si256((const __m256i *)(src + 0 * 32)); > > + ymm1 = _mm256_loadu_si256((const __m256i *)(src + 1 * 32)); > > + > > + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), ymm0); > > + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), ymm1); > > + > > Any particular reason to change the order of the statements here? :) > Overall this patch looks good. > > I checked the code changes; the initial code had moving addresses (src and dst) > and > decrement counter scattered between store and load instructions. I changed it > to > loads, followed by stores and handle address/counters increment/decrement > without changing functionality. 
> It's definitely okay to do this. Actually, changing it or not won't affect the final output at all since gcc will optimize it while generating code. It's C code we're writing after all. But personally I prefer to keep the original order just as a comment that what's needed in the future should be calculated ASAP, and different kinds (CPU port) of instructions should be mixed together. :) Could you please rebase this patch since there have been some changes already? > > n -= 64; > > - ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t > *)src + 1 * 32)); > > - src = (const uint8_t *)src + 64; > > - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), > ymm0); > > - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), > ymm1); > > - dst = (uint8_t *)dst + 64; > > + src = src + 64; > > + dst = dst + 64; > > } > > } > >
[dpdk-dev] [dpdk-dev,v2] Clean up rte_memcpy.h file
On Wed, Jan 27, 2016 at 8:18 PM, Zhihong Wang wrote: > > Remove unnecessary type casting in functions. > > > > Tested on Ubuntu (14.04 x86_64) with "make test". > > "make test" results match the results with baseline. > > "Memcpy perf" results match the results with baseline. > > > > Signed-off-by: Ravi Kerur > > Acked-by: Stephen Hemminger > > > > --- > > .../common/include/arch/x86/rte_memcpy.h | 340 > +++-- > > 1 file changed, 175 insertions(+), 165 deletions(-) > > > > diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > > index 6a57426..839d4ec 100644 > > --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > > +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > > [...] > > > /** > > @@ -150,13 +150,16 @@ rte_mov64blocks(uint8_t *dst, const uint8_t *src, > size_t n) > > __m256i ymm0, ymm1; > > > > while (n >= 64) { > > - ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t > *)src + 0 * 32)); > > + > > + ymm0 = _mm256_loadu_si256((const __m256i *)(src + 0 * 32)); > > + ymm1 = _mm256_loadu_si256((const __m256i *)(src + 1 * 32)); > > + > > + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), ymm0); > > + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), ymm1); > > + > > Any particular reason to change the order of the statements here? :) > Overall this patch looks good. > Sorry for the late response. Let me double check and get back to you, it's been a while since I did the changes. > > n -= 64; > > - ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t > *)src + 1 * 32)); > > - src = (const uint8_t *)src + 64; > > - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), > ymm0); > > - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), > ymm1); > > - dst = (uint8_t *)dst + 64; > > + src = src + 64; > > + dst = dst + 64; > > } > > } > > > >
[dpdk-dev] [dpdk-dev,v2] Clean up rte_memcpy.h file
> Remove unnecessary type casting in functions. > > Tested on Ubuntu (14.04 x86_64) with "make test". > "make test" results match the results with baseline. > "Memcpy perf" results match the results with baseline. > > Signed-off-by: Ravi Kerur > Acked-by: Stephen Hemminger > > --- > .../common/include/arch/x86/rte_memcpy.h | 340 +++-- > 1 file changed, 175 insertions(+), 165 deletions(-) > > diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > index 6a57426..839d4ec 100644 > --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h > +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h [...] > /** > @@ -150,13 +150,16 @@ rte_mov64blocks(uint8_t *dst, const uint8_t *src, > size_t n) > __m256i ymm0, ymm1; > > while (n >= 64) { > - ymm0 = _mm256_loadu_si256((const __m256i *)((const uint8_t > *)src + 0 * 32)); > + > + ymm0 = _mm256_loadu_si256((const __m256i *)(src + 0 * 32)); > + ymm1 = _mm256_loadu_si256((const __m256i *)(src + 1 * 32)); > + > + _mm256_storeu_si256((__m256i *)(dst + 0 * 32), ymm0); > + _mm256_storeu_si256((__m256i *)(dst + 1 * 32), ymm1); > + Any particular reason to change the order of the statements here? :) Overall this patch looks good. > n -= 64; > - ymm1 = _mm256_loadu_si256((const __m256i *)((const uint8_t > *)src + 1 * 32)); > - src = (const uint8_t *)src + 64; > - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 0 * 32), ymm0); > - _mm256_storeu_si256((__m256i *)((uint8_t *)dst + 1 * 32), ymm1); > - dst = (uint8_t *)dst + 64; > + src = src + 64; > + dst = dst + 64; > } > } >