Hi Jianan, Here are the LZ4 decompression times I got running "time unlz4 ..." on Rock 4 SE with RK3399 CPU.
v2024.04: 1.329 seconds v2024.04 with 26c7fdadcb7 ("lib/lz4: update LZ4 decompressor module") reverted: 0.665 seconds v2024.04 with your patch: 1.216 seconds I managed to get better performance by optimizing it myself. v2024.04 with my patch: 0.785 seconds With my patch, it makes no difference if I use __builtin_memcpy or memcpy in lz4.c and lz4_wrapper.c so I just left it using memcpy. It is still slower than reverting the LZ4 update though. My patch: --- a/lib/lz4.c +++ b/lib/lz4.c @@ -41,6 +41,16 @@ static FORCE_INLINE u16 LZ4_readLE16(const void *src) return get_unaligned_le16(src); } +static FORCE_INLINE void LZ4_copy2(void *dst, const void *src) +{ + put_unaligned(get_unaligned((const u16 *)src), (u16 *)dst); +} + +static FORCE_INLINE void LZ4_copy4(void *dst, const void *src) +{ + put_unaligned(get_unaligned((const u32 *)src), (u32 *)dst); +} + static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) { put_unaligned(get_unaligned((const u64 *)src), (u64 *)dst); @@ -215,7 +225,10 @@ static FORCE_INLINE int LZ4_decompress_generic( && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend))) { /* Copy the literals */ - memcpy(op, ip, endOnInput ? 16 : 8); + LZ4_copy8(op, ip); + if (endOnInput) + LZ4_copy8(op + 8, ip + 8); + op += length; ip += length; /* @@ -234,9 +247,9 @@ static FORCE_INLINE int LZ4_decompress_generic( (offset >= 8) && (dict == withPrefix64k || match >= lowPrefix)) { /* Copy the match. */ - memcpy(op + 0, match + 0, 8); - memcpy(op + 8, match + 8, 8); - memcpy(op + 16, match + 16, 2); + LZ4_copy8(op, match); + LZ4_copy8(op + 8, match + 8); + LZ4_copy2(op + 16, match + 16); op += length + MINMATCH; /* Both stages worked, load the next token. */ continue; @@ -466,7 +479,7 @@ _copy_match: op[2] = match[2]; op[3] = match[3]; match += inc32table[offset]; - memcpy(op + 4, match, 4); + LZ4_copy4(op + 4, match); match -= dec64table[offset]; } else { LZ4_copy8(op, match); Let me know if you have any further suggestions. Thanks. Regards, Jonathan On Sun, 26 May 2024 at 22:18, Jianan Huang <jnhuan...@gmail.com> wrote: > > Hi Jonathan, > > Could you please try the following patch ? It replaces all memcpy() calls in > lz4 with __builtin_memcpy(). > > diff --git a/lib/lz4.c b/lib/lz4.c > index d365dc727c..2afe31c1c3 100644 > --- a/lib/lz4.c > +++ b/lib/lz4.c > @@ -34,6 +34,8 @@ > #include <asm/unaligned.h> > #include <u-boot/lz4.h> > > +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) > + > #define FORCE_INLINE inline __attribute__((always_inline)) > > static FORCE_INLINE u16 LZ4_readLE16(const void *src) > @@ -215,7 +217,7 @@ static FORCE_INLINE int LZ4_decompress_generic( > && likely((endOnInput ? ip < shortiend : 1) & > (op <= shortoend))) { > /* Copy the literals */ > - memcpy(op, ip, endOnInput ? 16 : 8); > + LZ4_memcpy(op, ip, endOnInput ? 16 : 8); > op += length; ip += length; > > /* > @@ -234,9 +236,9 @@ static FORCE_INLINE int LZ4_decompress_generic( > (offset >= 8) && > (dict == withPrefix64k || match >= lowPrefix)) { > /* Copy the match. */ > - memcpy(op + 0, match + 0, 8); > - memcpy(op + 8, match + 8, 8); > - memcpy(op + 16, match + 16, 2); > + LZ4_memcpy(op + 0, match + 0, 8); > + LZ4_memcpy(op + 8, match + 8, 8); > + LZ4_memcpy(op + 16, match + 16, 2); > op += length + MINMATCH; > /* Both stages worked, load the next token. */ > continue; > @@ -416,7 +418,7 @@ _copy_match: > size_t const copySize = (size_t)(lowPrefix - match); > size_t const restSize = length - copySize; > > - memcpy(op, dictEnd - copySize, copySize); > + LZ4_memcpy(op, dictEnd - copySize, copySize); > op += copySize; > if (restSize > (size_t)(op - lowPrefix)) { > /* overlap copy */ > @@ -426,7 +428,7 @@ _copy_match: > while (op < endOfMatch) > *op++ = *copyFrom++; > } else { > - memcpy(op, lowPrefix, restSize); > + LZ4_memcpy(op, lowPrefix, restSize); > op += restSize; > } > } > @@ -452,7 +454,7 @@ _copy_match: > while (op < copyEnd) > *op++ = *match++; > } else { > - memcpy(op, match, mlen); > + LZ4_memcpy(op, match, mlen); > } > op = copyEnd; > if (op == oend) > @@ -466,7 +468,7 @@ _copy_match: > op[2] = match[2]; > op[3] = match[3]; > match += inc32table[offset]; > - memcpy(op + 4, match, 4); > + LZ4_memcpy(op + 4, match, 4); > match -= dec64table[offset]; > } else { > LZ4_copy8(op, match); > diff --git a/lib/lz4_wrapper.c b/lib/lz4_wrapper.c > index 4d48e7b0e8..e09c8d7057 100644 > --- a/lib/lz4_wrapper.c > +++ b/lib/lz4_wrapper.c > @@ -80,7 +80,7 @@ int ulz4fn(const void *src, size_t srcn, void *dst, size_t > *dstn) > > if (block_header & LZ4F_BLOCKUNCOMPRESSED_FLAG) { > size_t size = min((ptrdiff_t)block_size, (ptrdiff_t)(end - out)); > - memcpy(out, in, size); > + LZ4_memcpy(out, in, size); > out += size; > if (size < block_size) { > ret = -ENOBUFS; /* output overrun */ > > > Thanks, > > Jianan > > On 2024/5/26 16:06, Jonathan Liu wrote: > > Hi Gao, > > On Sat, 25 May 2024 at 02:52, Gao Xiang <hsiang...@linux.alibaba.com> wrote: > > Hi, > > On 2024/5/24 22:26, Jonathan Liu wrote: > > Hi Jianan, > > On Sat, 26 Feb 2022 at 18:05, Huang Jianan <jnhuan...@gmail.com> wrote: > > Update the LZ4 compression module based on LZ4 v1.8.3 in order to > use the newest LZ4_decompress_safe_partial() which can now decode > exactly the nb of bytes requested. > > Signed-off-by: Huang Jianan <jnhuan...@gmail.com> > > I noticed after this commit LZ4 decompression is slower. > ulz4fn function call takes 1.209670 seconds with this commit. > After reverting this commit, the ulz4fn function call takes 0.587032 seconds. > > I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4 > using -9 option for maximum compression) on RK3399. > > Any ideas why it is slower with this commit and how the performance > regression can be fixed? > > Just the quick glance, I think the issue may be due to memcpy/memmove > since it seems the main difference between these two codebases > (I'm not sure which LZ4 version the old codebase was based on) and > the new version mainly relies on memcpy/memmove instead of its own > versions. > > Would you mind to check the assembly how memcpy/memset is generated > on your platform? > > Here is the assembly (-mcpu=cortex-a72.cortex-a53 -march=armv8-a+crc+crypto): > 000000000028220c <memset>: > #if !CONFIG_IS_ENABLED(TINY_MEMSET) > unsigned long cl = 0; > int i; > > /* do it one word at a time (32 bits or 64 bits) while possible */ > if ( ((ulong)s & (sizeof(*sl) - 1)) == 0) { > 28220c: f2400803 ands x3, x0, #0x7 > 282210: 540002c1 b.ne 282268 <memset+0x5c> // b.any > for (i = 0; i < sizeof(*sl); i++) { > cl <<= 8; > cl |= c & 0xff; > 282214: 92401c26 and x6, x1, #0xff > unsigned long cl = 0; > 282218: d2800004 mov x4, #0x0 // #0 > 28221c: 52800105 mov w5, #0x8 // #8 > cl |= c & 0xff; > 282220: aa0420c4 orr x4, x6, x4, lsl #8 > for (i = 0; i < sizeof(*sl); i++) { > 282224: 710004a5 subs w5, w5, #0x1 > 282228: 54ffffc1 b.ne 282220 <memset+0x14> // b.any > } > while (count >= sizeof(*sl)) { > 28222c: cb030045 sub x5, x2, x3 > 282230: f1001cbf cmp x5, #0x7 > 282234: 54000148 b.hi 28225c <memset+0x50> // b.pmore > 282238: d343fc43 lsr x3, x2, #3 > 28223c: 928000e4 mov x4, #0xfffffffffffffff8 // #-8 > 282240: 9b047c63 mul x3, x3, x4 > 282244: 8b030042 add x2, x2, x3 > 282248: cb030003 sub x3, x0, x3 > unsigned long *sl = (unsigned long *) s; > 28224c: d2800004 mov x4, #0x0 // #0 > count -= sizeof(*sl); > } > } > #endif /* fill 8 bits at a time */ > s8 = (char *)sl; > while (count--) > 282250: eb04005f cmp x2, x4 > 282254: 540000e1 b.ne 282270 <memset+0x64> // b.any > *s8++ = c; > > return s; > } > 282258: d65f03c0 ret > *sl++ = cl; > 28225c: f8236804 str x4, [x0, x3] > count -= sizeof(*sl); > 282260: 91002063 add x3, x3, #0x8 > 282264: 17fffff2 b 28222c <memset+0x20> > unsigned long *sl = (unsigned long *) s; > 282268: aa0003e3 mov x3, x0 > 28226c: 17fffff8 b 28224c <memset+0x40> > *s8++ = c; > 282270: 38246861 strb w1, [x3, x4] > 282274: 91000484 add x4, x4, #0x1 > 282278: 17fffff6 b 282250 <memset+0x44> > > 000000000028227c <memcpy>: > __used void * memcpy(void *dest, const void *src, size_t count) > { > unsigned long *dl = (unsigned long *)dest, *sl = (unsigned long *)src; > char *d8, *s8; > > if (src == dest) > 28227c: eb01001f cmp x0, x1 > 282280: 54000100 b.eq 2822a0 <memcpy+0x24> // b.none > return dest; > > /* while all data is aligned (common case), copy a word at a time */ > if ( (((ulong)dest | (ulong)src) & (sizeof(*dl) - 1)) == 0) { > 282284: aa010003 orr x3, x0, x1 > 282288: f2400863 ands x3, x3, #0x7 > 28228c: 54000120 b.eq 2822b0 <memcpy+0x34> // b.none > 282290: aa0003e4 mov x4, x0 > 282294: d2800003 mov x3, #0x0 // #0 > } > } > /* copy the reset one byte at a time */ > d8 = (char *)dl; > s8 = (char *)sl; > while (count--) > 282298: eb03005f cmp x2, x3 > 28229c: 540001e1 b.ne 2822d8 <memcpy+0x5c> // b.any > *d8++ = *s8++; > > return dest; > } > 2822a0: d65f03c0 ret > *dl++ = *sl++; > 2822a4: f8636824 ldr x4, [x1, x3] > 2822a8: f8236804 str x4, [x0, x3] > count -= sizeof(*dl); > 2822ac: 91002063 add x3, x3, #0x8 > while (count >= sizeof(*dl)) { > 2822b0: cb030044 sub x4, x2, x3 > 2822b4: f1001c9f cmp x4, #0x7 > 2822b8: 54ffff68 b.hi 2822a4 <memcpy+0x28> // b.pmore > 2822bc: d343fc43 lsr x3, x2, #3 > 2822c0: 928000e4 mov x4, #0xfffffffffffffff8 // #-8 > 2822c4: 9b047c63 mul x3, x3, x4 > 2822c8: 8b030042 add x2, x2, x3 > 2822cc: cb030004 sub x4, x0, x3 > 2822d0: cb030021 sub x1, x1, x3 > 2822d4: 17fffff0 b 282294 <memcpy+0x18> > *d8++ = *s8++; > 2822d8: 38636825 ldrb w5, [x1, x3] > 2822dc: 38236885 strb w5, [x4, x3] > 2822e0: 91000463 add x3, x3, #0x1 > 2822e4: 17ffffed b 282298 <memcpy+0x1c> > > > I tried enabling CONFIG_USE_ARCH_MEMCPY=y, CONFIG_USE_ARCH_MEMSET=y in > the .config (but leaving it disabled in SPL/TPL) and it results in > Synchronous Abort: > U-Boot SPL 2024.04 (Apr 02 2024 - 10:58:58 +0000) > Trying to boot from MMC1 > NOTICE: BL31: v1.3(release):8f40012ab > NOTICE: BL31: Built : 14:20:53, Feb 16 2023 > NOTICE: BL31: Rockchip release version: v1.1 > INFO: GICv3 with legacy support detected. ARM GICV3 driver initialized in > EL3 > INFO: Using opteed sec cpu_context! > INFO: boot cpu mask: 0 > INFO: plat_rockchip_pmu_init(1203): pd status 3e > INFO: BL31: Initializing runtime services > WARNING: No OPTEE provided by BL2 boot loader, Booting device without > OPTEE initialization. SMC`s destined for OPTEE will return SMC_UNK > ERROR: Error initializing runtime service opteed_fast > INFO: BL31: Preparing for EL3 exit to normal world > INFO: Entry point address = 0x200000 > INFO: SPSR = 0x3c9 > "Synchronous Abort" handler, esr 0x96000021, far 0x2957e1 > elr: 000000000020233c lr : 000000000026a388 > x0 : 00000000002fbf38 x1 : 00000000002957e1 > x2 : 0000000000000008 x3 : 0000000000000065 > x4 : 00000000002957e9 x5 : 00000000002fbf40 > x6 : 0000000000000065 x7 : 0000000000000003 > x8 : 00000000002c7960 x9 : 000000000000ffd0 > x10: 00000000002fbc5c x11: 00000000000132e8 > x12: 00000000002fbce8 x13: 00000000002c7960 > x14: 00000000002c7960 x15: 0000000000000000 > x16: 0000000000000000 x17: 0000000000000000 > x18: 00000000002fbe30 x19: 0000000000000007 > x20: 00000000002957d8 x21: 0000000000000009 > x22: 000000000029d189 x23: 0000000000000020 > x24: 00000000002fbf38 x25: 00000000002957e7 > x26: 00000000002957b2 x27: 0000000000007fff > x28: 0000000000000000 x29: 00000000002fbcc0 > > Code: a9001c06 a93f34ac d65f03c0 361800c2 (f9400026) > Resetting CPU ... > > resetting ... > > Thanks, > Gao Xiang > > Regards, > Jonathan