Hi Gao,

On Sat, 25 May 2024 at 02:52, Gao Xiang <hsiang...@linux.alibaba.com> wrote:
>
> Hi,
>
> On 2024/5/24 22:26, Jonathan Liu wrote:
> > Hi Jianan,
> >
> > On Sat, 26 Feb 2022 at 18:05, Huang Jianan <jnhuan...@gmail.com> wrote:
> >>
> >> Update the LZ4 compression module based on LZ4 v1.8.3 in order to
> >> use the newest LZ4_decompress_safe_partial() which can now decode
> >> exactly the nb of bytes requested.
> >>
> >> Signed-off-by: Huang Jianan <jnhuan...@gmail.com>
> >
> > I noticed after this commit LZ4 decompression is slower.
> > ulz4fn function call takes 1.209670 seconds with this commit.
> > After reverting this commit, the ulz4fn function call takes 0.587032
> > seconds.
> >
> > I am decompressing an LZ4-compressed kernel (compressed with lz4 v1.9.4
> > using -9 option for maximum compression) on RK3399.
> >
> > Any ideas why it is slower with this commit and how the performance
> > regression can be fixed?
>
> Just from a quick glance, I think the issue may be due to memcpy/memmove
> since it seems the main difference between these two codebases
> (I'm not sure which LZ4 version the old codebase was based on) and
> the new version mainly relies on memcpy/memmove instead of its own
> versions.
>
> Would you mind to check the assembly how memcpy/memset is generated > on your platform? Here is the assembly (-mcpu=cortex-a72.cortex-a53 -march=armv8-a+crc+crypto): 000000000028220c <memset>: #if !CONFIG_IS_ENABLED(TINY_MEMSET) unsigned long cl = 0; int i; /* do it one word at a time (32 bits or 64 bits) while possible */ if ( ((ulong)s & (sizeof(*sl) - 1)) == 0) { 28220c: f2400803 ands x3, x0, #0x7 282210: 540002c1 b.ne 282268 <memset+0x5c> // b.any for (i = 0; i < sizeof(*sl); i++) { cl <<= 8; cl |= c & 0xff; 282214: 92401c26 and x6, x1, #0xff unsigned long cl = 0; 282218: d2800004 mov x4, #0x0 // #0 28221c: 52800105 mov w5, #0x8 // #8 cl |= c & 0xff; 282220: aa0420c4 orr x4, x6, x4, lsl #8 for (i = 0; i < sizeof(*sl); i++) { 282224: 710004a5 subs w5, w5, #0x1 282228: 54ffffc1 b.ne 282220 <memset+0x14> // b.any } while (count >= sizeof(*sl)) { 28222c: cb030045 sub x5, x2, x3 282230: f1001cbf cmp x5, #0x7 282234: 54000148 b.hi 28225c <memset+0x50> // b.pmore 282238: d343fc43 lsr x3, x2, #3 28223c: 928000e4 mov x4, #0xfffffffffffffff8 // #-8 282240: 9b047c63 mul x3, x3, x4 282244: 8b030042 add x2, x2, x3 282248: cb030003 sub x3, x0, x3 unsigned long *sl = (unsigned long *) s; 28224c: d2800004 mov x4, #0x0 // #0 count -= sizeof(*sl); } } #endif /* fill 8 bits at a time */ s8 = (char *)sl; while (count--) 282250: eb04005f cmp x2, x4 282254: 540000e1 b.ne 282270 <memset+0x64> // b.any *s8++ = c; return s; } 282258: d65f03c0 ret *sl++ = cl; 28225c: f8236804 str x4, [x0, x3] count -= sizeof(*sl); 282260: 91002063 add x3, x3, #0x8 282264: 17fffff2 b 28222c <memset+0x20> unsigned long *sl = (unsigned long *) s; 282268: aa0003e3 mov x3, x0 28226c: 17fffff8 b 28224c <memset+0x40> *s8++ = c; 282270: 38246861 strb w1, [x3, x4] 282274: 91000484 add x4, x4, #0x1 282278: 17fffff6 b 282250 <memset+0x44> 000000000028227c <memcpy>: __used void * memcpy(void *dest, const void *src, size_t count) { unsigned long *dl = (unsigned long *)dest, *sl = (unsigned long *)src; char *d8, 
*s8; if (src == dest) 28227c: eb01001f cmp x0, x1 282280: 54000100 b.eq 2822a0 <memcpy+0x24> // b.none return dest; /* while all data is aligned (common case), copy a word at a time */ if ( (((ulong)dest | (ulong)src) & (sizeof(*dl) - 1)) == 0) { 282284: aa010003 orr x3, x0, x1 282288: f2400863 ands x3, x3, #0x7 28228c: 54000120 b.eq 2822b0 <memcpy+0x34> // b.none 282290: aa0003e4 mov x4, x0 282294: d2800003 mov x3, #0x0 // #0 } } /* copy the reset one byte at a time */ d8 = (char *)dl; s8 = (char *)sl; while (count--) 282298: eb03005f cmp x2, x3 28229c: 540001e1 b.ne 2822d8 <memcpy+0x5c> // b.any *d8++ = *s8++; return dest; } 2822a0: d65f03c0 ret *dl++ = *sl++; 2822a4: f8636824 ldr x4, [x1, x3] 2822a8: f8236804 str x4, [x0, x3] count -= sizeof(*dl); 2822ac: 91002063 add x3, x3, #0x8 while (count >= sizeof(*dl)) { 2822b0: cb030044 sub x4, x2, x3 2822b4: f1001c9f cmp x4, #0x7 2822b8: 54ffff68 b.hi 2822a4 <memcpy+0x28> // b.pmore 2822bc: d343fc43 lsr x3, x2, #3 2822c0: 928000e4 mov x4, #0xfffffffffffffff8 // #-8 2822c4: 9b047c63 mul x3, x3, x4 2822c8: 8b030042 add x2, x2, x3 2822cc: cb030004 sub x4, x0, x3 2822d0: cb030021 sub x1, x1, x3 2822d4: 17fffff0 b 282294 <memcpy+0x18> *d8++ = *s8++; 2822d8: 38636825 ldrb w5, [x1, x3] 2822dc: 38236885 strb w5, [x4, x3] 2822e0: 91000463 add x3, x3, #0x1 2822e4: 17ffffed b 282298 <memcpy+0x1c> I tried enabling CONFIG_USE_ARCH_MEMCPY=y, CONFIG_USE_ARCH_MEMSET=y in the .config (but leaving it disabled in SPL/TPL) and it results in Synchronous Abort: U-Boot SPL 2024.04 (Apr 02 2024 - 10:58:58 +0000) Trying to boot from MMC1 NOTICE: BL31: v1.3(release):8f40012ab NOTICE: BL31: Built : 14:20:53, Feb 16 2023 NOTICE: BL31: Rockchip release version: v1.1 INFO: GICv3 with legacy support detected. ARM GICV3 driver initialized in EL3 INFO: Using opteed sec cpu_context! 
INFO: boot cpu mask: 0 INFO: plat_rockchip_pmu_init(1203): pd status 3e INFO: BL31: Initializing runtime services WARNING: No OPTEE provided by BL2 boot loader, Booting device without OPTEE initialization. SMC`s destined for OPTEE will return SMC_UNK ERROR: Error initializing runtime service opteed_fast INFO: BL31: Preparing for EL3 exit to normal world INFO: Entry point address = 0x200000 INFO: SPSR = 0x3c9 "Synchronous Abort" handler, esr 0x96000021, far 0x2957e1 elr: 000000000020233c lr : 000000000026a388 x0 : 00000000002fbf38 x1 : 00000000002957e1 x2 : 0000000000000008 x3 : 0000000000000065 x4 : 00000000002957e9 x5 : 00000000002fbf40 x6 : 0000000000000065 x7 : 0000000000000003 x8 : 00000000002c7960 x9 : 000000000000ffd0 x10: 00000000002fbc5c x11: 00000000000132e8 x12: 00000000002fbce8 x13: 00000000002c7960 x14: 00000000002c7960 x15: 0000000000000000 x16: 0000000000000000 x17: 0000000000000000 x18: 00000000002fbe30 x19: 0000000000000007 x20: 00000000002957d8 x21: 0000000000000009 x22: 000000000029d189 x23: 0000000000000020 x24: 00000000002fbf38 x25: 00000000002957e7 x26: 00000000002957b2 x27: 0000000000007fff x28: 0000000000000000 x29: 00000000002fbcc0 Code: a9001c06 a93f34ac d65f03c0 361800c2 (f9400026) Resetting CPU ... resetting ... > > Thanks, > Gao Xiang Regards, Jonathan