Hi Gao,

On Sat, 25 May 2024 at 02:52, Gao Xiang <hsiang...@linux.alibaba.com> wrote:
>
> Hi,
>
> On 2024/5/24 22:26, Jonathan Liu wrote:
> > Hi Jianan,
> >
> > On Sat, 26 Feb 2022 at 18:05, Huang Jianan <jnhuan...@gmail.com> wrote:
> >>
> >> Update the LZ4 compression module based on LZ4 v1.8.3 in order to
> >> use the newest LZ4_decompress_safe_partial() which can now decode
> >> exactly the nb of bytes requested.
> >>
> >> Signed-off-by: Huang Jianan <jnhuan...@gmail.com>
> >
> > I noticed after this commit LZ4 decompression is slower.
> > ulz4fn function call takes 1.209670 seconds with this commit.
> > After reverting this commit, the ulz4fn function call takes 0.587032 
> > seconds.
> >
> > I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4
> > using -9 option for maximum compression) on RK3399.
> >
> > Any ideas why it is slower with this commit and how the performance
> > regression can be fixed?
>
> Just the quick glance, I think the issue may be due to memcpy/memmove
> since it seems the main difference between these two codebases
> (I'm not sure which LZ4 version the old codebase was based on) and
> the new version mainly relies on memcpy/memmove instead of its own
> versions.
>

> Would you mind to check the assembly how memcpy/memset is generated
> on your platform?

Here is the assembly (-mcpu=cortex-a72.cortex-a53 -march=armv8-a+crc+crypto):
000000000028220c <memset>:
#if !CONFIG_IS_ENABLED(TINY_MEMSET)
        unsigned long cl = 0;
        int i;

        /* do it one word at a time (32 bits or 64 bits) while possible */
        if ( ((ulong)s & (sizeof(*sl) - 1)) == 0) {
  28220c:       f2400803        ands    x3, x0, #0x7
  282210:       540002c1        b.ne    282268 <memset+0x5c>  // b.any
                for (i = 0; i < sizeof(*sl); i++) {
                        cl <<= 8;
                        cl |= c & 0xff;
  282214:       92401c26        and     x6, x1, #0xff
        unsigned long cl = 0;
  282218:       d2800004        mov     x4, #0x0                        // #0
  28221c:       52800105        mov     w5, #0x8                        // #8
                        cl |= c & 0xff;
  282220:       aa0420c4        orr     x4, x6, x4, lsl #8
                for (i = 0; i < sizeof(*sl); i++) {
  282224:       710004a5        subs    w5, w5, #0x1
  282228:       54ffffc1        b.ne    282220 <memset+0x14>  // b.any
                }
                while (count >= sizeof(*sl)) {
  28222c:       cb030045        sub     x5, x2, x3
  282230:       f1001cbf        cmp     x5, #0x7
  282234:       54000148        b.hi    28225c <memset+0x50>  // b.pmore
  282238:       d343fc43        lsr     x3, x2, #3
  28223c:       928000e4        mov     x4, #0xfffffffffffffff8         // #-8
  282240:       9b047c63        mul     x3, x3, x4
  282244:       8b030042        add     x2, x2, x3
  282248:       cb030003        sub     x3, x0, x3
        unsigned long *sl = (unsigned long *) s;
  28224c:       d2800004        mov     x4, #0x0                        // #0
                        count -= sizeof(*sl);
                }
        }
#endif  /* fill 8 bits at a time */
        s8 = (char *)sl;
        while (count--)
  282250:       eb04005f        cmp     x2, x4
  282254:       540000e1        b.ne    282270 <memset+0x64>  // b.any
                *s8++ = c;

        return s;
}
  282258:       d65f03c0        ret
                        *sl++ = cl;
  28225c:       f8236804        str     x4, [x0, x3]
                        count -= sizeof(*sl);
  282260:       91002063        add     x3, x3, #0x8
  282264:       17fffff2        b       28222c <memset+0x20>
        unsigned long *sl = (unsigned long *) s;
  282268:       aa0003e3        mov     x3, x0
  28226c:       17fffff8        b       28224c <memset+0x40>
                *s8++ = c;
  282270:       38246861        strb    w1, [x3, x4]
  282274:       91000484        add     x4, x4, #0x1
  282278:       17fffff6        b       282250 <memset+0x44>

000000000028227c <memcpy>:
__used void * memcpy(void *dest, const void *src, size_t count)
{
        unsigned long *dl = (unsigned long *)dest, *sl = (unsigned long *)src;
        char *d8, *s8;

        if (src == dest)
  28227c:       eb01001f        cmp     x0, x1
  282280:       54000100        b.eq    2822a0 <memcpy+0x24>  // b.none
                return dest;

        /* while all data is aligned (common case), copy a word at a time */
        if ( (((ulong)dest | (ulong)src) & (sizeof(*dl) - 1)) == 0) {
  282284:       aa010003        orr     x3, x0, x1
  282288:       f2400863        ands    x3, x3, #0x7
  28228c:       54000120        b.eq    2822b0 <memcpy+0x34>  // b.none
  282290:       aa0003e4        mov     x4, x0
  282294:       d2800003        mov     x3, #0x0                        // #0
                }
        }
        /* copy the reset one byte at a time */
        d8 = (char *)dl;
        s8 = (char *)sl;
        while (count--)
  282298:       eb03005f        cmp     x2, x3
  28229c:       540001e1        b.ne    2822d8 <memcpy+0x5c>  // b.any
                *d8++ = *s8++;

        return dest;
}
  2822a0:       d65f03c0        ret
                        *dl++ = *sl++;
  2822a4:       f8636824        ldr     x4, [x1, x3]
  2822a8:       f8236804        str     x4, [x0, x3]
                        count -= sizeof(*dl);
  2822ac:       91002063        add     x3, x3, #0x8
                while (count >= sizeof(*dl)) {
  2822b0:       cb030044        sub     x4, x2, x3
  2822b4:       f1001c9f        cmp     x4, #0x7
  2822b8:       54ffff68        b.hi    2822a4 <memcpy+0x28>  // b.pmore
  2822bc:       d343fc43        lsr     x3, x2, #3
  2822c0:       928000e4        mov     x4, #0xfffffffffffffff8         // #-8
  2822c4:       9b047c63        mul     x3, x3, x4
  2822c8:       8b030042        add     x2, x2, x3
  2822cc:       cb030004        sub     x4, x0, x3
  2822d0:       cb030021        sub     x1, x1, x3
  2822d4:       17fffff0        b       282294 <memcpy+0x18>
                *d8++ = *s8++;
  2822d8:       38636825        ldrb    w5, [x1, x3]
  2822dc:       38236885        strb    w5, [x4, x3]
  2822e0:       91000463        add     x3, x3, #0x1
  2822e4:       17ffffed        b       282298 <memcpy+0x1c>


I tried enabling CONFIG_USE_ARCH_MEMCPY=y, CONFIG_USE_ARCH_MEMSET=y in
the .config (but leaving it disabled in SPL/TPL) and it results in
Synchronous Abort:
U-Boot SPL 2024.04 (Apr 02 2024 - 10:58:58 +0000)
Trying to boot from MMC1
NOTICE:  BL31: v1.3(release):8f40012ab
NOTICE:  BL31: Built : 14:20:53, Feb 16 2023
NOTICE:  BL31: Rockchip release version: v1.1
INFO:    GICv3 with legacy support detected. ARM GICV3 driver initialized in EL3
INFO:    Using opteed sec cpu_context!
INFO:    boot cpu mask: 0
INFO:    plat_rockchip_pmu_init(1203): pd status 3e
INFO:    BL31: Initializing runtime services
WARNING: No OPTEE provided by BL2 boot loader, Booting device without
OPTEE initialization. SMC`s destined for OPTEE will return SMC_UNK
ERROR:   Error initializing runtime service opteed_fast
INFO:    BL31: Preparing for EL3 exit to normal world
INFO:    Entry point address = 0x200000
INFO:    SPSR = 0x3c9
"Synchronous Abort" handler, esr 0x96000021, far 0x2957e1
elr: 000000000020233c lr : 000000000026a388
x0 : 00000000002fbf38 x1 : 00000000002957e1
x2 : 0000000000000008 x3 : 0000000000000065
x4 : 00000000002957e9 x5 : 00000000002fbf40
x6 : 0000000000000065 x7 : 0000000000000003
x8 : 00000000002c7960 x9 : 000000000000ffd0
x10: 00000000002fbc5c x11: 00000000000132e8
x12: 00000000002fbce8 x13: 00000000002c7960
x14: 00000000002c7960 x15: 0000000000000000
x16: 0000000000000000 x17: 0000000000000000
x18: 00000000002fbe30 x19: 0000000000000007
x20: 00000000002957d8 x21: 0000000000000009
x22: 000000000029d189 x23: 0000000000000020
x24: 00000000002fbf38 x25: 00000000002957e7
x26: 00000000002957b2 x27: 0000000000007fff
x28: 0000000000000000 x29: 00000000002fbcc0

Code: a9001c06 a93f34ac d65f03c0 361800c2 (f9400026)
Resetting CPU ...

resetting ...

>
> Thanks,
> Gao Xiang

Regards,
Jonathan

Reply via email to