Re: [PATCH 2/2] ARM: dts: renesas: Drop R8A779H0 V4M DTs with OF_UPSTREAM counterparts

2024-05-26 Thread Sumit Garg
On Sun, 26 May 2024 at 23:31, Marek Vasut
 wrote:
>
> Remove R8A779H0 V4M DTs which are now replaced by OF_UPSTREAM counterparts.
> No functional change expected.
>
> This patch finalizes OF_UPSTREAM conversion of R8A779H0 V4M which DTs landed
> in Linux 6.9 .
>
> Signed-off-by: Marek Vasut 
> ---
> Cc: Adam Ford 
> Cc: Hai Pham 
> Cc: Paul Barker 
> Cc: Sumit Garg 
> Cc: Tom Rini 
> Cc: u-boot@lists.denx.de
> ---
>  arch/arm/dts/Makefile |   3 -
>  arch/arm/dts/r8a779h0-gray-hawk-cpu.dtsi  | 166 ---
>  arch/arm/dts/r8a779h0-gray-hawk-csi-dsi.dtsi  |  15 -
>  arch/arm/dts/r8a779h0-gray-hawk-ethernet.dtsi |  15 -
>  arch/arm/dts/r8a779h0-gray-hawk-u-boot.dtsi   |  41 --
>  arch/arm/dts/r8a779h0-gray-hawk.dts   |  25 -
>  arch/arm/dts/r8a779h0-u-boot.dtsi |  27 -
>  arch/arm/dts/r8a779h0.dtsi| 460 --
>  8 files changed, 752 deletions(-)
>  delete mode 100644 arch/arm/dts/r8a779h0-gray-hawk-cpu.dtsi
>  delete mode 100644 arch/arm/dts/r8a779h0-gray-hawk-csi-dsi.dtsi
>  delete mode 100644 arch/arm/dts/r8a779h0-gray-hawk-ethernet.dtsi
>  delete mode 100644 arch/arm/dts/r8a779h0-gray-hawk-u-boot.dtsi
>  delete mode 100644 arch/arm/dts/r8a779h0-gray-hawk.dts
>  delete mode 100644 arch/arm/dts/r8a779h0-u-boot.dtsi
>  delete mode 100644 arch/arm/dts/r8a779h0.dtsi
>

Acked-by: Sumit Garg 

-Sumit

> diff --git a/arch/arm/dts/Makefile b/arch/arm/dts/Makefile
> index f7032f1e175..6d5ff1bc27b 100644
> --- a/arch/arm/dts/Makefile
> +++ b/arch/arm/dts/Makefile
> @@ -958,9 +958,6 @@ dtb-$(CONFIG_ARCH_IMXRT) += imxrt1050-evk.dtb \
> imxrt1020-evk.dtb \
> imxrt1170-evk.dtb \
>
> -dtb-$(CONFIG_RCAR_GEN4) += \
> -   r8a779h0-gray-hawk.dtb
> -
>  dtb-$(CONFIG_TARGET_RZG2L) += \
> r9a07g044l2-smarc.dts
>
> diff --git a/arch/arm/dts/r8a779h0-gray-hawk-cpu.dtsi 
> b/arch/arm/dts/r8a779h0-gray-hawk-cpu.dtsi
> deleted file mode 100644
> index c8a46219826..000
> --- a/arch/arm/dts/r8a779h0-gray-hawk-cpu.dtsi
> +++ /dev/null
> @@ -1,166 +0,0 @@
> -// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> -/*
> - * Device Tree Source for the Gray Hawk CPU board
> - *
> - * Copyright (C) 2023 Renesas Electronics Corp.
> - */
> -
> -#include 
> -#include 
> -
> -#include "r8a779h0.dtsi"
> -
> -/ {
> -   model = "Renesas Gray Hawk CPU board";
> -   compatible = "renesas,grayhawk-cpu", "renesas,r8a779h0";
> -
> -   aliases {
> -   ethernet0 = 
> -   serial0 = 
> -   };
> -
> -   chosen {
> -   bootargs = "ignore_loglevel";
> -   stdout-path = "serial0:921600n8";
> -   };
> -
> -   memory@4800 {
> -   device_type = "memory";
> -   /* first 128MB is reserved for secure area. */
> -   reg = <0x0 0x4800 0x0 0x7800>;
> -   };
> -
> -   memory@48000 {
> -   device_type = "memory";
> -   reg = <0x4 0x8000 0x1 0x8000>;
> -   };
> -
> -   reg_1p8v: regulator-1p8v {
> -   compatible = "regulator-fixed";
> -   regulator-name = "fixed-1.8V";
> -   regulator-min-microvolt = <180>;
> -   regulator-max-microvolt = <180>;
> -   regulator-boot-on;
> -   regulator-always-on;
> -   };
> -
> -   reg_3p3v: regulator-3p3v {
> -   compatible = "regulator-fixed";
> -   regulator-name = "fixed-3.3V";
> -   regulator-min-microvolt = <330>;
> -   regulator-max-microvolt = <330>;
> -   regulator-boot-on;
> -   regulator-always-on;
> -   };
> -};
> -
> - {
> -   pinctrl-0 = <_pins>;
> -   pinctrl-names = "default";
> -   phy-handle = <>;
> -   tx-internal-delay-ps = <2000>;
> -   status = "okay";
> -
> -   phy0: ethernet-phy@0 {
> -   compatible = "ethernet-phy-id0022.1622",
> -"ethernet-phy-ieee802.3-c22";
> -   rxc-skew-ps = <1500>;
> -   reg = <0>;
> -   interrupt-parent = <>;
> -   interrupts = <5 IRQ_TYPE_LEVEL_LOW>;
> -   reset-gpios = < 10 GPIO_ACTIVE_LOW>;
> -   };
> -};
> -
> -_clk {
> -   clock-frequency = <1666>;
> -};
> -
> -_clk {
> -   clock-frequency = <32768>;
> -};
> -
> - {
> -   uart-has-rtscts;
> -   status = "okay";
> -};
> -
> - {
> -   pinctrl-0 = <_pins>;
> -   pinctrl-names = "default";
> -
> -   status = "okay";
> -   clock-frequency = <40>;
> -
> -   eeprom@50 {
> -   compatible = "rohm,br24g01", "atmel,24c01";
> -   label = "cpu-board";
> -   reg = <0x50>;
> -   pagesize = <8>;
> -   };
> -};
> -
> - {
> -   pinctrl-0 = <_pins>;
> -   pinctrl-1 = 

Re: [PATCH 1/2] ARM: dts: renesas: Switch to using upstream DT on Renesas R8A779H0 V4M

2024-05-26 Thread Sumit Garg
On Sun, 26 May 2024 at 23:31, Marek Vasut
 wrote:
>
> Enable OF_UPSTREAM to use upstream DT and add renesas/ prefix to the
> DEFAULT_DEVICE_TREE. And thereby directly build DTB from dts/upstream/src/
> including *-u-boot.dtsi files from arch/$(ARCH)/dts/ directory.
>
> This patch finalizes OF_UPSTREAM conversion of R8A779H0 V4M which DTs
> landed in Linux 6.9 .
>
> Signed-off-by: Marek Vasut 
> ---
> Cc: Adam Ford 
> Cc: Hai Pham 
> Cc: Paul Barker 
> Cc: Sumit Garg 
> Cc: Tom Rini 
> Cc: u-boot@lists.denx.de
> ---
>  configs/r8a779h0_grayhawk_defconfig | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>

Acked-by: Sumit Garg 

-Sumit

> diff --git a/configs/r8a779h0_grayhawk_defconfig 
> b/configs/r8a779h0_grayhawk_defconfig
> index 6bd872f063f..a986a09b8e0 100644
> --- a/configs/r8a779h0_grayhawk_defconfig
> +++ b/configs/r8a779h0_grayhawk_defconfig
> @@ -5,7 +5,7 @@ CONFIG_SYS_MALLOC_LEN=0x400
>  CONFIG_ENV_SIZE=0x2
>  CONFIG_ENV_OFFSET=0xFFFE
>  CONFIG_DM_GPIO=y
> -CONFIG_DEFAULT_DEVICE_TREE="r8a779h0-gray-hawk"
> +CONFIG_DEFAULT_DEVICE_TREE="renesas/r8a779h0-gray-hawk-single"
>  CONFIG_RCAR_GEN4=y
>  CONFIG_TARGET_GRAYHAWK=y
>  CONFIG_SYS_MONITOR_LEN=1048576
> @@ -39,7 +39,6 @@ CONFIG_CMD_EXT4_WRITE=y
>  CONFIG_CMD_FAT=y
>  CONFIG_CMD_FS_GENERIC=y
>  CONFIG_OF_CONTROL=y
> -# CONFIG_OF_UPSTREAM is not set
>  CONFIG_ENV_IS_IN_MMC=y
>  CONFIG_SYS_RELOC_GD_ENV_ADDR=y
>  CONFIG_SYS_MMC_ENV_PART=2
> --
> 2.43.0
>


[PATCH 2/2] ARM: dts: renesas: Drop R8A779H0 V4M DTs with OF_UPSTREAM counterparts

2024-05-26 Thread Marek Vasut
Remove the R8A779H0 V4M DTs, which are now replaced by their OF_UPSTREAM
counterparts. No functional change is expected.

This patch finalizes the OF_UPSTREAM conversion of R8A779H0 V4M, whose DTs
landed in Linux 6.9.

Signed-off-by: Marek Vasut 
---
Cc: Adam Ford 
Cc: Hai Pham 
Cc: Paul Barker 
Cc: Sumit Garg 
Cc: Tom Rini 
Cc: u-boot@lists.denx.de
---
 arch/arm/dts/Makefile |   3 -
 arch/arm/dts/r8a779h0-gray-hawk-cpu.dtsi  | 166 ---
 arch/arm/dts/r8a779h0-gray-hawk-csi-dsi.dtsi  |  15 -
 arch/arm/dts/r8a779h0-gray-hawk-ethernet.dtsi |  15 -
 arch/arm/dts/r8a779h0-gray-hawk-u-boot.dtsi   |  41 --
 arch/arm/dts/r8a779h0-gray-hawk.dts   |  25 -
 arch/arm/dts/r8a779h0-u-boot.dtsi |  27 -
 arch/arm/dts/r8a779h0.dtsi| 460 --
 8 files changed, 752 deletions(-)
 delete mode 100644 arch/arm/dts/r8a779h0-gray-hawk-cpu.dtsi
 delete mode 100644 arch/arm/dts/r8a779h0-gray-hawk-csi-dsi.dtsi
 delete mode 100644 arch/arm/dts/r8a779h0-gray-hawk-ethernet.dtsi
 delete mode 100644 arch/arm/dts/r8a779h0-gray-hawk-u-boot.dtsi
 delete mode 100644 arch/arm/dts/r8a779h0-gray-hawk.dts
 delete mode 100644 arch/arm/dts/r8a779h0-u-boot.dtsi
 delete mode 100644 arch/arm/dts/r8a779h0.dtsi

diff --git a/arch/arm/dts/Makefile b/arch/arm/dts/Makefile
index f7032f1e175..6d5ff1bc27b 100644
--- a/arch/arm/dts/Makefile
+++ b/arch/arm/dts/Makefile
@@ -958,9 +958,6 @@ dtb-$(CONFIG_ARCH_IMXRT) += imxrt1050-evk.dtb \
imxrt1020-evk.dtb \
imxrt1170-evk.dtb \
 
-dtb-$(CONFIG_RCAR_GEN4) += \
-   r8a779h0-gray-hawk.dtb
-
 dtb-$(CONFIG_TARGET_RZG2L) += \
r9a07g044l2-smarc.dts
 
diff --git a/arch/arm/dts/r8a779h0-gray-hawk-cpu.dtsi 
b/arch/arm/dts/r8a779h0-gray-hawk-cpu.dtsi
deleted file mode 100644
index c8a46219826..000
--- a/arch/arm/dts/r8a779h0-gray-hawk-cpu.dtsi
+++ /dev/null
@@ -1,166 +0,0 @@
-// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
-/*
- * Device Tree Source for the Gray Hawk CPU board
- *
- * Copyright (C) 2023 Renesas Electronics Corp.
- */
-
-#include 
-#include 
-
-#include "r8a779h0.dtsi"
-
-/ {
-   model = "Renesas Gray Hawk CPU board";
-   compatible = "renesas,grayhawk-cpu", "renesas,r8a779h0";
-
-   aliases {
-   ethernet0 = 
-   serial0 = 
-   };
-
-   chosen {
-   bootargs = "ignore_loglevel";
-   stdout-path = "serial0:921600n8";
-   };
-
-   memory@4800 {
-   device_type = "memory";
-   /* first 128MB is reserved for secure area. */
-   reg = <0x0 0x4800 0x0 0x7800>;
-   };
-
-   memory@48000 {
-   device_type = "memory";
-   reg = <0x4 0x8000 0x1 0x8000>;
-   };
-
-   reg_1p8v: regulator-1p8v {
-   compatible = "regulator-fixed";
-   regulator-name = "fixed-1.8V";
-   regulator-min-microvolt = <180>;
-   regulator-max-microvolt = <180>;
-   regulator-boot-on;
-   regulator-always-on;
-   };
-
-   reg_3p3v: regulator-3p3v {
-   compatible = "regulator-fixed";
-   regulator-name = "fixed-3.3V";
-   regulator-min-microvolt = <330>;
-   regulator-max-microvolt = <330>;
-   regulator-boot-on;
-   regulator-always-on;
-   };
-};
-
- {
-   pinctrl-0 = <_pins>;
-   pinctrl-names = "default";
-   phy-handle = <>;
-   tx-internal-delay-ps = <2000>;
-   status = "okay";
-
-   phy0: ethernet-phy@0 {
-   compatible = "ethernet-phy-id0022.1622",
-"ethernet-phy-ieee802.3-c22";
-   rxc-skew-ps = <1500>;
-   reg = <0>;
-   interrupt-parent = <>;
-   interrupts = <5 IRQ_TYPE_LEVEL_LOW>;
-   reset-gpios = < 10 GPIO_ACTIVE_LOW>;
-   };
-};
-
-_clk {
-   clock-frequency = <1666>;
-};
-
-_clk {
-   clock-frequency = <32768>;
-};
-
- {
-   uart-has-rtscts;
-   status = "okay";
-};
-
- {
-   pinctrl-0 = <_pins>;
-   pinctrl-names = "default";
-
-   status = "okay";
-   clock-frequency = <40>;
-
-   eeprom@50 {
-   compatible = "rohm,br24g01", "atmel,24c01";
-   label = "cpu-board";
-   reg = <0x50>;
-   pagesize = <8>;
-   };
-};
-
- {
-   pinctrl-0 = <_pins>;
-   pinctrl-1 = <_pins>;
-   pinctrl-names = "default", "state_uhs";
-
-   vmmc-supply = <_3p3v>;
-   vqmmc-supply = <_1p8v>;
-   mmc-hs200-1_8v;
-   mmc-hs400-1_8v;
-   bus-width = <8>;
-   no-sd;
-   no-sdio;
-   non-removable;
-   full-pwr-cycle-in-suspend;
-   status = "okay";
-};
-
- {
-   pinctrl-0 = <_clk_pins>;
-   pinctrl-names = "default";
-
-   

[PATCH 1/2] ARM: dts: renesas: Switch to using upstream DT on Renesas R8A779H0 V4M

2024-05-26 Thread Marek Vasut
Enable OF_UPSTREAM to use the upstream DT and add the renesas/ prefix to
DEFAULT_DEVICE_TREE, thereby building the DTB directly from dts/upstream/src/
while still including the *-u-boot.dtsi files from the arch/$(ARCH)/dts/
directory.

This patch finalizes the OF_UPSTREAM conversion of R8A779H0 V4M, whose DTs
landed in Linux 6.9.

Signed-off-by: Marek Vasut 
---
Cc: Adam Ford 
Cc: Hai Pham 
Cc: Paul Barker 
Cc: Sumit Garg 
Cc: Tom Rini 
Cc: u-boot@lists.denx.de
---
 configs/r8a779h0_grayhawk_defconfig | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/configs/r8a779h0_grayhawk_defconfig 
b/configs/r8a779h0_grayhawk_defconfig
index 6bd872f063f..a986a09b8e0 100644
--- a/configs/r8a779h0_grayhawk_defconfig
+++ b/configs/r8a779h0_grayhawk_defconfig
@@ -5,7 +5,7 @@ CONFIG_SYS_MALLOC_LEN=0x400
 CONFIG_ENV_SIZE=0x2
 CONFIG_ENV_OFFSET=0xFFFE
 CONFIG_DM_GPIO=y
-CONFIG_DEFAULT_DEVICE_TREE="r8a779h0-gray-hawk"
+CONFIG_DEFAULT_DEVICE_TREE="renesas/r8a779h0-gray-hawk-single"
 CONFIG_RCAR_GEN4=y
 CONFIG_TARGET_GRAYHAWK=y
 CONFIG_SYS_MONITOR_LEN=1048576
@@ -39,7 +39,6 @@ CONFIG_CMD_EXT4_WRITE=y
 CONFIG_CMD_FAT=y
 CONFIG_CMD_FS_GENERIC=y
 CONFIG_OF_CONTROL=y
-# CONFIG_OF_UPSTREAM is not set
 CONFIG_ENV_IS_IN_MMC=y
 CONFIG_SYS_RELOC_GD_ENV_ADDR=y
 CONFIG_SYS_MMC_ENV_PART=2
-- 
2.43.0



Re: [PATCH v4 2/5] lib/lz4: update LZ4 decompressor module

2024-05-26 Thread Jianan Huang
Hi Jonathan,

Could you please try the following patch? It replaces all memcpy() calls
in lz4 with __builtin_memcpy() (through a new LZ4_memcpy() macro).

diff --git a/lib/lz4.c b/lib/lz4.c
index d365dc727c..2afe31c1c3 100644
--- a/lib/lz4.c
+++ b/lib/lz4.c
@@ -34,6 +34,8 @@
 #include 
 #include 

+#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)
+
 #define FORCE_INLINE inline __attribute__((always_inline))

 static FORCE_INLINE u16 LZ4_readLE16(const void *src)
@@ -215,7 +217,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
&& likely((endOnInput ? ip < shortiend : 1) &
  (op <= shortoend))) {
  /* Copy the literals */
- memcpy(op, ip, endOnInput ? 16 : 8);
+ LZ4_memcpy(op, ip, endOnInput ? 16 : 8);
  op += length; ip += length;

  /*
@@ -234,9 +236,9 @@ static FORCE_INLINE int LZ4_decompress_generic(
 (offset >= 8) &&
 (dict == withPrefix64k || match >= lowPrefix)) {
  /* Copy the match. */
- memcpy(op + 0, match + 0, 8);
- memcpy(op + 8, match + 8, 8);
- memcpy(op + 16, match + 16, 2);
+ LZ4_memcpy(op + 0, match + 0, 8);
+ LZ4_memcpy(op + 8, match + 8, 8);
+ LZ4_memcpy(op + 16, match + 16, 2);
  op += length + MINMATCH;
  /* Both stages worked, load the next token. */
  continue;
@@ -416,7 +418,7 @@ _copy_match:
  size_t const copySize = (size_t)(lowPrefix - match);
  size_t const restSize = length - copySize;

- memcpy(op, dictEnd - copySize, copySize);
+ LZ4_memcpy(op, dictEnd - copySize, copySize);
  op += copySize;
  if (restSize > (size_t)(op - lowPrefix)) {
  /* overlap copy */
@@ -426,7 +428,7 @@ _copy_match:
  while (op < endOfMatch)
  *op++ = *copyFrom++;
  } else {
- memcpy(op, lowPrefix, restSize);
+ LZ4_memcpy(op, lowPrefix, restSize);
  op += restSize;
  }
  }
@@ -452,7 +454,7 @@ _copy_match:
  while (op < copyEnd)
  *op++ = *match++;
  } else {
- memcpy(op, match, mlen);
+ LZ4_memcpy(op, match, mlen);
  }
  op = copyEnd;
  if (op == oend)
@@ -466,7 +468,7 @@ _copy_match:
  op[2] = match[2];
  op[3] = match[3];
  match += inc32table[offset];
- memcpy(op + 4, match, 4);
+ LZ4_memcpy(op + 4, match, 4);
  match -= dec64table[offset];
  } else {
  LZ4_copy8(op, match);
diff --git a/lib/lz4_wrapper.c b/lib/lz4_wrapper.c
index 4d48e7b0e8..e09c8d7057 100644
--- a/lib/lz4_wrapper.c
+++ b/lib/lz4_wrapper.c
@@ -80,7 +80,7 @@ int ulz4fn(const void *src, size_t srcn, void *dst,
size_t *dstn)

  if (block_header & LZ4F_BLOCKUNCOMPRESSED_FLAG) {
  size_t size = min((ptrdiff_t)block_size, (ptrdiff_t)(end - out));
- memcpy(out, in, size);
+ LZ4_memcpy(out, in, size);
  out += size;
  if (size < block_size) {
  ret = -ENOBUFS; /* output overrun */


Thanks,

Jianan
On 2024/5/26 16:06, Jonathan Liu wrote:

Hi Gao,

On Sat, 25 May 2024 at 02:52, Gao Xiang 
 wrote:

Hi,

On 2024/5/24 22:26, Jonathan Liu wrote:

Hi Jianan,

On Sat, 26 Feb 2022 at 18:05, Huang Jianan 
 wrote:

Update the LZ4 compression module based on LZ4 v1.8.3 in order to
use the newest LZ4_decompress_safe_partial() which can now decode
exactly the nb of bytes requested.

Signed-off-by: Huang Jianan  

I noticed that LZ4 decompression is slower after this commit.
With this commit, the ulz4fn function call takes 1.209670 seconds.
After reverting this commit, the ulz4fn function call takes 0.587032 seconds.

I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4
using -9 option for maximum compression) on RK3399.

Any ideas why it is slower with this commit and how the performance
regression can be fixed?

From just a quick glance, I think the issue may be due to memcpy/memmove,
since that seems to be the main difference between these two codebases
(I'm not sure which LZ4 version the old codebase was based on): the
new version mainly relies on memcpy/memmove instead of its own
versions.


Would you mind checking how the assembly for memcpy/memset is generated
on your platform?

Here is the assembly (-mcpu=cortex-a72.cortex-a53 -march=armv8-a+crc+crypto):
0028220c <memset>:
#if !CONFIG_IS_ENABLED(TINY_MEMSET)
unsigned long cl = 0;
int i;

/* do it one word at a time (32 bits or 64 bits) while possible */
if ( ((ulong)s & (sizeof(*sl) - 1)) == 0) {
  28220c:   f2400803andsx3, x0, #0x7
  282210:   540002c1b.ne282268   // b.any
for (i = 0; i < sizeof(*sl); i++) {
cl <<= 8;
cl |= c & 0xff;
  282214:   92401c26and x6, x1, #0xff
unsigned long cl = 0;
  282218:   d284mov x4, #0x0// #0
  28221c:   52800105mov w5, #0x8// #8
cl |= c & 0xff;
  282220:   aa0420c4orr x4, x6, x4, lsl #8
for (i = 0; i < sizeof(*sl); i++) {
  282224:   710004a5subsw5, w5, #0x1
  282228:   54c1b.ne282220   // b.any
}
while (count >= sizeof(*sl)) {
  28222c:   cb030045sub x5, x2, x3
  282230:   f1001cbf   

Re: [PATCH v4 2/5] lib/lz4: update LZ4 decompressor module

2024-05-26 Thread Jonathan Liu
Hi Gao,

On Sat, 25 May 2024 at 02:52, Gao Xiang  wrote:
>
> Hi,
>
> On 2024/5/24 22:26, Jonathan Liu wrote:
> > Hi Jianan,
> >
> > On Sat, 26 Feb 2022 at 18:05, Huang Jianan  wrote:
> >>
> >> Update the LZ4 compression module based on LZ4 v1.8.3 in order to
> >> use the newest LZ4_decompress_safe_partial() which can now decode
> >> exactly the nb of bytes requested.
> >>
> >> Signed-off-by: Huang Jianan 
> >
> > I noticed after this commit LZ4 decompression is slower.
> > ulz4fn function call takes 1.209670 seconds with this commit.
> > After reverting this commit, the ulz4fn function call takes 0.587032 
> > seconds.
> >
> > I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4
> > using -9 option for maximum compression) on RK3399.
> >
> > Any ideas why it is slower with this commit and how the performance
> > regression can be fixed?
>
> Just the quick glance, I think the issue may be due to memcpy/memmove
> since it seems the main difference between these two codebases
> (I'm not sure which LZ4 version the old codebase was based on) and
> the new version mainly relies on memcpy/memmove instead of its own
> versions.
>

> Would you mind to check the assembly how memcpy/memset is generated
> on your platform?

Here is the assembly (-mcpu=cortex-a72.cortex-a53 -march=armv8-a+crc+crypto):
0028220c <memset>:
#if !CONFIG_IS_ENABLED(TINY_MEMSET)
unsigned long cl = 0;
int i;

/* do it one word at a time (32 bits or 64 bits) while possible */
if ( ((ulong)s & (sizeof(*sl) - 1)) == 0) {
  28220c:   f2400803andsx3, x0, #0x7
  282210:   540002c1b.ne282268   // b.any
for (i = 0; i < sizeof(*sl); i++) {
cl <<= 8;
cl |= c & 0xff;
  282214:   92401c26and x6, x1, #0xff
unsigned long cl = 0;
  282218:   d284mov x4, #0x0// #0
  28221c:   52800105mov w5, #0x8// #8
cl |= c & 0xff;
  282220:   aa0420c4orr x4, x6, x4, lsl #8
for (i = 0; i < sizeof(*sl); i++) {
  282224:   710004a5subsw5, w5, #0x1
  282228:   54c1b.ne282220   // b.any
}
while (count >= sizeof(*sl)) {
  28222c:   cb030045sub x5, x2, x3
  282230:   f1001cbfcmp x5, #0x7
  282234:   54000148b.hi28225c   // b.pmore
  282238:   d343fc43lsr x3, x2, #3
  28223c:   928000e4mov x4, #0xfff8 // #-8
  282240:   9b047c63mul x3, x3, x4
  282244:   8b030042add x2, x2, x3
  282248:   cb030003sub x3, x0, x3
unsigned long *sl = (unsigned long *) s;
  28224c:   d284mov x4, #0x0// #0
count -= sizeof(*sl);
}
}
#endif  /* fill 8 bits at a time */
s8 = (char *)sl;
while (count--)
  282250:   eb04005fcmp x2, x4
  282254:   54e1b.ne282270   // b.any
*s8++ = c;

return s;
}
  282258:   d65f03c0ret
*sl++ = cl;
  28225c:   f8236804str x4, [x0, x3]
count -= sizeof(*sl);
  282260:   91002063add x3, x3, #0x8
  282264:   17f2b   28222c 
unsigned long *sl = (unsigned long *) s;
  282268:   aa0003e3mov x3, x0
  28226c:   17f8b   28224c 
*s8++ = c;
  282270:   38246861strbw1, [x3, x4]
  282274:   91000484add x4, x4, #0x1
  282278:   17f6b   282250 

0028227c <memcpy>:
__used void * memcpy(void *dest, const void *src, size_t count)
{
unsigned long *dl = (unsigned long *)dest, *sl = (unsigned long *)src;
char *d8, *s8;

if (src == dest)
  28227c:   eb01001fcmp x0, x1
  282280:   54000100b.eq2822a0   // b.none
return dest;

/* while all data is aligned (common case), copy a word at a time */
if ( (((ulong)dest | (ulong)src) & (sizeof(*dl) - 1)) == 0) {
  282284:   aa010003orr x3, x0, x1
  282288:   f2400863andsx3, x3, #0x7
  28228c:   54000120b.eq2822b0   // b.none
  282290:   aa0003e4mov x4, x0
  282294:   d283mov x3, #0x0// #0
}
}
/* copy the reset one byte at a time */
d8 = (char *)dl;
s8 = (char *)sl;
while (count--)
  282298:   eb03005fcmp x2, x3
  28229c:   540001e1b.ne2822d8   // b.any
*d8++ = *s8++;

return dest;
}
  2822a0:   d65f03c0ret