Re: [RFC PATCH 1/2] powerpc: Remove duplicate cacheable_memcpy/memzero functions

2011-11-15 Thread Benjamin Herrenschmidt
On Mon, 2011-11-14 at 21:32 -0500, Kyle Moffett wrote:
> These functions are only used from one place each.  If the cacheable_*
> versions really are more efficient, then those changes should be
> migrated into the common code instead.
> 
> NOTE: The old routines are just flat buggy on kernels that support
>   hardware with different cacheline sizes.
> 
> Signed-off-by: Kyle Moffett 
> ---

Right, considering where those are used, I think we can safely remove
them. Thanks.

Ben.

[RFC PATCH 1/2] powerpc: Remove duplicate cacheable_memcpy/memzero functions

2011-11-14 Thread Kyle Moffett
These functions are only used from one place each.  If the cacheable_*
versions really are more efficient, then those changes should be
migrated into the common code instead.

NOTE: The old routines are just flat buggy on kernels that support
  hardware with different cacheline sizes.

Signed-off-by: Kyle Moffett 
---
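For illustration only (not part of this patch): a minimal user-space C sketch of the cacheline-size hazard the NOTE above refers to. The names model_cacheable_memzero and hw_line are made up for the example, and the model ignores dcbz's alignment to the hardware line; it only shows that stepping by the build-time L1_CACHE_BYTES diverges from what a real dcbz clears once the hardware line size differs from the configured one.

/*
 * Minimal sketch (not kernel code): the removed asm steps through the
 * buffer in L1_CACHE_BYTES-sized increments and issues one dcbz per
 * step, but dcbz always clears one *hardware* cache line.
 */
#include <stdio.h>

#define L1_CACHE_BYTES 32	/* line size the kernel was built for */

static void model_cacheable_memzero(unsigned long dst, unsigned long len,
				    unsigned long hw_line)
{
	unsigned long p;

	for (p = dst; p + L1_CACHE_BYTES <= dst + len; p += L1_CACHE_BYTES) {
		if (hw_line > L1_CACHE_BYTES)
			printf("dcbz at %#lx clears %lu bytes past this step\n",
			       p, hw_line - L1_CACHE_BYTES);
		else if (hw_line < L1_CACHE_BYTES)
			printf("dcbz at %#lx leaves %lu bytes un-zeroed\n",
			       p, L1_CACHE_BYTES - hw_line);
	}
}

int main(void)
{
	model_cacheable_memzero(0x1000, 128, 128);	/* larger hw line */
	model_cacheable_memzero(0x1000, 128, 16);	/* smaller hw line */
	return 0;
}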
 arch/powerpc/include/asm/system.h|2 -
 arch/powerpc/kernel/ppc_ksyms.c  |2 -
 arch/powerpc/lib/copy_32.S   |  127 --
 arch/powerpc/mm/ppc_mmu_32.c |2 +-
 drivers/net/ethernet/ibm/emac/core.c |   12 +---
 5 files changed, 3 insertions(+), 142 deletions(-)

diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
index e30a13d..25389d1 100644
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -189,8 +189,6 @@ static inline void flush_spe_to_thread(struct task_struct *t)
 #endif
 
 extern int call_rtas(const char *, int, int, unsigned long *, ...);
-extern void cacheable_memzero(void *p, unsigned int nb);
-extern void *cacheable_memcpy(void *, const void *, unsigned int);
 extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
 extern void bad_page_fault(struct pt_regs *, unsigned long, int);
 extern int die(const char *, struct pt_regs *, long);
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index d3114a7..acba8ce 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -159,8 +159,6 @@ EXPORT_SYMBOL(screen_info);
 #ifdef CONFIG_PPC32
 EXPORT_SYMBOL(timer_interrupt);
 EXPORT_SYMBOL(tb_ticks_per_jiffy);
-EXPORT_SYMBOL(cacheable_memcpy);
-EXPORT_SYMBOL(cacheable_memzero);
 #endif
 
 #ifdef CONFIG_PPC32
diff --git a/arch/powerpc/lib/copy_32.S b/arch/powerpc/lib/copy_32.S
index 55f19f9..6813f80 100644
--- a/arch/powerpc/lib/copy_32.S
+++ b/arch/powerpc/lib/copy_32.S
@@ -69,54 +69,6 @@ CACHELINE_BYTES = L1_CACHE_BYTES
 LG_CACHELINE_BYTES = L1_CACHE_SHIFT
 CACHELINE_MASK = (L1_CACHE_BYTES-1)
 
-/*
- * Use dcbz on the complete cache lines in the destination
- * to set them to zero.  This requires that the destination
- * area is cacheable.  -- paulus
- */
-_GLOBAL(cacheable_memzero)
-	mr	r5,r4
-	li	r4,0
-	addi	r6,r3,-4
-	cmplwi	0,r5,4
-	blt	7f
-	stwu	r4,4(r6)
-	beqlr
-	andi.	r0,r6,3
-	add	r5,r0,r5
-	subf	r6,r0,r6
-	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
-	add	r8,r7,r5
-	srwi	r9,r8,LG_CACHELINE_BYTES
-	addic.	r9,r9,-1	/* total number of complete cachelines */
-	ble	2f
-	xori	r0,r7,CACHELINE_MASK & ~3
-	srwi.	r0,r0,2
-	beq	3f
-	mtctr	r0
-4:	stwu	r4,4(r6)
-	bdnz	4b
-3:	mtctr	r9
-	li	r7,4
-10:	dcbz	r7,r6
-	addi	r6,r6,CACHELINE_BYTES
-	bdnz	10b
-	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
-	addi	r5,r5,4
-2:	srwi	r0,r5,2
-	mtctr	r0
-	bdz	6f
-1:	stwu	r4,4(r6)
-	bdnz	1b
-6:	andi.	r5,r5,3
-7:	cmpwi	0,r5,0
-	beqlr
-	mtctr	r5
-	addi	r6,r6,3
-8:	stbu	r4,1(r6)
-	bdnz	8b
-	blr
-
 _GLOBAL(memset)
rlwimi  r4,r4,8,16,23
rlwimi  r4,r4,16,0,15
@@ -142,85 +94,6 @@ _GLOBAL(memset)
	bdnz	8b
blr
 
-/*
- * This version uses dcbz on the complete cache lines in the
- * destination area to reduce memory traffic.  This requires that
- * the destination area is cacheable.
- * We only use this version if the source and dest don't overlap.
- * -- paulus.
- */
-_GLOBAL(cacheable_memcpy)
-	add	r7,r3,r5		/* test if the src & dst overlap */
-	add	r8,r4,r5
-	cmplw	0,r4,r7
-	cmplw	1,r3,r8
-	crand	0,0,4			/* cr0.lt &= cr1.lt */
-	blt	memcpy			/* if regions overlap */
-
-	addi	r4,r4,-4
-	addi	r6,r3,-4
-	neg	r0,r3
-	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
-	beq	58f
-
-	cmplw	0,r5,r0			/* is this more than total to do? */
-	blt	63f			/* if not much to do */
-	andi.	r8,r0,3			/* get it word-aligned first */
-	subf	r5,r0,r5
-	mtctr	r8
-	beq+	61f
-70:	lbz	r9,4(r4)		/* do some bytes */
-	stb	r9,4(r6)
-	addi	r4,r4,1
-	addi	r6,r6,1
-	bdnz	70b
-61:	srwi.	r0,r0,2
-	mtctr	r0
-	beq	58f
-72:	lwzu	r9,4(r4)		/* do some words */
-	stwu	r9,4(r6)
-	bdnz	72b
-
-58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
-	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
-	li	r11,4
-	mtctr	r0
-	beq	63f
-53:
-	dcbz	r11,r6
-	COPY_16_BYTES
-#if L1_CACHE_BYTES >= 32
-
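For reference (not part of the patch): the cmplw/crand/blt sequence at the top of the removed cacheable_memcpy is the standard two-sided range check, falling back to plain memcpy when the regions overlap. A minimal C equivalent, with an illustrative name (regions_overlap), is sketched below.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* True iff [dst, dst+n) and [src, src+n) share at least one byte. */
static bool regions_overlap(uintptr_t dst, uintptr_t src, size_t n)
{
	return src < dst + n && dst < src + n;
}

int main(void)
{
	printf("%d\n", regions_overlap(0x1000, 0x1010, 0x20));	/* 1: overlap */
	printf("%d\n", regions_overlap(0x1000, 0x1040, 0x20));	/* 0: disjoint */
	return 0;
}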