Re: [PATCH v2 07/76] ARC: [optim] uaccess __{get,put}_user() optimised

2013-01-18 Thread Arnd Bergmann
On Friday 18 January 2013, Vineet Gupta wrote:
> Override the asm-generic implementations. We gain on two fronts:
> 
> * checks for alignment are no longer needed, as we are only doing "unit"
>   sized copies.
> 
>   (A careful observer could argue that while the kernel buffers are aligned,
>    the user buffer in theory might not be - however, in that case user
>    space is already broken when it tries to deref a hword/word straddling
>    a word boundary - so we are not making it any worse).
> 
> * __copy_{to,from}_user() returns the number of bytes that couldn't be
>   copied, whereas get_user() returns 0 for success or -EFAULT (not a size).
>   Thus the code to compute the leftover bytes can be avoided as well.

Interesting. I had thought that the compiler would be able to simplify
the calculation of the return value inline, but of course it cannot
because it has to be done inside the asm fixup.
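
The asm-generic fallback being replaced is roughly of this shape
(paraphrased as a sketch, not the exact asm-generic source):

        static inline int __get_user_fn(size_t size, const void __user *ptr,
                                        void *x)
        {
                size = __copy_from_user(x, ptr, size);  /* bytes NOT copied */
                return size ? -EFAULT : 0;              /* done at every call site */
        }

The leftover byte count only becomes known inside the __copy_from_user()
fixup path, so the compiler has no choice but to emit that conversion;
the ARC macros avoid it by letting their own fixup write -EFAULT directly.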

> The savings were significant: ~17k of code.
> 
> bloat-o-meter vmlinux_uaccess_pre vmlinux_uaccess_post
> add/remove: 0/4 grow/shrink: 8/118 up/down: 1262/-18758 (-17496)
> Signed-off-by: Vineet Gupta 

Acked-by: Arnd Bergmann 


[PATCH v2 07/76] ARC: [optim] uaccess __{get,put}_user() optimised

2013-01-18 Thread Vineet Gupta
Override the asm-generic implementations. We gain on two fronts:

* checks for alignment are no longer needed, as we are only doing "unit"
  sized copies.

  (A careful observer could argue that while the kernel buffers are aligned,
   the user buffer in theory might not be - however, in that case user
   space is already broken when it tries to deref a hword/word straddling
   a word boundary - so we are not making it any worse).

* __copy_{to,from}_user() returns the number of bytes that couldn't be
  copied, whereas get_user() returns 0 for success or -EFAULT (not a size).
  Thus the code to compute the leftover bytes can be avoided as well (see
  the sketch below).
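
To illustrate what a caller sees (a hypothetical example; read_user_word()
is a made-up name, not part of this patch): the whole access boils down to
one load plus an __ex_table entry, with the 0/-EFAULT result coming
straight out of the fixup code.

        /* hypothetical caller: fetch one word from user space */
        static int read_user_word(const int __user *uptr, int *out)
        {
                /* ends up in __get_user_fn(4, ...): a single "ld" plus a
                 * fixup entry, no leftover-bytes arithmetic */
                return get_user(*out, uptr);    /* 0 on success, -EFAULT on fault */
        }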

The savings were significant: ~17k of code.

bloat-o-meter vmlinux_uaccess_pre vmlinux_uaccess_post
add/remove: 0/4 grow/shrink: 8/118 up/down: 1262/-18758 (-17496)
Signed-off-by: Vineet Gupta 
---
 arch/arc/include/asm/uaccess.h |  105 
 1 files changed, 105 insertions(+), 0 deletions(-)

diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h
index f13bca4..3242082 100644
--- a/arch/arc/include/asm/uaccess.h
+++ b/arch/arc/include/asm/uaccess.h
@@ -57,6 +57,111 @@
 #define __access_ok(addr, sz)  (unlikely(__kernel_ok) || \
                                 likely(__user_ok((addr), (sz))))
 
+/*** Single byte/hword/word copies **/
+
+#define __get_user_fn(sz, u, k)\
+({ \
+   long __ret = 0; /* success by default */\
+   switch (sz) {   \
+   case 1: __arc_get_user_one(*(k), u, "ldb", __ret); break;   \
+   case 2: __arc_get_user_one(*(k), u, "ldw", __ret); break;   \
+   case 4: __arc_get_user_one(*(k), u, "ld", __ret);  break;   \
+   case 8: __arc_get_user_one_64(*(k), u, __ret); break;   \
+   }   \
+   __ret;  \
+})
+
+/*
+ * Returns 0 on success, -EFAULT if not.
+ * @ret already contains 0 - given that errors will be less likely
+ * (hence +r asm constraint below).
+ * In case of error, fixup code will make it -EFAULT
+ */
+#define __arc_get_user_one(dst, src, op, ret)  \
+   __asm__ __volatile__(   \
+   "1: "op"%1,[%2]\n"  \
+   "2: ;nop\n" \
+   "   .section .fixup, \"ax\"\n"  \
+   "   .align 4\n" \
+   "3: mov %0, %3\n"   \
+   "   j   2b\n"   \
+   "   .previous\n"\
+   "   .section __ex_table, \"a\"\n"   \
+   "   .align 4\n" \
+   "   .word 1b,3b\n"  \
+   "   .previous\n"\
+   \
+   : "+r" (ret), "=r" (dst)\
+   : "r" (src), "ir" (-EFAULT))
+
+#define __arc_get_user_one_64(dst, src, ret)   \
+   __asm__ __volatile__(   \
+   "1: ld   %1,[%2]\n" \
+   "4: ld  %R1,[%2, 4]\n"  \
+   "2: ;nop\n" \
+   "   .section .fixup, \"ax\"\n"  \
+   "   .align 4\n" \
+   "3: mov %0, %3\n"   \
+   "   j   2b\n"   \
+   "   .previous\n"\
+   "   .section __ex_table, \"a\"\n"   \
+   "   .align 4\n" \
+   "   .word 1b,3b\n"  \
+   "   .word 4b,3b\n"  \
+   "   .previous\n"\
+   \
+   : "+r" (ret), "=r" (dst)\
+   : "r" (src), "ir" (-EFAULT))
+
+#define __put_user_fn(sz, u, k)\
+({ \
+   long __ret = 0; /* success by default */\
+   switch (sz) {   \
+   case 1: __arc_put_user_one(*(k), u, "stb", __ret); break;   \
+   case 2: __arc_put_user_one(*(k), u, "stw", __ret); break;   \
+   case 4: __arc_put_user_one(*(k), u, "st", __ret);  break;   \
+   case 8: __arc_put_user_one_64(*(k), u, __ret); break;   \
+   }   \
+   __ret;  \
+})
+
+#define __arc_put_user_one(src, dst, op, ret)  \
+   __asm__ __volatile__(   \
+   "1: "op"%1,[%2]\n"  \
+   "2: ;nop\n" \
+   "   .section .fixup, \"ax\"\n"  \
+   "   .align 4\n"   
