Re: [PATCH][AArch64] Improve bit-test-branch pattern to avoid unnecessary register clobber

2015-01-27 Thread Jakub Jelinek
On Tue, Jan 27, 2015 at 02:31:14PM +, Jiong Wang wrote:
 testcase changed to an execution version, and moved to gcc.dg. the compile
 time only takes several seconds. (previously I was using a cc1 built at -O0,
 which took at most 24s)
 
 ok to install?

Ok for the testcase.
The config/aarch64/ bits I'll defer to aarch64 maintainers.

 2015-01-19  Ramana Radhakrishnan  <ramana.radhakrish...@arm.com>
 Jiong Wang  <jiong.w...@arm.com>
 
   gcc/
 * config/aarch64/aarch64.md (tb<optab><mode>1): Clobber CC reg instead of
 scratch reg.
 (cb<optab><mode>1): Likewise.
 * config/aarch64/iterators.md (bcond): New define_code_attr.
 
   gcc/testsuite/
 * gcc.dg/long_branch.c: New testcase.

Jakub


Re: [PATCH][AArch64] Improve bit-test-branch pattern to avoid unnecessary register clobber

2015-01-27 Thread Marcus Shawcroft
On 27 January 2015 at 14:31, Jiong Wang <jiong.w...@arm.com> wrote:

 2015-01-19  Ramana Radhakrishnan  <ramana.radhakrish...@arm.com>
 Jiong Wang  <jiong.w...@arm.com>

   gcc/
 * config/aarch64/aarch64.md (tb<optab><mode>1): Clobber CC reg instead
 of scratch reg.
 (cb<optab><mode>1): Likewise.
 * config/aarch64/iterators.md (bcond): New define_code_attr.

OK /Marcus


   gcc/testsuite/
 * gcc.dg/long_branch.c: New testcase.


Re: [PATCH][AArch64] Improve bit-test-branch pattern to avoid unnecessary register clobber

2015-01-27 Thread Jiong Wang

On 19/01/15 10:58, Jakub Jelinek wrote:


On Mon, Jan 19, 2015 at 10:52:14AM +, Ramana Radhakrishnan wrote:

What is aarch64 specific on the testcase?


The number of if-then-else's required to get the compiler to generate
cmp branch sequences rather than the tbnz instruction.

That doesn't mean the same testcase couldn't be tested on other targets and
perhaps find bugs in there.
That said, if the testcase is too expensive to compile (several seconds is
ok, minutes is not), then perhaps it shouldn't be included at all, or should
be guarded with run_expensive_tests target.

Jakub



testcase changed to an execution version, and moved to gcc.dg. the compile
time only takes several seconds. (previously I was using a cc1 built at -O0,
which took at most 24s)

ok to install?

Thanks.

2015-01-19  Ramana Radhakrishnan  <ramana.radhakrish...@arm.com>
Jiong Wang  <jiong.w...@arm.com>

  gcc/
* config/aarch64/aarch64.md (tb<optab><mode>1): Clobber CC reg instead of
scratch reg.
(cb<optab><mode>1): Likewise.
* config/aarch64/iterators.md (bcond): New define_code_attr.

  gcc/testsuite/
* gcc.dg/long_branch.c: New testcase.
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 597ff8c..1e00396 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -466,13 +466,17 @@
 		   (const_int 0))
 	     (label_ref (match_operand 2 "" ""))
 	     (pc)))
-   (clobber (match_scratch:DI 3 "=r"))]
+   (clobber (reg:CC CC_REGNUM))]
   ""
-  "*
-  if (get_attr_length (insn) == 8)
-    return \"ubfx\\t%<w>3, %<w>0, %1, #1\;<cbz>\\t%<w>3, %l2\";
-  return \"<tbz>\\t%<w>0, %1, %l2\";
-  "
+  {
+    if (get_attr_length (insn) == 8)
+      {
+	operands[1] = GEN_INT (HOST_WIDE_INT_1U << UINTVAL (operands[1]));
+	return "tst\t%<w>0, %1\;<bcond>\t%l2";
+      }
+    else
+      return "<tbz>\t%<w>0, %1, %l2";
+  }
   [(set_attr "type" "branch")
    (set (attr "length")
 	(if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -32768))
@@ -486,13 +490,21 @@
  (const_int 0))
 			   (label_ref (match_operand 1 "" ""))
 			   (pc)))
-   (clobber (match_scratch:DI 2 "=r"))]
+   (clobber (reg:CC CC_REGNUM))]
   ""
-  "*
-  if (get_attr_length (insn) == 8)
-    return \"ubfx\\t%<w>2, %<w>0, <sizem1>, #1\;<cbz>\\t%<w>2, %l1\";
-  return \"<tbz>\\t%<w>0, <sizem1>, %l1\";
-  "
+  {
+    if (get_attr_length (insn) == 8)
+      {
+	char buf[64];
+	uint64_t val = ((uint64_t ) 1)
+			<< (GET_MODE_SIZE (<MODE>mode) * BITS_PER_UNIT - 1);
+	sprintf (buf, "tst\t%%<w>0, %"PRId64, val);
+	output_asm_insn (buf, operands);
+	return "<bcond>\t%l1";
+      }
+    else
+      return "<tbz>\t%<w>0, <sizem1>, %l1";
+  }
   [(set_attr "type" "branch")
    (set (attr "length")
 	(if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -32768))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7dd3917..bd144f9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -823,6 +823,9 @@
 		  (smax "s") (umax "u")
 		  (smin "s") (umin "u")])
 
+;; Emit conditional branch instructions.
+(define_code_attr bcond [(eq "beq") (ne "bne") (lt "bne") (ge "beq")])
+
 ;; Emit cbz/cbnz depending on comparison type.
 (define_code_attr cbz [(eq "cbz") (ne "cbnz") (lt "cbnz") (ge "cbz")])
 
diff --git a/gcc/testsuite/gcc.dg/long_branch.c b/gcc/testsuite/gcc.dg/long_branch.c
new file mode 100644
index 000..f388a80
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/long_branch.c
@@ -0,0 +1,198 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-reorder-blocks" } */
+
+void abort ();
+
+__attribute__((noinline, noclone)) int
+restore (int a, int b)
+{
+  return a * b;
+}
+
+__attribute__((noinline, noclone)) void
+do_nothing (int *input)
+{
+  *input = restore (*input, 1);
+  return;
+}
+
+#define CASE_ENTRY(n) \
+  case n: \
+sum = sum / (n + 1); \
+sum = restore (sum, n + 1); \
+if (sum == (n + addend)) \
+  break;\
+sum = sum / (n + 2); \
+sum = restore (sum, n + 2); \
+sum = sum / (n + 3); \
+sum = restore (sum, n + 3); \
+sum = sum / (n + 4); \
+sum = restore (sum, n + 4); \
+sum = sum / (n + 5); \
+sum = restore (sum, n + 5); \
+sum = sum / (n + 6); \
+sum = restore (sum, n + 6); \
+sum = sum / (n + 7); \
+sum = restore (sum, n + 7); \
+sum = sum / (n + 8); \
+sum = restore (sum, n + 8); \
+sum = sum / (n + 9); \
+sum = restore (sum, n + 9); \
+sum = sum / (n + 10); \
+sum = restore (sum, n + 10); \
+sum = sum / (n + 11); \
+sum = restore (sum, n + 11); \
+sum = sum / (n + 12); \
+sum = restore (sum, n + 12); \
+sum = sum / (n + 13); \
+sum = restore (sum, n + 13); \
+sum = sum / (n + 14); \
+sum = restore (sum, n + 14); \
+sum = sum / (n + 15); \
+sum = restore (sum, n + 15); \
+sum = sum / (n + 16); \
+sum = restore (sum, n + 16); \
+sum = sum / (n + 17); \
+sum = restore (sum, n + 17); \
+sum = sum / (n + 18); \
+sum = restore (sum, n + 18); \
+sum = sum / (n + 19); \
+sum = restore (sum, n + 19); 

Re: [PATCH][AArch64] Improve bit-test-branch pattern to avoid unnecessary register clobber

2015-01-19 Thread Jakub Jelinek
On Mon, Jan 19, 2015 at 10:52:14AM +, Ramana Radhakrishnan wrote:
  What is aarch64 specific on the testcase?
 
 
 The number of if-then-else's required to get the compiler to generate
 cmp branch sequences rather than the tbnz instruction.

That doesn't mean the same testcase couldn't be tested on other targets and
perhaps find bugs in there.
That said, if the testcase is too expensive to compile (several seconds is
ok, minutes is not), then perhaps it shouldn't be included at all, or should
be guarded with run_expensive_tests target.

Jakub


Re: [PATCH][AArch64] Improve bit-test-branch pattern to avoid unnecessary register clobber

2015-01-19 Thread Jakub Jelinek
On Mon, Jan 19, 2015 at 10:28:47AM +, Jiong Wang wrote:
 On 14/01/15 22:59, Richard Henderson wrote:
 On 12/15/2014 07:36 AM, Jiong Wang wrote:
 +	char buf[64];
 +	uint64_t val = ((uint64_t) 1) << UINTVAL (operands[1]);
 +	sprintf (buf, "tst\t%%<w>0, %"PRId64, val);
 +	output_asm_insn (buf, operands);
 +	return "<bcond>\t%l2";
 Better to simply modify the operand, as in
 
operands[1] = GEN_INT (HOST_WIDE_INT_1U << UINTVAL (operands[1]));
return "tst\t%<w>0, %1\;<bcond>\t%l2";
 
 thanks, fixed.
 
 ok for trunk ?
 
 gcc/
 
 2015-01-19 Ramana radhakrishnanramana.radhakrish...@arm.com
Jiong wangjiong.w...@arm.com

That is not the right name/email format for ChangeLog entries.

 --- /dev/null
 +++ b/gcc/testsuite/gcc.target/aarch64/long_range_bit_test_branch_1.c

What is aarch64 specific on the testcase?  Best would be to turn it into
an executable testcase (add __attribute__((noinline, noclone)) to dec and
define it somehow, perhaps with an asm volatile with a memory clobber in it)
to check that it also works fine at runtime, but even if you don't, putting
it into gcc.c-torture/compile/ might be preferable to putting it into the
aarch64-specific dir.
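
For illustration, a minimal sketch of such an opaque dec definition along the
lines suggested above (assuming the testcase keeps a helper named dec; this is
not code from the patch or from this thread):

/* noinline/noclone keeps the optimizers from seeing through the call, and
   the asm with a "memory" clobber keeps the surrounding arithmetic from
   being folded away.  */
__attribute__((noinline, noclone)) int
dec (int x)
{
  asm volatile ("" : : : "memory");
  return x - 1;
}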

Jakub


Re: [PATCH][AArch64] Improve bit-test-branch pattern to avoid unnecessary register clobber

2015-01-19 Thread Jiong Wang

On 14/01/15 22:59, Richard Henderson wrote:

On 12/15/2014 07:36 AM, Jiong Wang wrote:

+	char buf[64];
+	uint64_t val = ((uint64_t) 1) << UINTVAL (operands[1]);
+	sprintf (buf, "tst\t%%<w>0, %"PRId64, val);
+	output_asm_insn (buf, operands);
+	return "<bcond>\t%l2";

Better to simply modify the operand, as in

   operands[1] = GEN_INT (HOST_WIDE_INT_1U << UINTVAL (operands[1]));
   return "tst\t%<w>0, %1\;<bcond>\t%l2";


thanks, fixed.

ok for trunk ?

gcc/

2015-01-19 Ramana radhakrishnanramana.radhakrish...@arm.com
   Jiong wangjiong.w...@arm.com

   * config/aarch64/aarch64.md (tb<optab><mode>1): Clobber CC reg instead of
scratch reg.
   (cb<optab><mode>1): Likewise.
   * config/aarch64/iterators.md (bcond): New define_code_attr.
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 597ff8c..1e00396 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -466,13 +466,17 @@
 		   (const_int 0))
 	     (label_ref (match_operand 2 "" ""))
 	     (pc)))
-   (clobber (match_scratch:DI 3 "=r"))]
+   (clobber (reg:CC CC_REGNUM))]
   ""
-  "*
-  if (get_attr_length (insn) == 8)
-    return \"ubfx\\t%<w>3, %<w>0, %1, #1\;<cbz>\\t%<w>3, %l2\";
-  return \"<tbz>\\t%<w>0, %1, %l2\";
-  "
+  {
+    if (get_attr_length (insn) == 8)
+      {
+	operands[1] = GEN_INT (HOST_WIDE_INT_1U << UINTVAL (operands[1]));
+	return "tst\t%<w>0, %1\;<bcond>\t%l2";
+      }
+    else
+      return "<tbz>\t%<w>0, %1, %l2";
+  }
   [(set_attr "type" "branch")
    (set (attr "length")
 	(if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -32768))
@@ -486,13 +490,21 @@
  (const_int 0))
 			   (label_ref (match_operand 1 "" ""))
 			   (pc)))
-   (clobber (match_scratch:DI 2 "=r"))]
+   (clobber (reg:CC CC_REGNUM))]
   ""
-  "*
-  if (get_attr_length (insn) == 8)
-    return \"ubfx\\t%<w>2, %<w>0, <sizem1>, #1\;<cbz>\\t%<w>2, %l1\";
-  return \"<tbz>\\t%<w>0, <sizem1>, %l1\";
-  "
+  {
+    if (get_attr_length (insn) == 8)
+      {
+	char buf[64];
+	uint64_t val = ((uint64_t ) 1)
+			<< (GET_MODE_SIZE (<MODE>mode) * BITS_PER_UNIT - 1);
+	sprintf (buf, "tst\t%%<w>0, %"PRId64, val);
+	output_asm_insn (buf, operands);
+	return "<bcond>\t%l1";
+      }
+    else
+      return "<tbz>\t%<w>0, <sizem1>, %l1";
+  }
   [(set_attr "type" "branch")
    (set (attr "length")
 	(if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -32768))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7dd3917..bd144f9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -823,6 +823,9 @@
 		  (smax "s") (umax "u")
 		  (smin "s") (umin "u")])
 
+;; Emit conditional branch instructions.
+(define_code_attr bcond [(eq "beq") (ne "bne") (lt "bne") (ge "beq")])
+
 ;; Emit cbz/cbnz depending on comparison type.
 (define_code_attr cbz [(eq "cbz") (ne "cbnz") (lt "cbnz") (ge "cbz")])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/long_range_bit_test_branch_1.c b/gcc/testsuite/gcc.target/aarch64/long_range_bit_test_branch_1.c
new file mode 100644
index 000..d4782e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/long_range_bit_test_branch_1.c
@@ -0,0 +1,166 @@
+int dec (int);
+
+#define CASE_ENTRY(n) \
+  case n: \
+sum = a / n; \
+sum = sum * (n - 1); \
+sum = dec (sum); \
+sum = sum / (n + 1); \
+sum = dec (sum); \
+sum = sum / (n + 2); \
+sum = dec (sum); \
+sum = sum / (n + 3); \
+sum = dec (sum); \
+sum = sum / (n + 4); \
+sum = dec (sum); \
+sum = sum / (n + 5); \
+sum = dec (sum); \
+sum = sum / (n + 6); \
+sum = dec (sum); \
+sum = sum / (n + 7); \
+sum = dec (sum); \
+sum = sum / (n + 8); \
+sum = dec (sum); \
+sum = sum / (n + 9); \
+sum = dec (sum); \
+sum = sum / (n + 10); \
+sum = dec (sum); \
+sum = sum / (n + 11); \
+sum = dec (sum); \
+sum = sum / (n + 12); \
+sum = dec (sum); \
+sum = sum / (n + 13); \
+sum = dec (sum); \
+sum = sum / (n + 14); \
+sum = dec (sum); \
+sum = sum / (n + 15); \
+sum = dec (sum); \
+sum = sum / (n + 16); \
+sum = dec (sum); \
+sum = sum / (n + 17); \
+sum = dec (sum); \
+sum = sum / (n + 18); \
+sum = dec (sum); \
+sum = sum / (n + 19); \
+sum = dec (sum); \
+sum = sum / (n + 20); \
+sum = dec (sum); \
+sum = sum / (n + 21); \
+sum = dec (sum); \
+sum = sum / (n + 22); \
+sum = dec (sum); \
+sum = sum / (n + 23); \
+sum = dec (sum); \
+sum = sum / (n + 24); \
+sum = dec (sum); \
+sum = sum / (n + 25); \
+sum = dec (sum); \
+sum = sum / (n + 26); \
+sum = dec (sum); \
+sum = sum / (n + 27); \
+sum = dec (sum); \
+sum = sum / (n + 28); \
+sum = dec (sum); \
+sum = sum / (n + 29); \
+sum = dec (sum); \
+sum = sum / (n + 30); \
+sum = dec (sum); \
+sum = sum / (n + 31); \
+break;
+
+int
+cbranch (int a, int b, int c, int d, long long addend)
+{
+  long long sum;
+  if (a & 0x2)
+{
+start:
+  sum = b * c;
+  sum = sum + 

Re: [PATCH][AArch64] Improve bit-test-branch pattern to avoid unnecessary register clobber

2015-01-19 Thread Jiong Wang


On 19/01/15 10:34, Jakub Jelinek wrote:

On Mon, Jan 19, 2015 at 10:28:47AM +, Jiong Wang wrote:

On 14/01/15 22:59, Richard Henderson wrote:

On 12/15/2014 07:36 AM, Jiong Wang wrote:

+	char buf[64];
+	uint64_t val = ((uint64_t) 1) << UINTVAL (operands[1]);
+	sprintf (buf, "tst\t%%<w>0, %"PRId64, val);
+	output_asm_insn (buf, operands);
+	return "<bcond>\t%l2";

Better to simply modify the operand, as in

   operands[1] = GEN_INT (HOST_WIDE_INT_1U << UINTVAL (operands[1]));
   return "tst\t%<w>0, %1\;<bcond>\t%l2";

thanks, fixed.

ok for trunk ?

gcc/

2015-01-19 Ramana radhakrishnanramana.radhakrish...@arm.com
Jiong wangjiong.w...@arm.com

That is not the right name/email format for ChangeLog entries.


--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/long_range_bit_test_branch_1.c

What is aarch64 specific on the testcase?  Best would be to turn it into
an executable testcase (add __attribute__((noinline, noclone)) to dec and
define it somehow, perhaps with an asm volatile with a memory clobber in it)
to check that it also works fine at runtime, but even if you don't, putting
it into gcc.c-torture/compile/ might be preferable to putting it into the
aarch64-specific dir.

Jakub


Jakub,

  Thanks for the review. As I wrote in the initial email, the testcase
included in the patch is for reproduction/record purposes only; I do not plan
to commit it. It verifies the long-branch situation, but because the code is
quite big it takes a couple of seconds to compile, so I will not commit it.

  ChangeLog updated:

2015-01-19 Ramana Radhakrishnan <ramana.radhakrish...@arm.com>
	   Jiong Wang <jiong.w...@arm.com>

  gcc/
* config/aarch64/aarch64.md (tb<optab><mode>1): Clobber CC reg instead of
scratch reg.
(cb<optab><mode>1): Likewise.
* config/aarch64/iterators.md (bcond): New define_code_attr.
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 597ff8c..1e00396 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -466,13 +466,17 @@
 		   (const_int 0))
 	     (label_ref (match_operand 2 "" ""))
 	     (pc)))
-   (clobber (match_scratch:DI 3 "=r"))]
+   (clobber (reg:CC CC_REGNUM))]
   ""
-  "*
-  if (get_attr_length (insn) == 8)
-    return \"ubfx\\t%<w>3, %<w>0, %1, #1\;<cbz>\\t%<w>3, %l2\";
-  return \"<tbz>\\t%<w>0, %1, %l2\";
-  "
+  {
+    if (get_attr_length (insn) == 8)
+      {
+	operands[1] = GEN_INT (HOST_WIDE_INT_1U << UINTVAL (operands[1]));
+	return "tst\t%<w>0, %1\;<bcond>\t%l2";
+      }
+    else
+      return "<tbz>\t%<w>0, %1, %l2";
+  }
   [(set_attr "type" "branch")
    (set (attr "length")
 	(if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -32768))
@@ -486,13 +490,21 @@
  (const_int 0))
 			   (label_ref (match_operand 1 "" ""))
 			   (pc)))
-   (clobber (match_scratch:DI 2 "=r"))]
+   (clobber (reg:CC CC_REGNUM))]
   ""
-  "*
-  if (get_attr_length (insn) == 8)
-    return \"ubfx\\t%<w>2, %<w>0, <sizem1>, #1\;<cbz>\\t%<w>2, %l1\";
-  return \"<tbz>\\t%<w>0, <sizem1>, %l1\";
-  "
+  {
+    if (get_attr_length (insn) == 8)
+      {
+	char buf[64];
+	uint64_t val = ((uint64_t ) 1)
+			<< (GET_MODE_SIZE (<MODE>mode) * BITS_PER_UNIT - 1);
+	sprintf (buf, "tst\t%%<w>0, %"PRId64, val);
+	output_asm_insn (buf, operands);
+	return "<bcond>\t%l1";
+      }
+    else
+      return "<tbz>\t%<w>0, <sizem1>, %l1";
+  }
   [(set_attr "type" "branch")
    (set (attr "length")
 	(if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -32768))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7dd3917..bd144f9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -823,6 +823,9 @@
 		  (smax "s") (umax "u")
 		  (smin "s") (umin "u")])
 
+;; Emit conditional branch instructions.
+(define_code_attr bcond [(eq "beq") (ne "bne") (lt "bne") (ge "beq")])
+
 ;; Emit cbz/cbnz depending on comparison type.
 (define_code_attr cbz [(eq "cbz") (ne "cbnz") (lt "cbnz") (ge "cbz")])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/long_range_bit_test_branch_1.c b/gcc/testsuite/gcc.target/aarch64/long_range_bit_test_branch_1.c
new file mode 100644
index 000..d4782e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/long_range_bit_test_branch_1.c
@@ -0,0 +1,166 @@
+int dec (int);
+
+#define CASE_ENTRY(n) \
+  case n: \
+sum = a / n; \
+sum = sum * (n - 1); \
+sum = dec (sum); \
+sum = sum / (n + 1); \
+sum = dec (sum); \
+sum = sum / (n + 2); \
+sum = dec (sum); \
+sum = sum / (n + 3); \
+sum = dec (sum); \
+sum = sum / (n + 4); \
+sum = dec (sum); \
+sum = sum / (n + 5); \
+sum = dec (sum); \
+sum = sum / (n + 6); \
+sum = dec (sum); \
+sum = sum / (n + 7); \
+sum = dec (sum); \
+sum = sum / (n + 8); \
+sum = dec (sum); \
+sum = sum / (n + 9); \
+sum = dec (sum); \
+sum = sum / (n + 10); \
+sum = dec (sum); \
+sum = sum / (n + 11); \
+sum = dec (sum); \
+sum = sum / (n + 12); \
+sum = dec (sum); \
+sum = sum / (n + 13); 

Re: [PATCH][AArch64] Improve bit-test-branch pattern to avoid unnecessary register clobber

2015-01-19 Thread Ramana Radhakrishnan
 What is aarch64 specific on the testcase?


The number of if-then-else's required to get the compiler to generate
cmp branch sequences rather than the tbnz instruction.

Ramana


Re: [PATCH][AArch64] Improve bit-test-branch pattern to avoid unnecessary register clobber

2015-01-14 Thread Richard Henderson
On 12/15/2014 07:36 AM, Jiong Wang wrote:
 +	char buf[64];
 +	uint64_t val = ((uint64_t) 1) << UINTVAL (operands[1]);
 +	sprintf (buf, "tst\t%%<w>0, %"PRId64, val);
 +	output_asm_insn (buf, operands);
 +	return "<bcond>\t%l2";

Better to simply modify the operand, as in

  operands[1] = GEN_INT (HOST_WIDE_INT_1U << UINTVAL (operands[1]));
  return "tst\t%<w>0, %1\;<bcond>\t%l2";
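
A quick standalone illustration of what that operand rewrite computes, using
made-up values (in the pattern itself GEN_INT wraps the result as an RTL
constant; this little program is not from the patch):

/* For a bit index of 5 the operand becomes the mask 1 << 5 = 0x20, so the
   long-range form is emitted as "tst w0, 32" followed by the conditional
   branch selected by <bcond>.  */
#include <inttypes.h>
#include <stdio.h>

int
main (void)
{
  uint64_t bit_index = 5;                     /* stands in for UINTVAL (operands[1]) */
  uint64_t mask = UINT64_C (1) << bit_index;  /* stands in for HOST_WIDE_INT_1U << ... */
  printf ("tst\tw0, %" PRIu64 "\n", mask);    /* prints: tst	w0, 32 */
  return 0;
}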


r~


Re: [PATCH][AArch64] Improve bit-test-branch pattern to avoid unnecessary register clobber

2015-01-14 Thread Jiong Wang

On 15/12/14 15:36, Jiong Wang wrote:


from the discussion here
https://gcc.gnu.org/ml/gcc-patches/2014-11/msg01949.html

the other problem it exposed is the unnecessary clobber of register x19,
which is a callee-saved register, so there are unnecessary push/pops in the
prologue/epilogue.

the reason comes from the following patterns:

(define_insn "tb<optab><mode>1"
(define_insn "cb<optab><mode>1"

they always declare (clobber (match_scratch:DI 3 "=r")) while that register
is used only when get_attr_length (insn) == 8.

actually, we could clobber the CC register instead of a scratch register to
avoid wasting general-purpose registers.

this patch fixes this, and gives a slight improvement on spec2k.
bootstrap OK, no regression on aarch64 bare-metal.

ok for trunk?

the testcase included in the patch is for verification purposes only.
it verifies the long branch situation, but because the code is very big
it takes a couple of seconds to compile, so I will not commit it.

gcc/
2014-12-15 Ramana Radhakrishnan <ramana.radhakrish...@arm.com>
  Jiong Wang <jiong.w...@arm.com>

* config/aarch64/aarch64.md (tb<optab><mode>1): Clobber CC reg
instead of scratch reg.
(cb<optab><mode>1): Likewise.
* config/aarch64/iterators.md (bcond): New define_code_attr.


Ping~




[PATCH][AArch64] Improve bit-test-branch pattern to avoid unnecessary register clobber

2014-12-15 Thread Jiong Wang

from the discussion here
  https://gcc.gnu.org/ml/gcc-patches/2014-11/msg01949.html

the other problem it exposed is the unnecessary clobber of register x19,
which is a callee-saved register, so there are unnecessary push/pops in the
prologue/epilogue.

the reason comes from the following patterns:

(define_insn "tb<optab><mode>1"
(define_insn "cb<optab><mode>1"

they always declare (clobber (match_scratch:DI 3 "=r")) while that register
is used only when get_attr_length (insn) == 8.

actually, we could clobber the CC register instead of a scratch register to
avoid wasting general-purpose registers.
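
For illustration only (not text from the patch): a tiny example of the kind
of source that maps onto these patterns, with the before/after difference
described in comments; the function names here are made up.

/* A single bit-test-and-branch like this is matched by tb<optab><mode>1.
   Before the patch the pattern unconditionally reserved a DI scratch
   register for the rarely-used long-branch form (ubfx + cbz); if that
   scratch ended up in a callee-saved register such as x19, the function
   also paid for an extra save/restore in its prologue/epilogue.  After
   the patch the long form is tst + conditional branch, so only the
   condition flags are clobbered and no general-purpose register is
   reserved.  */
extern void handle_bit_set (void);

void
check_bit (unsigned long x)
{
  if (x & (1UL << 3))   /* normally a single tbnz/tbz on AArch64 */
    handle_bit_set ();
}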

this patch fixes this, and gives a slight improvement on spec2k.
bootstrap OK, no regression on aarch64 bare-metal.

ok for trunk?

the testcase included in the patch is for verification purposes only.
it verifies the long branch situation, but because the code is very big
it takes a couple of seconds to compile, so I will not commit it.

gcc/
  2014-12-15 Ramana Radhakrishnan <ramana.radhakrish...@arm.com>
Jiong Wang <jiong.w...@arm.com>

  * config/aarch64/aarch64.md (tb<optab><mode>1): Clobber CC reg
instead of scratch reg.

  (cb<optab><mode>1): Likewise.
  * config/aarch64/iterators.md (bcond): New define_code_attr.

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 597ff8c..abf8e3f 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -466,13 +466,20 @@
 		   (const_int 0))
 	     (label_ref (match_operand 2 "" ""))
 	     (pc)))
-   (clobber (match_scratch:DI 3 "=r"))]
+   (clobber (reg:CC CC_REGNUM))]
   ""
-  "*
-  if (get_attr_length (insn) == 8)
-    return \"ubfx\\t%<w>3, %<w>0, %1, #1\;<cbz>\\t%<w>3, %l2\";
-  return \"<tbz>\\t%<w>0, %1, %l2\";
-  "
+  {
+    if (get_attr_length (insn) == 8)
+      {
+	char buf[64];
+	uint64_t val = ((uint64_t) 1) << UINTVAL (operands[1]);
+	sprintf (buf, "tst\t%%<w>0, %"PRId64, val);
+	output_asm_insn (buf, operands);
+	return "<bcond>\t%l2";
+      }
+    else
+      return "<tbz>\t%<w>0, %1, %l2";
+  }
   [(set_attr "type" "branch")
    (set (attr "length")
 	(if_then_else (and (ge (minus (match_dup 2) (pc)) (const_int -32768))
@@ -486,13 +493,21 @@
  (const_int 0))
 			   (label_ref (match_operand 1 "" ""))
 			   (pc)))
-   (clobber (match_scratch:DI 2 "=r"))]
+   (clobber (reg:CC CC_REGNUM))]
   ""
-  "*
-  if (get_attr_length (insn) == 8)
-    return \"ubfx\\t%<w>2, %<w>0, <sizem1>, #1\;<cbz>\\t%<w>2, %l1\";
-  return \"<tbz>\\t%<w>0, <sizem1>, %l1\";
-  "
+  {
+    if (get_attr_length (insn) == 8)
+      {
+	char buf[64];
+	uint64_t val = ((uint64_t ) 1)
+			<< (GET_MODE_SIZE (<MODE>mode) * BITS_PER_UNIT - 1);
+	sprintf (buf, "tst\t%%<w>0, %"PRId64, val);
+	output_asm_insn (buf, operands);
+	return "<bcond>\t%l1";
+      }
+    else
+      return "<tbz>\t%<w>0, <sizem1>, %l1";
+  }
   [(set_attr "type" "branch")
    (set (attr "length")
 	(if_then_else (and (ge (minus (match_dup 1) (pc)) (const_int -32768))
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7dd3917..bd144f9 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -823,6 +823,9 @@
 		  (smax "s") (umax "u")
 		  (smin "s") (umin "u")])
 
+;; Emit conditional branch instructions.
+(define_code_attr bcond [(eq "beq") (ne "bne") (lt "bne") (ge "beq")])
+
 ;; Emit cbz/cbnz depending on comparison type.
 (define_code_attr cbz [(eq "cbz") (ne "cbnz") (lt "cbnz") (ge "cbz")])
 
diff --git a/gcc/testsuite/gcc.target/aarch64/long_range_bit_test_branch_1.c b/gcc/testsuite/gcc.target/aarch64/long_range_bit_test_branch_1.c
new file mode 100644
index 000..d4782e9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/long_range_bit_test_branch_1.c
@@ -0,0 +1,166 @@
+int dec (int);
+
+#define CASE_ENTRY(n) \
+  case n: \
+sum = a / n; \
+sum = sum * (n - 1); \
+sum = dec (sum); \
+sum = sum / (n + 1); \
+sum = dec (sum); \
+sum = sum / (n + 2); \
+sum = dec (sum); \
+sum = sum / (n + 3); \
+sum = dec (sum); \
+sum = sum / (n + 4); \
+sum = dec (sum); \
+sum = sum / (n + 5); \
+sum = dec (sum); \
+sum = sum / (n + 6); \
+sum = dec (sum); \
+sum = sum / (n + 7); \
+sum = dec (sum); \
+sum = sum / (n + 8); \
+sum = dec (sum); \
+sum = sum / (n + 9); \
+sum = dec (sum); \
+sum = sum / (n + 10); \
+sum = dec (sum); \
+sum = sum / (n + 11); \
+sum = dec (sum); \
+sum = sum / (n + 12); \
+sum = dec (sum); \
+sum = sum / (n + 13); \
+sum = dec (sum); \
+sum = sum / (n + 14); \
+sum = dec (sum); \
+sum = sum / (n + 15); \
+sum = dec (sum); \
+sum = sum / (n + 16); \
+sum = dec (sum); \
+sum = sum / (n + 17); \
+sum = dec (sum); \
+sum = sum / (n + 18); \
+sum = dec (sum); \
+sum = sum / (n + 19); \
+sum = dec (sum); \
+sum = sum / (n + 20); \
+sum = dec (sum); \
+sum = sum / (n + 21); \
+sum = dec (sum); \
+sum = sum / (n + 22); \
+sum = dec (sum); \
+sum = sum / (n + 23); \
+sum = dec (sum); \
+sum = sum / (n + 24); \
+