date:20151210


On 12/10/2015 05:52 PM, Martin Sebor wrote:


The updated patch is attached.


Ok.


Bernd

[PATCH 2/2] [graphite] update required isl versions

2015-12-10 Thread Sebastian Pop

we now check the isl version, as there are no real differences in existing files
in between isl 0.14 and isl 0.15.
---
 config/isl.m4 |  41 +++--
 configure | 112 --
 2 files changed, 123 insertions(+), 30 deletions(-)

diff --git a/config/isl.m4 b/config/isl.m4
index 459fac1..886b0e4 100644
--- a/config/isl.m4
+++ b/config/isl.m4
@@ -19,23 +19,23 @@
 
 # ISL_INIT_FLAGS ()
 # -
-# Provide configure switches for ISL support.
+# Provide configure switches for isl support.
 # Initialize isllibs/islinc according to the user input.
 AC_DEFUN([ISL_INIT_FLAGS],
 [
   AC_ARG_WITH([isl-include],
 [AS_HELP_STRING(
   [--with-isl-include=PATH],
-  [Specify directory for installed ISL include files])])
+  [Specify directory for installed isl include files])])
   AC_ARG_WITH([isl-lib],
 [AS_HELP_STRING(
   [--with-isl-lib=PATH],
-  [Specify the directory for the installed ISL library])])
+  [Specify the directory for the installed isl library])])
 
   AC_ARG_ENABLE(isl-version-check,
 [AS_HELP_STRING(
   [--disable-isl-version-check],
-  [disable check for ISL version])],
+  [disable check for isl version])],
 ENABLE_ISL_CHECK=$enableval,
 ENABLE_ISL_CHECK=yes)
   
@@ -58,15 +58,15 @@ AC_DEFUN([ISL_INIT_FLAGS],
   if test "x${with_isl_lib}" != x; then
 isllibs="-L$with_isl_lib"
   fi
-  dnl If no --with-isl flag was specified and there is in-tree ISL
+  dnl If no --with-isl flag was specified and there is in-tree isl
   dnl source, set up flags to use that and skip any version tests
-  dnl as we cannot run them before building ISL.
+  dnl as we cannot run them before building isl.
   if test "x${islinc}" = x && test "x${isllibs}" = x \
  && test -d ${srcdir}/isl; then
 isllibs='-L$$r/$(HOST_SUBDIR)/isl/'"$lt_cv_objdir"' '
 islinc='-I$$r/$(HOST_SUBDIR)/isl/include -I$$s/isl/include'
 ENABLE_ISL_CHECK=no
-AC_MSG_WARN([using in-tree ISL, disabling version check])
+AC_MSG_WARN([using in-tree isl, disabling version check])
   fi
 
   isllibs="${isllibs} -lisl"
@@ -75,7 +75,7 @@ AC_DEFUN([ISL_INIT_FLAGS],
 
 # ISL_REQUESTED (ACTION-IF-REQUESTED, ACTION-IF-NOT)
 # 
-# Provide actions for failed ISL detection.
+# Provide actions for failed isl detection.
 AC_DEFUN([ISL_REQUESTED],
 [
   AC_REQUIRE([ISL_INIT_FLAGS])
@@ -106,12 +106,31 @@ AC_DEFUN([ISL_CHECK_VERSION],
 LDFLAGS="${_isl_saved_LDFLAGS} ${isllibs}"
 LIBS="${_isl_saved_LIBS} -lisl"
 
-AC_MSG_CHECKING([for compatible ISL])
-AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], [[;]])],
+AC_MSG_CHECKING([for recommended isl 0.15])
+AC_TRY_RUN([#include 
+#include 
+int main() {
+  if (strncmp (_GENERATED_STDINT_H, "isl 0.15", 8))
+return 1;
+  return 0;
+}],
[gcc_cv_isl=yes],
-   [gcc_cv_isl=no])
+   [gcc_cv_isl=no], [gcc_cv_isl=no])
 AC_MSG_RESULT([$gcc_cv_isl])
 
+if test "${gcc_cv_isl}" = no ; then
+   AC_MSG_CHECKING([for deprecated isl 0.14])
+   AC_TRY_RUN([#include 
+   #include 
+   int main() {
+ if (strncmp (_GENERATED_STDINT_H, "isl 0.14", 8))
+   return 1;
+ return 0;
+   }],
+   [gcc_cv_isl=yes],
+   [gcc_cv_isl=no], [gcc_cv_isl=no])
+AC_MSG_RESULT([$gcc_cv_isl, recommended isl version is 0.15, minimum 
required isl version 0.14 is deprecated])
+fi
 CFLAGS=$_isl_saved_CFLAGS
 LDFLAGS=$_isl_saved_LDFLAGS
 LIBS=$_isl_saved_LIBS
diff --git a/configure b/configure
index 090615f..4284ba7 100755
--- a/configure
+++ b/configure
@@ -1492,7 +1492,7 @@ Optional Features:
   build static libjava [default=no]
   --enable-bootstrap  enable bootstrapping [yes if native build]
   --disable-isl-version-check
-  disable check for ISL version
+  disable check for isl version
   --enable-ltoenable link time optimization support
   --enable-linker-plugin-configure-flags=FLAGS
   additional flags for configuring linker plugins
@@ -1553,8 +1553,8 @@ Optional Packages:
   package. Equivalent to
   --with-isl-include=PATH/include plus
   --with-isl-lib=PATH/lib
-  --with-isl-include=PATH Specify directory for installed ISL include files
-  --with-isl-lib=PATH Specify the directory for the installed ISL library
+  --with-isl-include=PATH Specify directory for installed isl include files
+  --with-isl-lib=PATH Specify the directory for the installed isl library
   --with-build-sysroot=SYSROOT
   use sysroot as the system root during the build

[PATCH 1/2] [graphite] document minimal required version for isl

2015-12-10 Thread Sebastian Pop

also update ISL to isl as requested by its author Sven Verdoolaege.
---
 gcc/doc/install.texi | 9 +
 gcc/doc/invoke.texi  | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/gcc/doc/install.texi b/gcc/doc/install.texi
index 0b71bef..b43a3ec 100644
--- a/gcc/doc/install.texi
+++ b/gcc/doc/install.texi
@@ -383,14 +383,15 @@ installed but it is not in your default library search 
path, the
 @option{--with-mpc} configure option should be used.  See also
 @option{--with-mpc-lib} and @option{--with-mpc-include}.
 
-@item ISL Library version 0.15 or 0.14.
+@item isl Library version 0.15 or 0.14.
 
+Minimal isl version supported is 0.14 and it is highly recommended to use 0.15.
 Necessary to build GCC with the Graphite loop optimizations.
 It can be downloaded from @uref{ftp://gcc.gnu.org/pub/gcc/infrastructure/}.
-If an ISL source distribution is found
+If an isl source distribution is found
 in a subdirectory of your GCC sources named @file{isl}, it will be
 built together with GCC.  Alternatively, the @option{--with-isl} configure
-option should be used if ISL is not installed in your default library
+option should be used if isl is not installed in your default library
 search path.
 
 @end table
@@ -1850,7 +1851,7 @@ a cross compiler, they will not be used to configure 
target libraries.
 @item --with-isl=@var{pathname}
 @itemx --with-isl-include=@var{pathname}
 @itemx --with-isl-lib=@var{pathname}
-If you do not have the ISL library installed in a standard location and you
+If you do not have the isl library installed in a standard location and you
 want to build GCC, you can explicitly specify the directory where it is
 installed (@samp{--with-isl=@/@var{islinstalldir}}). The
 @option{--with-isl=@/@var{islinstalldir}} option is shorthand for
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5256031..7ae0849 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -8939,12 +8939,12 @@ Enable the identity transformation for graphite.  For 
every SCoP we generate
 the polyhedral representation and transform it back to gimple.  Using
 @option{-fgraphite-identity} we can check the costs or benefits of the
 GIMPLE -> GRAPHITE -> GIMPLE transformation.  Some minimal optimizations
-are also performed by the code generator ISL, like index splitting and
+are also performed by the code generator isl, like index splitting and
 dead code elimination in loops.
 
 @item -floop-nest-optimize
 @opindex floop-nest-optimize
-Enable the ISL based loop nest optimizer.  This is a generic loop nest
+Enable the isl based loop nest optimizer.  This is a generic loop nest
 optimizer based on the Pluto optimization algorithms.  It calculates a loop
 structure optimized for data-locality and parallelism.  This option
 is experimental.
-- 
1.9.1

Re: [PATCH] testsuite/lib/multline.exp: show test name and line numbers


On 12/10/2015 03:56 PM, David Malcolm wrote:

The following patch updates multiline.exp to use the global
   $testname_with_flags
as a prefix in such results.

I also dropped the printing of the index in favor of printing the line
numbers enclosed within dg-{begin|end}-multiline-output.

After the patch, we get result lines like this:
  PASS: gcc.dg/plugin/diagnostic-test-show-locus-bw.c 
-fplugin=./diagnostic_plugin_test_show_locus.so  expected multiline pattern lines 17-18 
was found: "\s*myvar = myvar\.x;.*\n   ~\^~\n"


This seems to be an improvement, so OK. I'm not sure what the value of 
printing the regexp in the output is, I tend to see it as clutter and 
would favour another patch to remove it.



Bernd

Re: [RFA] [PATCH] [PR tree-optimization/68619] Avoid direct cfg cleanups in tree-ssa-dom.c [4/3] v2

2015-12-10 Thread Uros Bizjak

> Finally the mechanical changes necessary due to the API change in the walker.

You forgot to change the graphite part, as in the attached patch.

2015-12-10  Uros Bizjak  

PR tree-optimization/68619
* graphite-scop-detection.c (gather_bbs::before_dom_children):
Change return type to an edge.  Always return NULL.

OK for mainline after successful bootstrap and regtest?

Uros.
Index: graphite-scop-detection.c
===
--- graphite-scop-detection.c   (revision 231529)
+++ graphite-scop-detection.c   (working copy)
@@ -1828,7 +1828,7 @@ class gather_bbs : public dom_walker
 public:
   gather_bbs (cdi_direction, scop_p);
 
-  virtual void before_dom_children (basic_block);
+  virtual edge before_dom_children (basic_block);
   virtual void after_dom_children (basic_block);
 
 private:
@@ -1844,11 +1844,11 @@ gather_bbs::gather_bbs (cdi_direction direction, s
 /* Call-back for dom_walk executed before visiting the dominated
blocks.  */
 
-void
+edge
 gather_bbs::before_dom_children (basic_block bb)
 {
   if (!bb_in_sese_p (bb, scop->scop_info->region))
-return;
+return NULL;
 
   gcond *stmt = single_pred_cond_non_loop_exit (bb);
 
@@ -1868,7 +1868,7 @@ gather_bbs::before_dom_children (basic_block bb)
 
   gimple_poly_bb_p gbb = try_generate_gimple_bb (scop, bb);
   if (!gbb)
-return;
+return NULL;
 
   GBB_CONDITIONS (gbb) = conditions.copy ();
   GBB_CONDITION_CASES (gbb) = cases.copy ();
@@ -1880,6 +1880,7 @@ gather_bbs::before_dom_children (basic_block bb)
   data_reference_p dr;
   FOR_EACH_VEC_ELT (gbb->data_refs, i, dr)
 scop->drs.safe_push (dr_info (dr, pbb));
+  return NULL;
 }
 
 /* Call-back for dom_walk executed after visiting the dominated

Re: [PATCH 4/4] Add -Wmisleading-indentation to -Wall

2015-12-10 Thread David Malcolm

On Thu, 2015-10-29 at 11:38 -0600, Jeff Law wrote:
> On 10/29/2015 10:49 AM, David Malcolm wrote:
> > Our documentation describes -Wall as enabling "all the warnings about
> > constructions that some users consider questionable, and that are easy to 
> > avoid
> > (or modify to prevent the warning), even in conjunction with macros."
> >
> > I believe that -Wmisleading-indentation meets these criteria, and is
> > likely to be of benefit to users who may not read release notes; it
> > warns for indentation that's misleading, but not for indentation
> > that's merely bad: the former are places where a user will likely
> > want to fix the code.
> >
> > The fix is usually easy and obvious: fix the misleadingly-indented
> > code.  If that isn't an option for some reason, pragmas can be used to
> > turn off the warning for a particular fragment of code:
> >
> >#pragma GCC diagnostic push
> >#pragma GCC diagnostic ignored "-Wmisleading-indentation"
> >  if (flag)
> >x = 3;
> >y = 2;
> >#pragma GCC diagnostic pop
> >
> > -Wmisleading-indentation has been tested with a variety of indentation
> > styles (see gcc/testsuite/c-c++-common/Wmisleading-indentation.c)
> > and on a variety of real-world projects.  For example, in:
> >https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg119790.html
> > Patrick reports:
> > "Tested by building the linux, git, vim, sqlite and gdb-binutils sources
> >   with -Wmisleading-indentation."
> >
> > With the tweak earlier in this kit I believe we now have a good
> > enough signal:noise ratio for this warning to be widely used; hence this
> > patch adds the warning to -Wall.
> >
> > Bootstrapped with x86_64-pc-linux-gnu.
> >
> > OK for trunk?
> >
> > gcc/c-family/ChangeLog:
> > * c.opt (Wmisleading-indentation): Add to -Wall for C and C++.
> >
> > gcc/ChangeLog:
> > * doc/invoke.texi (-Wall): Add -Wmisleading-indentation to the
> > list.
> > (-Wmisleading-indentation): Update documentation to reflect
> > being enabled by -Wall in C/C++.
> I'm sure we'll get some grief for this :-)
> 
> Approved once we're clean in GCC.  I'm going to explicitly say that 
> we'll have to watch for fallout, particularly as we start getting 
> feedback from Debian & Fedora mass-rebuilds as we approach release time. 
>   If the fallout is too bad, we'll have to reconsider.

I believe we're now clean; I've committed this to trunk (*without* the
blank-lines heuristic/levels idea) as r231539, having
bootstrapped on x86_64-pc-linux-gnu (I've also successfully
performed an all-configs build with it, although that was a while back
now).

> I'll pre-approve patches which fix anything caught by this option in GCC 
> as long as the fix just adjusts whitespace :-)
> 
> jeff
> 
>

Re: [PATCH] update -Wall and -Wextra documentation

2015-12-10 Thread Martin Sebor


On 12/10/2015 04:57 AM, Bernd Schmidt wrote:

On 12/10/2015 12:07 AM, Martin Sebor wrote:


* invoke.texi (Warning Options): Update -Wall options.  Clarify
when some -Wextra options are enabled.  Add -Wplacement-new example.


I tried to check this list against c.opt - I figure this should contain
essentially the ones that are have an EnabledBy(Wall), plus whatever
logic there is in c-opts.


Thanks for double-checking me!  FWIW, I did something similar.
I also verified those I wasn't sure about them using simple
examples, but it's possible I overlooked something.




+-Wduplicated-cond  @gol


I don't see this one as enabled by Wall, and


Yes, this one should not be there.  I removed it in the updated
patch.  (Note the first patch didn't add it, just moved it up
the list).

Incidentally, testing this using the example from the manual
triggers an ICE.  I raised c/68839 for the record and closed
it as a duplicate of c/68473.




+-Wplacement-new @r{(only for C++)} @gol


This one appears to be on by default?


Hmm.  You're right, it is.  I'm not sure that's what we want
but let me confirm that with Jason and deal with it separately.
I've corrected the patch to reflect the status quo.




+In C++, this warning is also enabled by @option{-Wall}.  In C, it is
also
+enabled by @option{-Wextra}; to get the other warnings of
@option{-Wextra}
+without this warning, use @option{-Wextra -Wno-sign-compare}.


Is the last part of the sentence really necessary? It kind of follows
from the rest of the documentation and we don't spell this out for other
-Wextra options.


I don't think explaining how to turn the option off is necessary
or adds any value but the text was already there and I didn't want
to make gratuitous changes unrelated to the main purpose of the
patch.  But if you think it's appropriate I'm fine with removing
it and have done that in the updated patch.

The updated patch is attached.

Martin
2015-12-09  Martin Sebor  

	* invoke.texi (Warning Options): Update -Wall options.  Clarify
	when some -Wextra options are enabled.  Add -Wplacement-new example.

Index: invoke.texi
===
--- invoke.texi	(revision 231525)
+++ invoke.texi	(working copy)
@@ -3542,18 +3542,22 @@
 
 @gccoptlist{-Waddress   @gol
 -Warray-bounds=1 @r{(only with} @option{-O2}@r{)}  @gol
+-Wbool-compare  @gol
 -Wc++11-compat  -Wc++14-compat@gol
 -Wchar-subscripts  @gol
+-Wcomment  @gol
 -Wenum-compare @r{(in C/ObjC; this is on by default in C++)} @gol
+-Wformat   @gol
+-Wimplicit @r{(C and Objective-C only)} @gol
 -Wimplicit-int @r{(C and Objective-C only)} @gol
 -Wimplicit-function-declaration @r{(C and Objective-C only)} @gol
--Wbool-compare  @gol
--Wduplicated-cond  @gol
--Wcomment  @gol
--Wformat   @gol
+-Winit-self @r{(only for C++)} @gol
+-Wlogical-not-parentheses
 -Wmain @r{(only for C/ObjC and unless} @option{-ffreestanding}@r{)}  @gol
 -Wmaybe-uninitialized @gol
+-Wmemset-transposed-args @gol
 -Wmissing-braces @r{(only for C/ObjC)} @gol
+-Wnarrowing @r{(only for C++)}  @gol
 -Wnonnull  @gol
 -Wopenmp-simd @gol
 -Wparentheses  @gol
@@ -3562,6 +3566,7 @@
 -Wreturn-type  @gol
 -Wsequence-point  @gol
 -Wsign-compare @r{(only in C++)}  @gol
+-Wsizeof-pointer-memaccess @gol
 -Wstrict-aliasing  @gol
 -Wstrict-overflow=1  @gol
 -Wswitch  @gol
@@ -3599,10 +3604,10 @@
 -Wmissing-parameter-type @r{(C only)}  @gol
 -Wold-style-declaration @r{(C only)}  @gol
 -Woverride-init  @gol
--Wsign-compare  @gol
+-Wsign-compare @r{(C only)} @gol
 -Wtype-limits  @gol
 -Wuninitialized  @gol
--Wshift-negative-value  @gol
+-Wshift-negative-value @r{(in C++03 and in C99 and newer)}  @gol
 -Wunused-parameter @r{(only with} @option{-Wunused} @r{or} @option{-Wall}@r{)} @gol
 -Wunused-but-set-parameter @r{(only with} @option{-Wunused} @r{or} @option{-Wall}@r{)}  @gol
 }
@@ -4589,7 +4594,6 @@
 if (p->q != NULL) @{ @dots{} @}
 else if (p->q != NULL) @{ @dots{} @}
 @end smallexample
-This warning is enabled by @option{-Wall}.
 
 @item -Wframe-address
 @opindex Wno-frame-address
@@ -4896,8 +4900,15 @@
 @opindex Wno-placement-new
 Warn about placement new expressions with undefined behavior, such as
 constructing an object in a buffer that is smaller than the type of
-the object.
-
+the object.  For example, the placement new expression below is diagnosed
+because it attempts to construct an array of 64 integers in a buffer only
+64 bytes large.
+@smallexample
+char buf [64];
+new (buf) int[64];
+@end smallexample
+This warning is enabled by default.
+  
 @item -Wpointer-arith
 @opindex Wpointer-arith
 @opindex Wno-pointer-arith
@@ -5114,8 +5125,8 @@
 @cindex signed and unsigned values, comparison warning
 Warn when a comparison between signed and unsigned values could produce
 an incorrect result when the signed value is converted to unsigned.
-This warning is also enabled by @option{-Wextra}; to get the other warnings
-of @option{-Wextra} without this warning, use

Re: [hsa 2/10] Modifications to libgomp proper

Hi,

thanks for the feedback.  I have incorporated most of it into the
branch (the diff is below) but  also have a few questions.

On Wed, Dec 09, 2015 at 12:35:36PM +0100, Jakub Jelinek wrote:
> On Mon, Dec 07, 2015 at 12:19:57PM +0100, Martin Jambor wrote:
> > +/* Flag set when the subsequent element in the device-specific argument
> > +   values.  */
> > +#define GOMP_TARGET_ARG_SUBSEQUENT_PARAM   (1 << 7)
> > +
> > +/* Bitmask to apply to a target argument to find out the value identifier. 
> >  */
> > +#define GOMP_TARGET_ARG_ID_MASK(((1 << 8) - 1) << 8)
> > +/* Target argument index of NUM_TEAMS.  */
> > +#define GOMP_TARGET_ARG_NUM_TEAMS  (1 << 8)
> > +/* Target argument index of THREAD_LIMIT.  */
> > +#define GOMP_TARGET_ARG_THREAD_LIMIT   (2 << 8)
> 
> I meant that these two would be just special, passed as the first two
> pointers in the array, without the markup.  Because, otherwise you either
> need to use GOMP_TARGET_ARG_SUBSEQUENT_PARAM for these always, or for 32-bit
> arches and for 64-bit ones shift often at runtime.  Having the markup even
> for them is perhaps cleaner, but less efficient, so if you really want to go
> that way, please make sure you handle it properly for 32-bit pointers
> architectures though.  num_teams or thread_limit could be > 32767 or >
> 65535.

I see, I prefer the clean approach, even if it is more work, this
interface looks like it is going to be extended in the future.  But I
am wondering whether embedding the value into the identifier element
is actually worth it.  The passed array is going to be a small local
variable and I wonder whether there is going to be any benefit in it
having two elements instead of four (or four instead of six for
gridified kernels), especially if it means introducing control flow on
the part of the caller.  But if you really want it that way, I will
implement that.

> 
> > -static void
> > -gomp_target_fallback_firstprivate (void (*fn) (void *), size_t mapnum,
> > -  void **hostaddrs, size_t *sizes,
> > -  unsigned short *kinds)
> > +static void *
> > +gomp_target_unshare_firstprivate (size_t mapnum, void **hostaddrs,
> > + size_t *sizes, unsigned short *kinds)
> >  {
> >size_t i, tgt_align = 0, tgt_size = 0;
> >char *tgt = NULL;
> > @@ -1281,7 +1282,7 @@ gomp_target_fallback_firstprivate (void (*fn) (void 
> > *), size_t mapnum,
> >}
> >if (tgt_align)
> >  {
> > -  tgt = gomp_alloca (tgt_size + tgt_align - 1);
> > +  tgt = gomp_malloc (tgt_size + tgt_align - 1);
> 
> I don't like using gomp_malloc here, either copy/paste the function, or
> create separate inline functions for the two loops, one for the first loop
> which returns you tgt_align and tgt_size, and another for the stuff after
> the allocation.  Then you can use those two inline functions to implement
> both gomp_target_fallback_firstprivate which will use alloca, and
> gomp_target_unshare_firstprivate which will use gomp_malloc instead.

OK, I did that.

> 
> > @@ -1356,6 +1377,11 @@ GOMP_target (int device, void (*fn) (void *), const 
> > void *unused,
> > and several arguments have been added:
> > FLAGS is a bitmask, see GOMP_TARGET_FLAG_* in gomp-constants.h.
> > DEPEND is array of dependencies, see GOMP_task for details.
> > +   ARGS is a pointer to an array consisting of NUM_TEAMS, THREAD_LIMIT and 
> > a
> > +   variable number of device-specific arguments, which always take two 
> > elements
> > +   where the first specifies the type and the second the actual value.  The
> > +   last element of the array is a single NULL.
> 
> Note, here you document NUM_TEAMS and THREAD_LIMIT as special values, not
> encoded.

I have changed the comment but will remember to do it again if
necessary after changing omp-low.c

> 
> > @@ -1473,6 +1508,7 @@ GOMP_target_data (int device, const void *unused, 
> > size_t mapnum,
> >struct gomp_device_descr *devicep = resolve_device (device);
> >  
> >if (devicep == NULL
> > +  || (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
> >|| !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
> 
> Would be nice to have some consistency in the order of capabilities checks.
> Usually you check SHARED_MEM after OPENMP_400, so perhaps do it this way
> here too.

Sure.

> 
> > @@ -1741,23 +1784,38 @@ gomp_target_task_fn (void *data)
> >  
> >if (ttask->state == GOMP_TARGET_TASK_FINISHED)
> > {
> > - gomp_unmap_vars (ttask->tgt, true);
> > + if (ttask->tgt)
> > +   gomp_unmap_vars (ttask->tgt, true);
> >   return false;
> > }
> 
> This doesn't make sense.  For the GOMP_OFFLOAD_CAP_SHARED_MEM case, unless
> you want to run the free (ttask->firstprivate_copies); as a task, you
> shouldn't be queing the target task again for further execution, instead
> it should just be handled like a finished task at that point.  The reason
>

Re: [hsa 0/10] Merge of HSA branch

Hi,

On Mon, Dec 07, 2015 at 12:46:45PM +0100, Jakub Jelinek wrote:
> On Mon, Dec 07, 2015 at 12:17:58PM +0100, Martin Jambor wrote:
> > Because I have not been able to come up with any solution to failing
> > libgomp/testsuite/libgomp.c++/target-2.C, I have disabled use of
> > dynamic parallelism in this merge (I keep it on the branch) and
> > therefore entirely rely on the gridification process to run loops on
> > the accelerator, because gridified constructs do not have this issue
> > (passing private symbols by reference).
> 
> I'm fine with not doing it in this series, but I'd strongly prefer
> if dynamic parallelism is added for GCC 6.1.  Even for PTX we'll need
> some IPA analysis on what functions might run in the various OpenMP
> contexts (teams, parallel, simd) and what functions contains such
> directives, and let the backends (or HSA) do something based on that
> for sharing of the vars, or other properties of the function code
> generation.

6.1 seems a bit ambitious but I will try my best to bring it back as
soon as possible.

> 
> > HSA tests are still missing, I would need some guidance as to how to
> > best implement them (specially to test gridification which of course
> > does not happen for other accelerators).  There are no failing
> > testcases if HSA is not configured.  If it is, there are some, all of
> > which fall into one the following categories:
> > 
> >   1) HSA cannot compile a function for one reason or another (most
> >  common cause is inability of HSA to take an address of a function
> >  or make an indirect call) and gives a warning, which is regarded
> >  as an "excess error" by dejagnu.
> 
> It would be good if there is a -W* switch to turn such warnings off.
> Not just for the purposes of dejagnu libgomp testing, but say one
> might try to compile a program primarily say for XeonPhi or PTX offloading,
> but have HSA enabled to, but care primarily about the former two, etc.

All these warnings are in the -Whsa group and can be suppressed with
-Wno-hsa.

> 
> >   2) When HSA is not emitted for a function, libgomp runs a host
> >  fallback instead of it.  When the test queries
> >  omp_is_initial_device and asserts it returns false, the test
> >  fails.
> 
> Do you have examples of which tests fall into this category?

For example libgomp.c/examples-4/target-5.c.  HSA-runtime fails
"linking" the generated HSAIL because there is no function abort (we
really need to wait for HSA to implement traps to provide it) and so
the libgomp plugin decides to run the host fallback even when the
deja-gnu predicates would normally preclude it.  And the testcase
tests that omp_is_initial_device returns non-NULL.

> 
> In any case, it will be needed to also update the wiki page with details on
> how to build the HSA support in, what are the prerequisities etc.
> 

I have updated the gcc/README.hsa file on the branch with current
information.  The wiki page apparently requires a bit more editing
that I can accomplish today.  But I will add something similar there
there too.

Thanks,

Martin

[PATCH] Fix ICE with -fno-if-conversion (PR rtl-optimization/68730)

Hi!

It seems some passes in between the combiner and ira aren't prepared to
update dominance info.  It usually is not a problem, because already before
the combiner we call free_dominance_info.  But we now have a new i?86
stv pass that is injected after the combiner that computes dominators but
does not free them.

So, to fix ICE on the following testcase, we can either do what the patch
does, or could conditionalize both the calculate_dominance_info and
free_dominance_info in the convert_scalars_to_vector function (stv pass)
on the dominance info not being computed (like other places in gcc do),
or we could stick free_dominance_info into all passes that break the
dominators just in case it would be computed (out_of_cfglayout is one
example).

Bootstrapped/regtested on x86_64-linux and i686-linux, is this ok for trunk
(or some other variant is preferrable)?

2015-12-10  Jakub Jelinek  

PR rtl-optimization/68730
* config/i386/i386.c (convert_scalars_to_vector): Call
free_dominance_info at the end.

* gcc.dg/pr68730.c: New test.

--- gcc/config/i386/i386.c.jj   2015-12-09 14:39:02.0 +0100
+++ gcc/config/i386/i386.c  2015-12-10 12:15:59.517609392 +0100
@@ -3577,6 +3577,7 @@ convert_scalars_to_vector ()
   BITMAP_FREE (candidates);
   bitmap_obstack_release (NULL);
   df_process_deferred_rescans ();
+  free_dominance_info (CDI_DOMINATORS);
 
   /* Conversion means we may have 128bit register spills/fills
  which require aligned stack.  */
--- gcc/testsuite/gcc.dg/pr68730.c.jj   2015-12-10 12:22:07.330365019 +0100
+++ gcc/testsuite/gcc.dg/pr68730.c  2015-12-10 12:24:03.908702426 +0100
@@ -0,0 +1,51 @@
+/* PR rtl-optimization/68730 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-if-conversion" } */
+/* { dg-additional-options "-march=x86-64" { target { i?86-*-* x86_64-*-* } } 
} */
+
+int b, d, e;
+unsigned long long c = 4100543410106915;
+
+void
+foo (void)
+{
+  short f, g = 4 % c;
+  int h = c;
+  if (h)
+{
+  int i = ~c;
+  if (~c)
+   i = 25662;
+  f = g = i;
+  h = c - g + ~-f;
+  c = ~(c * h - f);
+}
+  f = g;
+  unsigned long long k = g || c;
+  short l = c ^ g ^ k;
+  if (g > 25662 || c == 74074520320 || !(g < 2))
+{
+  k = c;
+  l = g;
+  c = ~((k && c) + ~l);
+  f = ~(f * (c ^ k) | l);
+  if (c > k)
+   __builtin_printf ("%d\n", f);
+}
+  short m = -f;
+  unsigned long long n = c;
+  c = m * f | n % c;
+  if (n)
+__builtin_printf ("%d\n", f);
+  while (f < -31807)
+;
+  c = ~(n | c) | f;
+  if (n < c)
+__builtin_printf ("%lld\n", (long long) f);
+  for (; d;)
+for (; e;)
+  for (;;)
+   ;
+  c = h;
+  c = l % c;
+}

Jakub

Re: Request permission to delete gcc.dg/vect/costmodel/ppc/costmodel-fast-math-vect-pr29925.c

2015-12-10 Thread David Edelsohn

On Thu, Dec 10, 2015 at 2:23 PM, Bill Schmidt
 wrote:
> Hi,
>
> The subject test case has been failing as follows:
>
> FAIL: gcc.dg/vect/costmodel/ppc/costmodel-fast-math-vect-pr29925.c 
> scan-tree-dump-times vect "vectorization not profitable" 1
>
> The test has been failing since r223528, which is:
>
> 2015-05-22  Richard Biener  
>
> PR tree-optimization/65701
> * tree-vect-data-refs.c (vect_enhance_data_refs_alignment):
> Move peeling cost models into one place.  Peel for alignment
> for single loads only if an aligned load is cheaper than
> an unaligned load.
>
> Thus with that modification, gcc now vectorizes the loop that was
> previously deemed unprofitable to vectorize.  As a result, the test case
> no longer has any reason to exist, and I would like to delete it.
>
> Ok for trunk?
>
> Thanks,
> Bill
>
>
> [gcc/testsuite]
>
> 2015-12-10  Bill Schmidt  
>
> * gcc.dg/vect/costmodel/ppc/costmodel-fast-math-vect-pr29925.c:
> Delete.

Okay with me.

Thanks, David

Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation


On 12/03/2015 07:45 AM, Richard Biener wrote:



Ah, some EEMBC one.

Btw, the testcase that was added shows

if (xc < xm)
  {
xk = (unsigned char) (xc < xy ? xc : xy);
  }
else
 {
   xk = (unsigned char) (xm < xy ? xm : xy);
 }

which might be better handled by phiopt transforming it into
I don't think the included testcase is a particularly good one for this 
transformation -- I didn't see that the transformation made any 
significant difference on x86_64.  That why I asked for Ajit for more 
data on the benchmarking.





xk = MIN (xc, MIN (xm, xy))

phiopt1 sees (hooray to GENERIC folding)

   xc_26 = ~xr_21;
   xm_27 = ~xg_23;
   xy_28 = ~xb_25;
   if (xr_21 > xg_23)
 goto ;
   else
 goto ;

   :
   xk_29 = MIN_EXPR ;
   goto ;

   :
   xk_30 = MIN_EXPR ;

   :
   # xk_4 = PHI 

btw, see PR67438 for a similar testcase and the above pattern.
That may be elsewhere in BZ database as well.  I've seen stuff that 
looks awful close to that when going through the bug lists in prior 
releases.


jeff

Request permission to delete gcc.dg/vect/costmodel/ppc/costmodel-fast-math-vect-pr29925.c

2015-12-10 Thread Bill Schmidt

Hi,

The subject test case has been failing as follows:

FAIL: gcc.dg/vect/costmodel/ppc/costmodel-fast-math-vect-pr29925.c 
scan-tree-dump-times vect "vectorization not profitable" 1

The test has been failing since r223528, which is:

2015-05-22  Richard Biener  

PR tree-optimization/65701
* tree-vect-data-refs.c (vect_enhance_data_refs_alignment):
Move peeling cost models into one place.  Peel for alignment
for single loads only if an aligned load is cheaper than
an unaligned load.

Thus with that modification, gcc now vectorizes the loop that was
previously deemed unprofitable to vectorize.  As a result, the test case
no longer has any reason to exist, and I would like to delete it.

Ok for trunk?

Thanks,
Bill


[gcc/testsuite]

2015-12-10  Bill Schmidt  

* gcc.dg/vect/costmodel/ppc/costmodel-fast-math-vect-pr29925.c:
Delete.

Re: [testsuite][ARM target attributes] Fix effective_target tests

2015-12-10 Thread Christophe Lyon

On 10 December 2015 at 14:14, Kyrill Tkachov  wrote:
>
> On 10/12/15 13:04, Christophe Lyon wrote:
>>
>> On 10 December 2015 at 13:30, Kyrill Tkachov 
>> wrote:
>>>
>>> Hi Christophe,
>>>
>>>
>>> On 08/12/15 11:18, Christophe Lyon wrote:

 On 8 December 2015 at 11:50, Kyrill Tkachov 
 wrote:
>
> Hi Christophe,
>
>
> On 27/11/15 13:00, Christophe Lyon wrote:
>>
>> Hi,
>>
>> After the recent commits from Christian adding target attributes
>> support for ARM FPU settings,  I've noticed that some of the tests
>> were failing because of incorrect assumptions wrt to the default
>> cpu/fpu/float-abi of the compiler.
>>
>> This patch fixes the problems I've noticed in the following way:
>> - do not force -mfloat-abi=softfp in dg-options, to avoid conflicts
>> when gcc is configured --with-float=hard
>>
>> - change arm_vfp_ok such that it tries several -mfpu/-mfloat-abi
>> flags, checks that __ARM_FP is defined and __ARM_NEON_FP is not
>> defined
>>
>> - introduce arm_fp_ok, which is similar but does not enforce fpu
>> setting
>>
>> - add a new effective_target: arm_crypto_pragma_ok to check that
>> setting this fpu via a pragma is actually supported by the current
>> "multilib". This is different from checking the command-line option
>> because the pragma might conflict with the command-line options in
>> use.
>>
>> The updates in the testcases are as follows:
>> - attr-crypto.c, we have to make sure that the defaut fpu does not
>> conflict with the one forced by pragma. That's why I use the arm_vfp
>> options/effective_target. This is needed if gcc has been configured
>> --with-fpu=neon-fp16, as the pragma fpu=crypto-neon-fp-armv8 would
>> conflict.
>>
>> - attr-neon-builtin-fail.c: use arm_fp to force the appropriate
>> float-abi setting. Enforcing fpu is not needed here.
>>
>> - attr-neon-fp16.c: similar, I also removed arm_neon_ok since it was
>> not necessary to make the test pass in my testing. On second thought,
>> I'm wondering whether I should leave it and make the test unsupported
>> in more cases (such as when forcing -march=armv5t, although it does
>> pass with this patch)
>>
>> - attr-neon2.c: use arm_vfp to force the appropriate float-abi
>> setting. Enforcing mfpu=vfp is needed to avoid conflict with the
>> pragma target fpu=neon (for instance if the toolchain default is
>> neon-fp16)
>>
>> - attr-neon3.c: similar
>>
>> Tested on a variety of configurations, see:
>>
>>
>>
>> http://people.linaro.org/~christophe.lyon/cross-validation/gcc-test-patches/230929-target-attr/report-build-info.html
>>
>> Note that the regressions reported fall into 3 categories:
>> - when forcing march=armv5t: tests are now unsupported because I
>> modified arm_crypto_ok to require arm_v8_neon_ok instead of arm32.
>>
>> - the warning reported by attr-neon-builtin-fail.c moved from line 12
>> to 14 and is thus seen as a regression + one improvement
>>
>> - finally, attr-neon-fp16.c causes an ICE on armeb compilers, for
>> which I need to post a bugzilla.
>>
>>
>> TBH, I'm a bit concerned by the complexity of all these multilib-like
>> conditions. I'm confident that I'm still missing some combinations :-)
>>
>> And with new target attributes coming, new architectures etc... all
>> this logic is likely to become even more complex.
>>
>> That being said, OK for trunk?
>
>
> I'd like us to clean up and reorganise the gcc.target/arm testsuite
> at some point to make it more robust with respect to the tons of
> multilib
> options and configurations we can have for arm. It's becoming very
> frustrating
> to test feature-specific stuff :(
>
> This is ok in the meantime.
> Sorry for the delay.
>
 Thanks for the approval, glad to see I'm not the only one willing to see
 improvements in this area :)

 Committed as r231403.
>>>
>>>
>>> With this patch we're seeing legitimate PASSes go to NA.
>>> For example:
>>>
>>> gcc.target/arm/vfp-1.c
>>> gcc.target/arm/vfp-ldmdbs.c
>>> and other vfp tests in arm.exp.
>>> This is, for example, for the
>>> variant-mthumb/-march=armv8-a/-mfpu=crypto-neon-fp-armv8/-mfloat-abi=hard
>>>
>> Hmm I'm attempting to cover such a configuration in my matrix of
>> validations:
>>
>> http://people.linaro.org/~christophe.lyon/cross-validation/gcc/trunk/231403/report-build-info.html
>>
>> The difference is that I use similar flags at GCC configure time, while
>> you
>> override them when running the testsuite:
>> --target arm-none-linux-gnueabihf --with-mode=thum --with-cpu=cortex-a57
>> --with-fpu=crypto-neon-fp-armv8
>>
>> I this case, I do not see the regressions.

[PTX] reorder function calling hooks

2015-12-10 Thread Nathan Sidwell

This patch reorders the function calling hooks so they come before 
write_one_arg.  There's no change to their functionality, but it will allow them 
to be called from write_one_arg and write_result, rather than have those 
functions continue to independently perform essentially the same calculation. 
Also, I split write_one_arg into a write_one_arg helper, (which does write one 
arg), and a write_arg  main entry point (which writes one or more PTX args).


nathan
2015-12-10  Nathan Sidwell  

	* config/nvptx/nvptx.c (nvptx_function_arg,
	nvptx_function_incoming_arg, nvptx_function_arg_advance,
	nvptx_strict_argument_naming, nvptx_function_arg_boundary,
	nvptx_libcall_value, nvptx_function_value,
	nvptx_function_value_regno_p, nvptx_pass_by_reference,
	nvptx_return_in_memory, nvptx_promote_function_mode,
	nvptx_static_chain): Move earlier.
	(write_one_arg): Break out as helper fn for ...
	(write_arg): ... this new function.  Adjust all callers.

Index: config/nvptx/nvptx.c
===
--- config/nvptx/nvptx.c	(revision 231542)
+++ config/nvptx/nvptx.c	(working copy)
@@ -389,69 +389,270 @@ arg_promotion (machine_mode mode)
   return mode;
 }
 
-/* Process function parameter TYPE, either emitting in a prototype
-   argument, or as a copy a in a function prologue.  ARGNO is the
-   index of this argument in the PTX function.  FOR_REG is negative,
-   if we're emitting the PTX prototype.  It is zero if we're copying
-   to an argument register and it is greater than zero if we're
-   copying to a specific hard register.  PROTOTYPED is true, if this
-   is a prototyped function, rather than an old-style C declaration.
+/* Implement TARGET_FUNCTION_ARG.  */
 
-   The behaviour here must match the regular GCC function parameter
-   marshalling machinery.  */
+static rtx
+nvptx_function_arg (cumulative_args_t, machine_mode mode,
+		const_tree, bool named)
+{
+  if (mode == VOIDmode)
+return NULL_RTX;
 
-static int
-write_one_arg (std::stringstream , int for_reg, int argno,
-	   tree type, bool prototyped)
+  if (named)
+return gen_reg_rtx (mode);
+  return NULL_RTX;
+}
+
+/* Implement TARGET_FUNCTION_INCOMING_ARG.  */
+
+static rtx
+nvptx_function_incoming_arg (cumulative_args_t cum_v, machine_mode mode,
+			 const_tree, bool named)
 {
-  machine_mode mode = TYPE_MODE (type);
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+  if (mode == VOIDmode)
+return NULL_RTX;
 
-  if (!PASS_IN_REG_P (mode, type))
-mode = Pmode;
+  if (!named)
+return NULL_RTX;
 
-  machine_mode split = maybe_split_mode (mode);
-  if (split != VOIDmode)
+  /* No need to deal with split modes here, the only case that can
+ happen is complex modes and those are dealt with by
+ TARGET_SPLIT_COMPLEX_ARG.  */
+  return gen_rtx_UNSPEC (mode,
+			 gen_rtvec (1, GEN_INT (cum->count)),
+			 UNSPEC_ARG_REG);
+}
+
+/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */
+
+static void
+nvptx_function_arg_advance (cumulative_args_t cum_v,
+			machine_mode ARG_UNUSED (mode),
+			const_tree ARG_UNUSED (type),
+			bool ARG_UNUSED (named))
+{
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+  cum->count++;
+}
+
+/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
+
+   For nvptx, we know how to handle functions declared as stdarg: by
+   passing an extra pointer to the unnamed arguments.  However, the
+   Fortran frontend can produce a different situation, where a
+   function pointer is declared with no arguments, but the actual
+   function and calls to it take more arguments.  In that case, we
+   want to ensure the call matches the definition of the function.  */
+
+static bool
+nvptx_strict_argument_naming (cumulative_args_t cum_v)
+{
+  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
+  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
+}
+
+/* Implement TARGET_FUNCTION_ARG_BOUNDARY.  */
+
+static unsigned int
+nvptx_function_arg_boundary (machine_mode mode, const_tree type)
+{
+  unsigned int boundary = type ? TYPE_ALIGN (type) : GET_MODE_BITSIZE (mode);
+
+  if (boundary > BITS_PER_WORD)
+return 2 * BITS_PER_WORD;
+
+  if (mode == BLKmode)
 {
-  mode = split;
-  argno = write_one_arg (s, for_reg, argno,
-			 TREE_TYPE (type), prototyped);
+  HOST_WIDE_INT size = int_size_in_bytes (type);
+  if (size > 4)
+return 2 * BITS_PER_WORD;
+  if (boundary < BITS_PER_WORD)
+{
+  if (size >= 3)
+return BITS_PER_WORD;
+  if (size >= 2)
+return 2 * BITS_PER_UNIT;
+}
 }
+  return boundary;
+}
 
-  if (!prototyped && !AGGREGATE_TYPE_P (type))
+/* Implement TARGET_LIBCALL_VALUE.  */
+
+static rtx
+nvptx_libcall_value (machine_mode mode, const_rtx)
+{
+  if (cfun->machine->start_call == NULL_RTX)
+/* Pretend to return in a hard reg for early uses before pseudos can be
+   generated.  */
+return gen_rtx_REG (mode,

Re: [Patch,tree-optimization]: Add new path Splitting pass on tree ssa representation


On 12/03/2015 07:38 AM, Richard Biener wrote:

This pass is now enabled by default with -Os but has no limits on the amount of
stmts it copies.
The more statements it copies, the more likely it is that the path 
spitting will turn out to be useful!  It's counter-intuitive.


The primary benefit AFAICT with path splitting is that it exposes 
additional CSE, DCE, etc opportunities.


IIRC  Ajit posited that it could help with live/conflict analysis, I 
never saw that, and with the changes to push splitting deeper into the 
pipeline I'd further life/conflict analysis since that work also 
involved preserving the single latch property.




 It also will make all loops with this shape have at least two

exits (if the resulting loop will be disambiguated the inner loop will
have two exits).
Having more than one exit will disable almost all loop optimizations after it.
Hmmm, the updated code keeps the single latch property, but I'm pretty 
sure it won't keep a single exit policy.


To keep a single exit policy would require keeping an additional block 
around.  Each of the split paths would unconditionally transfer to this 
new block.  The new block would then either transfer to the latch block 
or out of the loop.





The pass itself documents the transform it does but does zero to motivate it.

What's the benefit of this pass (apart from disrupting further optimizations)?
It's essentially building superblocks in a special case to enable 
additional CSE, DCE and the like.


Unfortunately what is is missing is heuristics and de-duplication.  The 
former to drive cases when it's not useful and the latter to reduce 
codesize for any statements that did not participate in optimizations 
when they were duplicated.


The de-duplication is the "sink-statements-through-phi" problems, cross 
jumping, tail merging and the like class of problems.


It was only after I approved this code after twiddling it for Ajit that 
I came across Honza's tracer implementation, which may in fact be 
retargettable to these loops and do a better job.  I haven't 
experimented with that.






I can see a _single_ case where duplicating the latch will allow threading
one of the paths through the loop header to eliminate the original exit.  Then
disambiguation may create a nice nested loop out of this.  Of course that
is only profitable again if you know the remaining single exit of the inner
loop (exiting to the outer one) is executed infrequently (thus the inner loop
actually loops).

It wasn't ever about threading.



But no checks other than on the CFG shape exist (oh, it checks it will
at _least_ copy two stmts!).
Again, the more statements it copies the more likely it is to be 
profitable.  Think superblocks to expose CSE, DCE and the like.




Given the profitability constraints above (well, correct me if I am
wrong on these)
it looks like the whole transform should be done within the FSM threading
code which might be able to compute whether there will be an inner loop
with a single exit only.
While it shares some concepts with jump threading, I don't think the 
transformation belongs in jump threading.




I'm inclined to request the pass to be removed again or at least disabled by
default.
I wouldn't lose any sleep if we disabled by default or removed, 
particularly if we can repurpose Honza's code.  In fact, I might 
strongly support the former until we hear back from Ajit on performance 
data.


I also keep coming back to Click's paper on code motion -- in that 
context, copying statements would be a way to break dependencies and 
give the global code motion algorithm more freedom.  The advantage of 
doing it in a framework like Click's is it's got a built-in sinking step.





What closed source benchmark was this transform invented for?
I think it was EEMBC or Coremark.  Ajit should know for sure.  I was 
actually still hoping to see benchmark results from Ajit to confirm the 
new placement in the pipeline didn't negate all the benefits he saw.


jeff

Re: Free up bits in DECLs and TYPEs


On 12/10/2015 04:04 PM, Michael Matz wrote:

This isn't stage 3 material really, OTOH fairly low risk.  Anyway, okay
for trunk now or once stage 1 opens?


This is cool and we want it, but not now. Ok for stage 1, with the 
formatting problems quoted below fixed.



Bernd


+#define TYPE_ALIGN(NODE) \
+(TYPE_CHECK (NODE)->type_common.align \
+   ? ((unsigned)1) << ((NODE)->type_common.align - 1) \
+   : 0)



+#define DECL_ALIGN(NODE) \
+(DECL_COMMON_CHECK (NODE)->decl_common.align \
+   ? ((unsigned)1) << ((NODE)->decl_common.align - 1) \
+   : 0)



  #ifdef BIGGEST_FIELD_ALIGNMENT
- DECL_ALIGN (decl)
-   = MIN (DECL_ALIGN (decl), (unsigned) BIGGEST_FIELD_ALIGNMENT);
+ SET_DECL_ALIGN (decl,
+ MIN (DECL_ALIGN (decl), (unsigned) BIGGEST_FIELD_ALIGNMENT));



+  SET_DECL_ALIGN (fndecl,
+FUNCTION_BOUNDARY_P (TREE_TARGET_OPTION (callee_tree)->x_target_flags));

Re: [Fortran, Patch] Memory sync after coarray image control statements and assignment

2015-12-10 Thread Alessandro Fanfarillo

2015-12-09 23:16 GMT+01:00 Tobias Burnus :
> Thanks. Committed as r231476.

Thanks.

>
> Do we need to do anything about GCC 5 or is this only a GCC 6 issue?
>

Yes, the patch should be applied to GCC 5 too.

> That can be changed: Simply fill out the form and list me (burnus (at]
> gcc.gnu.org) as sponsor: https://sourceware.org/cgi-bin/pdw/ps_form.cgi –
> and see https://gcc.gnu.org/svnwrite.html

Done. Thanks.

>
> Tobias

[PATCH] Fix PR68721


This fixes a wrong-code case after my IPA split fix (with PR68672 still
unfixed).

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2015-12-10  Richard Biener  

PR ipa/68721
* ipa-split.c (split_function): Record return value properly
when the split part doesn't set it.

* gcc.dg/torture/pr68721.c: New testcase.

Index: gcc/ipa-split.c
===
--- gcc/ipa-split.c (revision 231396)
+++ gcc/ipa-split.c (working copy)
@@ -1281,7 +1281,7 @@ split_function (basic_block return_bb, s
  to return void instead of just outputting function with undefined return
  value.  For structures this affects quality of codegen.  */
   else if (!split_point->split_part_set_retval
-   && find_retval (return_bb))
+   && (retval = find_retval (return_bb)))
 {
   bool redirected = true;
   basic_block new_return_bb = create_basic_block (NULL, 0, return_bb);
@@ -1305,6 +1305,7 @@ split_function (basic_block return_bb, s
   e->count = new_return_bb->count;
   add_bb_to_loop (new_return_bb, current_loops->tree_root);
   bitmap_set_bit (split_point->split_bbs, new_return_bb->index);
+  retbnd = find_retbnd (return_bb);
 }
   /* When we pass around the value, use existing return block.  */
   else
Index: gcc/testsuite/gcc.dg/torture/pr68721.c
===
--- gcc/testsuite/gcc.dg/torture/pr68721.c  (revision 0)
+++ gcc/testsuite/gcc.dg/torture/pr68721.c  (working copy)
@@ -0,0 +1,31 @@
+/* { dg-do run } */
+
+extern void abort (void);
+
+int a, b, c, *d, **e = 
+
+int *
+fn1 ()
+{
+  for (;;)
+{
+  for (; a;)
+   if (b)
+ abort ();
+  break;
+}
+  for (; c;)
+;
+  return 
+}
+
+int
+main ()
+{
+  *e = fn1 ();
+
+  if (!d)
+abort ();
+
+  return 0;
+}

Re: [PATCH, ARM] PR68674 Fix LTO support for neon builtins and error catching



On 10/12/15 09:26, Christian Bruel wrote:

Hi Kyrill,

On 12/09/2015 06:32 PM, Kyrill Tkachov wrote:

Hi Christian,

On 08/12/15 12:53, Christian Bruel wrote:

Hi,

The order of the NEON builtins construction has led to complications since the 
attribute target support. This was not a problem when driven from the command 
line, but was causing various issues when the builtins was mixed between fpu
configurations or when used with LTO.

Firstly the builtin functions was not initialized before the parsing of 
functions, leading to wrong type initializations.

Then error catching code when a builtin was used without the proper fpu flags 
was incomprehensible for the user, for instance

#include "arm_neon.h"

int8x8_t a, b;
int16x8_t e;

void
main()
{
   e = (int16x8_t)__builtin_neon_vaddlsv8qi (a, b);
}

compiled with default options (without -mfpu=neon -mfloat-abi=hard) gave pages 
of

/arm-none-eabi/6.0.0/include/arm_neon.h:39:9: error: unknown type name 
'__simd64_int8_t'
  typedef __simd64_int8_t int8x8_t;
...
...
arm_neon.h:4724:3: error: can't convert a vector of type 'poly64x2_t {aka 
__vector(4) int}' to type 'int' which has different size
return (poly64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) 
__b, __c);
^~
...
... and one for each arm_neon.h lines..

by postponing the check into arm_expand_builtin, we now emit something more 
useful:

testo.c: In function 'main':
testo.c:9:7: error: '__builtin_neon_vaddlsv8qi' neon builtin is not supported 
in this configuration.
e = (int16x8_t)__builtin_neon_vaddlsv8qi (a, b);
^~~

One small side effect to note: The total memory allocated is 370k bigger when 
neon is not used, so this support will have a follow-up to make their 
initialization lazy. But I'd like first to stabilize the stuff for stage3 (or 
get it
pre-approved if the memory is an issue)

tested without new failures with {,-mfpu=vfp,-mfpu=neon}{,-march=armv7-a\}
(a few tests that was fail are now unsupported)



I agree, the vector types (re)initialisation is a tricky part.
I've seen similar issues in the aarch64 work for target attributes

   bool
   arm_vector_mode_supported_p (machine_mode mode)
   {
-  /* Neon also supports V2SImode, etc. listed in the clause below.  */
-  if (TARGET_NEON && (mode == V2SFmode || mode == V4SImode || mode == V8HImode
+  if (mode == V2SFmode || mode == V4SImode || mode == V8HImode
 || mode == V4HFmode || mode == V16QImode || mode == V4SFmode
-  || mode == V2DImode || mode == V8HFmode))
-return true;
-
-  if ((TARGET_NEON || TARGET_IWMMXT)
-  && ((mode == V2SImode)
-  || (mode == V4HImode)
-  || (mode == V8QImode)))
+  || mode == V2DImode || mode == V8HFmode
+  || mode == V2SImode || mode == V4HImode || mode == V8QImode)
   return true;


So this allows vector modes unconditionally for all targets/fpu configurations?
I was tempted to do that in aarch64 when I was encountering similar issues.
In the end what worked for me was re-laying out the vector types in 
SET_CURRENT_FUNCTION
if necessary (https://gcc.gnu.org/ml/gcc-patches/2015-08/msg01084.html)


yes my assumption was that arm_init_neon_builtins () is now called for all targets, since the check is done at expand time and that the builtins need to be known by lto, with the vector type initialization, before they are expanded. 
However at that time, lto streaming-in have not yet processed the attributes and TARGET_NEON is not set for the function.


I had a look at your re-layout, but I'm not sure. it feels like a hack. I think this should be solved first place during the builtin construction. Also set_current_function is too late, builtin_expand that will explode because of the 
unknown modes.


But raise the point. In fact I was not really happy with this 
arm_vector_mode_supported_p neither as I was not sure about other contexts it 
can be called from and I cannot clearly claim that this change is always 
correct.



So the main usage of targetm.vector_mode_supported_p is in stor-layout.c and 
vector_type_mode in particular seems
to have a relevant comment:
 /* Vector types need to re-check the target flags each time we report
the machine mode.  We need to do this because attribute target can
change the result of vector_mode_supported_p and have_regs_of_mode
on a per-function basis.  Thus the TYPE_MODE of a VECTOR_TYPE can
change on a per-function basis.  */

I think that implies that it expects targetm.vector_mode_supported_p to reject 
vector modes in
contexts that don't support NEON...


I'd like to think about other way to set the vector modes from 
arm_init_neon_builtins before the target flags are known. I'm thinking about 
the lazy initialization at expand time, or using a contextual boolean flags. 
how does that sound ?



Laying out the vector types during arm_init_neon_builtins sounds more promising 
to me.
Changing layout of types during expand is risky, from

Re: [PATCH] Fix up fold_ctor_reference and fully_constant_vn_reference_p (PR tree-optimization/68785)

On Wed, 9 Dec 2015, Jakub Jelinek wrote:

> Hi!
> 
> On a testcase like below which would trigger UB at runtime we trigger
> UB in the compiler, by reading uninitialized bytes.
> 
> The VCE folding for which native_{encode,interpret}_expr has been originally
> written passes the length from the first one to the second one, so that
> the latter can return NULL_TREE (not fold) if not enough bytes in the buffer
> were filled.  I believe this is the shortest fix for this issue and makes
> the code consistent with what is used in VCE folding.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Ok.

Richard.

> 2015-12-09  Jakub Jelinek  
> 
>   PR tree-optimization/68785
>   * gimple-fold.c (fold_ctor_reference): Pass return value from
>   native_encode_expr to native_interpret_expr.
>   * tree-ssa-sccvn.c (fully_constant_vn_reference_p): Likewise.
> 
>   * gcc.dg/pr68785.c: New test.
> 
> --- gcc/gimple-fold.c.jj  2015-11-24 11:43:35.0 +0100
> +++ gcc/gimple-fold.c 2015-12-09 10:48:06.824975709 +0100
> @@ -5495,9 +5495,10 @@ fold_ctor_reference (tree type, tree cto
>&& size <= MAX_BITSIZE_MODE_ANY_MODE)
>  {
>unsigned char buf[MAX_BITSIZE_MODE_ANY_MODE / BITS_PER_UNIT];
> -  if (native_encode_expr (ctor, buf, size / BITS_PER_UNIT,
> -   offset / BITS_PER_UNIT) > 0)
> - return native_interpret_expr (type, buf, size / BITS_PER_UNIT);
> +  int len = native_encode_expr (ctor, buf, size / BITS_PER_UNIT,
> + offset / BITS_PER_UNIT);
> +  if (len > 0)
> + return native_interpret_expr (type, buf, len);
>  }
>if (TREE_CODE (ctor) == CONSTRUCTOR)
>  {
> --- gcc/tree-ssa-sccvn.c.jj   2015-12-04 17:19:12.0 +0100
> +++ gcc/tree-ssa-sccvn.c  2015-12-09 10:50:30.329960789 +0100
> @@ -1370,8 +1370,9 @@ fully_constant_vn_reference_p (vn_refere
> else
>   {
> unsigned char buf[MAX_BITSIZE_MODE_ANY_MODE / BITS_PER_UNIT];
> -   if (native_encode_expr (ctor, buf, size, off) > 0)
> - return native_interpret_expr (ref->type, buf, size);
> +   int len = native_encode_expr (ctor, buf, size, off);
> +   if (len > 0)
> + return native_interpret_expr (ref->type, buf, len);
>   }
>   }
>  }
> --- gcc/testsuite/gcc.dg/pr68785.c.jj 2015-12-09 10:52:00.232698487 +0100
> +++ gcc/testsuite/gcc.dg/pr68785.c2015-12-09 10:50:54.0 +0100
> @@ -0,0 +1,9 @@
> +/* PR tree-optimization/68785 */
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +int
> +foo (void)
> +{
> +  return *(int *) "";
> +}
> 
>   Jakub
> 
> 

-- 
Richard Biener 
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 
21284 (AG Nuernberg)

[committed, obvious] Remove invalid assert in find_func_aliases_for_builtin_call

2015-12-10 Thread Tom de Vries

[ was : Re: [PATCH, PR68716] Fix GOMP/GOACC_parallel handling in 
find_func_clobbers ]


On 09/12/15 11:01, Tom de Vries wrote:

[ was: Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta ]

On 30/11/15 13:11, Tom de Vries wrote:

On 30/11/15 10:16, Richard Biener wrote:

On Mon, 30 Nov 2015, Tom de Vries wrote:


Hi,

this patch fixes PR46032.

It handles a call:
...
   __builtin_GOMP_parallel (fn, data, num_threads, flags)
...
as:
...
   fn (data)
...
in ipa-pta.

This improves ipa-pta alias analysis in the parallelized function fn,
and
allows vectorization in the testcase without a runtime alias test.

Bootstrapped and reg-tested on x86_64.

OK for stage3 trunk?


+ /* Assign the passed argument to the appropriate incoming
+parameter of the function.  */
+ struct constraint_expr lhs ;
+ lhs = get_function_part_constraint (fi, fi_parm_base + 0);
+ auto_vec rhsc;
+ struct constraint_expr *rhsp;
+ get_constraint_for_rhs (arg, );
+ while (rhsc.length () != 0)
+   {
+ rhsp =  ();
+ process_constraint (new_constraint (lhs, *rhsp));
+ rhsc.pop ();
+   }

please use style used elsewhere with

  FOR_EACH_VEC_ELT (rhsc, j, rhsp)
process_constraint (new_constraint (lhs, *rhsp));
  rhsc.truncate (0);



That code was copied from find_func_aliases_for_call.
I've factored out the bit that I copied as
find_func_aliases_for_call_arg, and fixed the style there (and dropped
'rhsc.truncate (0)' since AFAIU it's redundant at the end of a function).


+ /* Parameter passed by value is used.  */
+ lhs = get_function_part_constraint (fi, fi_uses);
+ struct constraint_expr *rhsp;
+ get_constraint_for_address_of (arg, );

This isn't correct - you want to use get_constraint_for (arg, ).
After all rhs is already an ADDR_EXPR.



Can we add an assert somewhere to detect this incorrect usage?


+ FOR_EACH_VEC_ELT (rhsc, j, rhsp)
+   process_constraint (new_constraint (lhs, *rhsp));
+ rhsc.truncate (0);
+
+ /* The caller clobbers what the callee does.  */
+ lhs = get_function_part_constraint (fi, fi_clobbers);
+ rhs = get_function_part_constraint (cfi, fi_clobbers);
+ process_constraint (new_constraint (lhs, rhs));
+
+ /* The caller uses what the callee does.  */
+ lhs = get_function_part_constraint (fi, fi_uses);
+ rhs = get_function_part_constraint (cfi, fi_uses);
+ process_constraint (new_constraint (lhs, rhs));

I don't see why you need those.  The solver should compute these
in even better precision (context sensitive on the call side).

The same is true for the function parameter.  That is, the only
needed part of the patch should be that making sure we see
the "direct" call and assign parameters correctly.



Dropped this bit.


Dropping the find_func_clobbers bit (in particular, the part copying the
clobbers) caused PR68716.

Basically the find_func_clobbers bit tries to emulate the generic
handling of calls in find_func_clobbers, but pretends the call is
   fn (data)
when handling
   __builtin_GOMP_parallel (fn, data, num_threads, flags)
.

I used the same comments as in the generic call handling to make it
clear what part of the generic call handling is emulated.

Compared to the originally posted patch, the changes are:
- removed 'gcc_assert (TREE_CODE (arg) == ADDR_EXPR)'. Ran into a case
   where arg is NULL.


And for that same reason, I've removed the same assert from 
find_func_aliases_for_builtin_call.


Committed to trunk as obvious.

Thanks,
- Tom
Remove invalid assert in find_func_aliases_for_builtin_call

2015-12-10  Tom de Vries  

	* tree-ssa-structalias.c (find_func_aliases_for_builtin_call): Remove
	invalid assert.

---
 gcc/tree-ssa-structalias.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c
index 15e351e..c350862 100644
--- a/gcc/tree-ssa-structalias.c
+++ b/gcc/tree-ssa-structalias.c
@@ -4533,7 +4533,6 @@ find_func_aliases_for_builtin_call (struct function *fn, gcall *t)
 	  gcc_assert (TREE_CODE (fnarg) == ADDR_EXPR);
 	  tree fndecl = TREE_OPERAND (fnarg, 0);
 	  tree arg = gimple_call_arg (t, argpos);
-	  gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
 
 	  varinfo_t fi = get_vi_for_tree (fndecl);
 	  find_func_aliases_for_call_arg (fi, 0, arg);

[PATCH] Fix wrong-code with IPA PTA


This fixes a missing constraint for by-reference DECL_RESULT in
nonlocal_p mode.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2015-12-10  Richard Biener  

* tree-ssa-structalias.c (create_function_info_for): Add missing
constraint from nonlocal for DECL_RESULT.

Index: gcc/tree-ssa-structalias.c
===
--- gcc/tree-ssa-structalias.c  (revision 231494)
+++ gcc/tree-ssa-structalias.c  (working copy)
@@ -5631,6 +5630,11 @@ create_function_info_for (tree decl, con
   if (DECL_RESULT (decl))
insert_vi_for_tree (DECL_RESULT (decl), resultvi);
 
+  if (nonlocal_p
+ && DECL_RESULT (decl)
+ && DECL_BY_REFERENCE (DECL_RESULT (decl)))
+   make_constraint_from (resultvi, nonlocal_id);
+
   gcc_assert (prev_vi->offset < resultvi->offset);
   prev_vi->next = resultvi->id;
   prev_vi = resultvi;

[PATCH] Make tree if-conversion honor PARAM_ALLOW_STORE_DATA_RACES


The following makes us honor PARAM_ALLOW_STORE_DATA_RACES instead of
making -ftree-loop-if-convert-stores an if-conversion specific "alias"
of that.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

I've also verified it passes SPEC CPU 2006 on x86_64.

Richard.

2015-12-10  Richard Biener  

* tree-if-conv.c: Include params.h.
(ifcvt_memrefs_wont_trap): Use PARAM_ALLOW_STORE_DATA_RACES
instead of flag_tree_loop_if_convert_stores to guard cases
we'd introduce store-data-races.

Index: gcc/tree-if-conv.c
===
--- gcc/tree-if-conv.c  (revision 231493)
+++ gcc/tree-if-conv.c  (working copy)
@@ -112,6 +112,7 @@ along with GCC; see the file COPYING3.
 #include "tree-hash-traits.h"
 #include "varasm.h"
 #include "builtins.h"
+#include "params.h"
 
 /* List of basic blocks in if-conversion-suitable order.  */
 static basic_block *ifc_bbs;
@@ -714,7 +715,7 @@ ifcvt_memrefs_wont_trap (gimple *stmt, v
  to unconditionally.  */
   if (base_master_dr
  && DR_BASE_W_UNCONDITIONALLY (*base_master_dr))
-   return flag_tree_loop_if_convert_stores;
+   return PARAM_VALUE (PARAM_ALLOW_STORE_DATA_RACES);
   else
{
  /* or the base is know to be not readonly.  */
@@ -722,7 +723,7 @@ ifcvt_memrefs_wont_trap (gimple *stmt, v
  if (DECL_P (base_tree)
  && decl_binds_to_current_def_p (base_tree)
  && ! TREE_READONLY (base_tree))
-   return flag_tree_loop_if_convert_stores;
+   return PARAM_VALUE (PARAM_ALLOW_STORE_DATA_RACES);
}
 }
   return false;

[PATCH] Fix IPA PTA vs. PTA regressions


The following patch takes a stab at fixing the most obvious regression
of IPA PTA vs. PTA - dumbing down of pt_solution_includes_global
with

-   /* ???  This predicate is not correct for the IPA-PTA solution
-  as we do not properly distinguish between unit escape points
-  and global variables.  */
-   if (cfun->gimple_df->ipa_pta)
- return true;

by making the semantics of pt_solution->vars_contains_nonlocal
(or what composes the NONLOCAL solution) not depend on
the PTA mode (IPA or not).

Optimization-wise this fixes missing DSE and overly conservative
IPA pure-const discovery (only in its less important late invocation).

Note that using IPA PTA still causes optimization regressions because
it does not handle restrict nor the heapvar escape trick and its
escape logic is still overly conservative (probably the biggest issue).

Anyway - the following patch passed bootstrap with -fipa-pta and
regtest on x86_64-unknown-linux-gnu.

Richard.

2015-12-10  Richard Biener  

PR ipa/68331
* tree-ssa-structalias.c (set_uids_in_ptset): Add fndecl
parameter and make vars_contains_nonlocal properly have
function-scope semantics in IPA mode.
(find_what_var_points_to): Add fndecl parameter.
(find_what_p_points_to): Likewise.
(pt_solution_includes_global): Remove IPA PTA early out.
(compute_points_to_sets): Adjust.
(ipa_pta_execute): Likewise.  Clear final_solutions after
each function.

* gcc.dg/torture/ipa-pta-3.c: New testcase.
* g++.dg/ipa/ipa-pta-1.C: Likewise.

Index: gcc/tree-ssa-structalias.c
===
*** gcc/tree-ssa-structalias.c  (revision 231357)
--- gcc/tree-ssa-structalias.c  (working copy)
***
*** 159,167 
  
 The is_global_var bit which marks escape points is overly conservative
 in IPA mode.  Split it to is_escape_point and is_global_var - only
!externally visible globals are escape points in IPA mode.  This is
!also needed to fix the pt_solution_includes_global predicate
!(and thus ptr_deref_may_alias_global_p).
  
 The way we introduce DECL_PT_UID to avoid fixing up all points-to
 sets in the translation unit when we copy a DECL during inlining
--- 159,165 
  
 The is_global_var bit which marks escape points is overly conservative
 in IPA mode.  Split it to is_escape_point and is_global_var - only
!externally visible globals are escape points in IPA mode.
  
 The way we introduce DECL_PT_UID to avoid fixing up all points-to
 sets in the translation unit when we copy a DECL during inlining
***
*** 186,191 
--- 184,190 
 propagating it simply like the clobber / uses solutions.  The
 solution can go alongside the non-IPA espaced solution and be
 used to query which vars escape the unit through a function.
+This is also required to make the escaped-HEAP trick work in IPA mode.
  
 We never put function decls in points-to sets so we do not
 keep the set of called functions for indirect calls.
*** shared_bitmap_add (bitmap pt_vars)
*** 6101,6107 
  /* Set bits in INTO corresponding to the variable uids in solution set FROM.  
*/
  
  static void
! set_uids_in_ptset (bitmap into, bitmap from, struct pt_solution *pt)
  {
unsigned int i;
bitmap_iterator bi;
--- 6100,6107 
  /* Set bits in INTO corresponding to the variable uids in solution set FROM.  
*/
  
  static void
! set_uids_in_ptset (bitmap into, bitmap from, struct pt_solution *pt,
!  tree fndecl)
  {
unsigned int i;
bitmap_iterator bi;
*** set_uids_in_ptset (bitmap into, bitmap f
*** 6139,6145 
  /* Add the decl to the points-to set.  Note that the points-to
 set contains global variables.  */
  bitmap_set_bit (into, DECL_PT_UID (vi->decl));
! if (vi->is_global_var)
pt->vars_contains_nonlocal = true;
}
  }
--- 6139,6157 
  /* Add the decl to the points-to set.  Note that the points-to
 set contains global variables.  */
  bitmap_set_bit (into, DECL_PT_UID (vi->decl));
! if (vi->is_global_var
! /* In IPA mode the escaped_heap trick doesn't work as
!ESCAPED is escaped from the unit but
!pt_solution_includes_global needs to answer true for
!all variables not automatic within a function.
!For the same reason is_global_var is not the
!correct flag to track - local variables from other
!functions also need to be considered global.
!Conveniently all HEAP vars are not put in function
!scope.  */
! || (in_ipa_mode
! && fndecl
! && ! auto_var_in_fn_p (vi->decl, fndecl)))
pt->vars_contains_nonlocal = true;

Re: Splitting up gcc/omp-low.c?

On Wed, Dec 09, 2015 at 06:23:22PM +0100, Bernd Schmidt wrote:
> On 12/09/2015 05:24 PM, Thomas Schwinge wrote:
> >
> >In addition to that, how about we split up gcc/omp-low.c into several
> >files?  Would it make sense (I have not yet looked in detail) to do so
> >along the borders of the several passes defined therein?  Or, can you
> >tell already that there would be too many cross-references between the
> >several files to make this infeasible?
> 
> It would be nice to get rid of all the code duplication in that file. That
> alone could reduce the size by quite a bit, and hopefully make it easier to
> read.

What exact code duplication do you mean?

> I suspect a split along the ompexp/omplow boundary would be quite easy to
> achieve.

Yeah, that might be the possible splitting boundary (have omp-low.c,
omp-exp.c).

> >I'd suggest to do this shortly before GCC 6 is released, so that
> >backports from trunk to gcc-6-branch will be easy.  (I assume we don't
> >have to care for gcc-5-branch backports too much any longer.)
> 
> I'll declare myself agnostic as to whether such a change is appropriate for
> gcc-6 at this stage. I guess it kind of depends on the specifics.

Certainly.  On one side I'd say it is too late now in stage3, on the other
side when would be better time to do that, during stage1 people will have
more likely out of the tree branches with more changes (I'm aware we even
now have the HSA, OpenMP -> PTX and OpenACC branches).

So, if somebody wants to try that, we can see if the result would be
appropriate.

Jakub

Re: [PATCH] Fix ICE with -fno-if-conversion (PR rtl-optimization/68730)


On 12/10/2015 01:11 PM, Jakub Jelinek wrote:

Hi!

It seems some passes in between the combiner and ira aren't prepared to
update dominance info.  It usually is not a problem, because already before
the combiner we call free_dominance_info.  But we now have a new i?86
stv pass that is injected after the combiner that computes dominators but
does not free them.

So, to fix ICE on the following testcase, we can either do what the patch
does, or could conditionalize both the calculate_dominance_info and
free_dominance_info in the convert_scalars_to_vector function (stv pass)
on the dominance info not being computed (like other places in gcc do),
or we could stick free_dominance_info into all passes that break the
dominators just in case it would be computed (out_of_cfglayout is one
example).

Bootstrapped/regtested on x86_64-linux and i686-linux, is this ok for trunk
(or some other variant is preferrable)?

2015-12-10  Jakub Jelinek  

PR rtl-optimization/68730
* config/i386/i386.c (convert_scalars_to_vector): Call
free_dominance_info at the end.

* gcc.dg/pr68730.c: New test.
Any pass that mucks up the dominator tree ought to be wiping it clean. 
It's obviously better if wiping the dominator tree is conditional on the 
pass actually making transformations to the CFG that invalidate the 
stored information.


Similarly, any pass that needs the dominator tree ought to make sure 
it's around.  Note this is cheap if the prior pass hasn't wiped the 
dominator tree.


At least that's always been my understanding.

So ISTM this patch is the right thing to do in and of itself, though it 
may not be complete as there may be passes that aren't following the 
rules noted above.



jeff

Re: [PATCH, testsuite] Fix PR68629: attr-simd-3.c failure on arm-none-eabi targets


On 12/09/2015 02:56 AM, Thomas Preud'homme wrote:

c-c++-common/attr-simd-3.c fails to compile on arm-none-eabi targets due to 
-fcilkplus needing -pthread which is not available for those targets. This 
patch solves this issue by adding a condition to the cilkplus effective target 
that compiling with -fcilkplus succeeds and requires cilkplus as an effective 
target for attr-simd-3.c testcase.

ChangeLog entry is as follows:


*** gcc/testsuite/ChangeLog ***

2015-12-08  Thomas Preud'homme  

 PR testsuite/68629
 * lib/target-supports.exp (check_effective_target_cilkplus): Also
 check that compiling with -fcilkplus does not give an error.
 * c-c++-common/attr-simd-3.c: Require cilkplus effective target.

OK.

Note however, that the simd attribute is now independent of Cilk+.  So 
generally we shouldn't want/need -fcilkplus for uses of that attribute. 
 This case is somewhat special in that we're checking for something 
that's considered a syntax error for Cilk+, so it probably makes sense 
to keep the test as-is.


I do wonder if a complementary test where we try to apply that attribute 
to a function (without the vector attribute) and compile without the 
-fcilkplus option would be wise.  We ought to be throwing some kind of 
error in that situation and it'd be useful to verify that's the case.


jeff

RFA (hash-*): PATCH for c++/68309

2015-12-10 Thread Jason Merrill

The C++ front end uses a temporary hash table to remember 
specializations of local variables during template instantiations.  In a 
nested function such as a lambda or local class member function, we need 
to retain the elements from the enclosing function's 
local_specializations table; otherwise the testcase crashes because we 
don't find a local specialization for the non-captured use of 'args' in 
the decltype.


This patch addresses that by making a copy of the enclosing 
local_specializations table if it exists; to enable that I've added copy 
constructors to hash_table and hash_map.


Tested x86_64-pc-linux-gnu.  OK for trunk?
commit 2dfc74fa638b7255658f194c82308e8bf8ada9f9
Author: Jason Merrill 
Date:   Thu Dec 10 15:37:50 2015 -0500

	PR c++/68309
gcc/
	* hash-table.h: Add copy constructor.
	* hash-map.h: Add copy constructor.
gcc/cp/
	* pt.c (instantiate_decl): Copy local_specializations for nested
	function.

diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
index 0e087a6..5db3a32 100644
--- a/gcc/cp/pt.c
+++ b/gcc/cp/pt.c
@@ -21725,8 +21725,13 @@ instantiate_decl (tree d, int defer_ok,
 	 template from within the body of another.  */
   saved_local_specializations = local_specializations;
 
-  /* Set up the list of local specializations.  */
-  local_specializations = new hash_map;
+  /* Set up the list of local specializations, copying the current
+	 list if there is one.  */
+  if (local_specializations)
+	local_specializations
+	  = new hash_map (*local_specializations);
+  else
+	local_specializations = new hash_map;
 
   /* Set up context.  */
   if (DECL_OMP_DECLARE_REDUCTION_P (code_pattern)
diff --git a/gcc/hash-map.h b/gcc/hash-map.h
index b83708c..9b3d1e9 100644
--- a/gcc/hash-map.h
+++ b/gcc/hash-map.h
@@ -110,6 +110,11 @@ public:
 		 bool gather_mem_stats = true CXX_MEM_STAT_INFO)
 : m_table (n, ggc, gather_mem_stats, HASH_MAP_ORIGIN PASS_MEM_STAT) {}
 
+  hash_map (const hash_map , bool ggc = false,
+	bool gather_mem_stats = true CXX_MEM_STAT_INFO)
+: m_table (h.m_table, ggc, gather_mem_stats,
+	   HASH_MAP_ORIGIN PASS_MEM_STAT) {}
+
   /* Create a hash_map in ggc memory.  */
   static hash_map *create_ggc (size_t size, bool gather_mem_stats = true
 			   CXX_MEM_STAT_INFO)
diff --git a/gcc/hash-table.h b/gcc/hash-table.h
index 192be30..d172841 100644
--- a/gcc/hash-table.h
+++ b/gcc/hash-table.h
@@ -364,6 +364,10 @@ public:
   explicit hash_table (size_t, bool ggc = false, bool gather_mem_stats = true,
 		   mem_alloc_origin origin = HASH_TABLE_ORIGIN
 		   CXX_MEM_STAT_INFO);
+  hash_table (const hash_table &, bool ggc = false,
+	  bool gather_mem_stats = true,
+	  mem_alloc_origin origin = HASH_TABLE_ORIGIN
+	  CXX_MEM_STAT_INFO);
   ~hash_table ();
 
   /* Create a hash_table in gc memory.  */
@@ -580,6 +584,27 @@ hash_table::hash_table (size_t size, bool ggc, bool
 }
 
 template class Allocator>
+hash_table::hash_table (const hash_table , bool ggc,
+	   bool gather_mem_stats,
+	   mem_alloc_origin origin
+	   MEM_STAT_DECL) :
+m_n_elements (h.m_n_elements), m_n_deleted (h.m_n_deleted),
+m_searches (0), m_collisions (0),
+m_ggc (ggc), m_gather_mem_stats (gather_mem_stats)
+{
+  size_t size = h.m_size;
+
+  if (m_gather_mem_stats)
+hash_table_usage.register_descriptor (this, origin, ggc
+	  FINAL_PASS_MEM_STAT);
+
+  m_entries = alloc_entries (size PASS_MEM_STAT);
+  memcpy (m_entries, h.m_entries, size * sizeof (value_type));
+  m_size = size;
+  m_size_prime_index = h.m_size_prime_index;
+}
+
+template class Allocator>
 hash_table::~hash_table ()
 {
   for (size_t i = m_size - 1; i < m_size; i--)
diff --git a/gcc/testsuite/g++.dg/cpp0x/lambda/lambda-variadic3.C b/gcc/testsuite/g++.dg/cpp0x/lambda/lambda-variadic3.C
new file mode 100644
index 000..76b6b3f
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp0x/lambda/lambda-variadic3.C
@@ -0,0 +1,9 @@
+// PR c++/68309
+// { dg-do compile { target c++11 } }
+
+template  void f(Ts...);
+template  T g(T);
+template  void print(Ts... args) {
+  [&] { f(g(args)...); }();
+}
+int main() { print(5.2); }

Re: Free up bits in DECLs and TYPEs

2015-12-10 Thread DJ Delorie


I'm OK with the msp430 part :-)

Re: [PATCH] Fix phiopt ICE in Factor conversion in COND_EXPR (PR tree-optimization/66949)


On 12/08/2015 09:21 AM, Marek Polacek wrote:

The following is a conservative fix for this PR.  This is an ICE transpiring
in the new "Factor conversion in COND_EXPR" optimization added in r225722.

Before this optimization kicks in, we have
   :
   ...
   p1_32 = (short unsigned int) _20;

   :
   ...
   iftmp.0_18 = (short unsigned int) _20;

   :
   ...
   # iftmp.0_19 = PHI 

after factor_out_conditional_conversion does its work, we end up with those two
def stmts removed and instead of the PHI we'll have

   # _35 = PHI <_20(3), _20(2)>
   iftmp.0_19 = (short unsigned int) _35;

That itself looks like a fine optimization, but after 
factor_out_conditional_conversion
there's
  320   phis = phi_nodes (bb2);
  321   phi = single_non_singleton_phi_for_edges (phis, e1, e2);
  322   gcc_assert (phi);
and phis look like
   b.2_38 = PHI 
   _35 = PHI <_20(3), _20(2)>
so single_non_singleton_phi_for_edges returns NULL and the subsequent assert 
triggers.

Cute.


With this patch we won't ICE (and PRE should clean this up anyway), but I don't 
know,
maybe I should try harder to optimize even this problematical case (not sure 
how hard
it would be...)?
So if this kind of situation is created by the first phiopt, then DOM 
should fix it.  As you note PRE will catch it if the second phiopt pass 
creates this degenerate PHI.  If created by the last phiopt pass, then 
hopefully coalescing would save us.


Jeff

Re: [PATCH 6/6] [DJGPP] configure.ac: enable LTO


On 12/05/2015 10:25 AM, Andris Pavenis wrote:

Patch enables LTO support for DJGPP in top level configure.ac

Andris

2015-12-05 Andris Pavenis 

* configure.ac: enable LTO for *-*-msdosdjgpp
OK once prereqs have gone in.  Note you should to the autoconf dance to 
update the generated files.  Mention them in your ChangeLog as

* configure: Regenerated

jeff

Re: [PATCH 1/6] [DJGPP] libstdc++-v3/config/os/djgpp/error_constants.h: Update according to errno codes available for DJGPP


On 12/05/2015 11:36 AM, Andris Pavenis wrote:

On 12/05/2015 06:35 PM, Andris Pavenis wrote:

Included patch updates libstdc++-v3/config/os/djgpp/error_constants.h
according to defines available in DJGPP errno.h.

I'm reposting a patch as earlier post (Nov 15, 2015) have bug in
changelog entry

Andris

2015-12-05 Andris Pavenis 

* config/os/djgpp/error_constants.h: update according to DJGPP errno
macros.
I can't really judge this one.  Either DJ or Jon would need to some in 
on this.


Jeff

Re: [PATCH 2/6] [DJGPP] libgcc: Add djgpp to lists of i[34567]86-- and x86_64-- soft-fp targets.


On 12/05/2015 10:05 AM, Andris Pavenis wrote:

Included patch adds *-*-msdosdjgpp to lists of i[34567]86-*-* soft-fp
targets.

Andris

PS. Sending from different address as posts from other address seems not
to go through

2015-12-05 Andris Pavenis 

*  config.host: Add *-*-msdosdjgpp to lists of i[34567]86-*-* soft-fp
targets

OK.
jeff

Re: [PATCH 3/6] [DJGPP] libbacktrace/configure.ac: specify that DJGPP do not have mmap()


On 12/05/2015 10:14 AM, Andris Pavenis wrote:

Patch specifies that DJGPP do not have mmap() even when sys/mman.h is
available.
libbacktrace is perhaps currently unusable for DJGPP but otherwise
mmap() usability
misdetection breaks cross-compiler build for DJGPP target (for example
Linux hosted
cross-compiler)

Andris

2015-12-05 Andris Pavenis 

* configure.ac: Specify that DJGPP do not have mmap even when sys/mman.h
exists
* configure: Regenerate

OK.
jeff

[PATCH] Fix PR c++/21802 (two-stage name lookup fails for operators)

2015-12-10 Thread Patrick Palka

This patch fixes name-lookup of operators in template definitions whose
operands are non-dependent expressions, i.e. PR c++/21802 (and
incidentally 53223).

The approach that this patch takes is to detect when build_new_op()
returns a call to an overloaded function and to store a call to this
overload intothe template AST instead of storing the raw operator
(an operator would be erroneously subject to overload resolution during
instantiation).

The new function build_min_non_dep_op_overload is the workhorse of the
patch.  It reconstructs the CALL_EXPR that would have been built had an
explicit operator+, operator* etc call been used, i.e. had the overload
gone through finish_call_expr() / build_new_method_call() instead of
through build_new_op().  The parameter OVERLOAD of this new function is
probably not strictly necessary -- one can probably just look at the
CALL_EXPR_FN of the parameter NON_DEP to figure out the overload to use
-- but since the requisite plumbing from build_new_op() already existed
to conveniently get at the overload information I thought I might as
well use it.

I have also created a test case that hopefully exercises all the changes
that were made and to verify that these operator calls are being built
correctly.

Does this approach seem adequate?  Bootstrap and regtesting in progress
on x86_64, OK to commit if testing succeeds?

gcc/cp/ChangeLog:

PR c++/21802
PR c++/53223
* cp-tree.h (build_min_non_dep_op_overload): Declare.
* tree.c (build_min_non_dep_op_overload): Define.
* typeck.c (build_x_indirect_ref): Use
build_min_non_dep_op_overload when the given expression
has been resolved to an operator overload.
(build_x_binary_op): Likewise.
(build_x_array_ref): Likewise.
(build_x_unary_op): Likewise.
(build_x_compound_expr): Likewise.
(build_x_modify_expr): Likewise.

gcc/testsuite/ChangeLog:

PR c++/21802
PR c++/53223
* g++.dg/cpp0x/pr53223.C: New test.
* g++.dg/lookup/pr21802.C: New test.
* g++.dg/lookup/two-stage4.C: Remove XFAIL.
---
 gcc/cp/cp-tree.h |   1 +
 gcc/cp/tree.c|  64 
 gcc/cp/typeck.c  | 100 +---
 gcc/testsuite/g++.dg/cpp0x/pr53223.C |  35 
 gcc/testsuite/g++.dg/lookup/pr21802.C| 271 +++
 gcc/testsuite/g++.dg/lookup/two-stage4.C |   2 +-
 6 files changed, 453 insertions(+), 20 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp0x/pr53223.C
 create mode 100644 gcc/testsuite/g++.dg/lookup/pr21802.C

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 6190f4e..3487d77 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -6513,6 +6513,7 @@ extern tree build_min (enum 
tree_code, tree, ...);
 extern tree build_min_nt_loc   (location_t, enum tree_code,
 ...);
 extern tree build_min_non_dep  (enum tree_code, tree, ...);
+extern tree build_min_non_dep_op_overload  (enum tree_code, tree, tree, 
...);
 extern tree build_min_non_dep_call_vec (tree, tree, vec 
*);
 extern tree build_cplus_new(tree, tree, tsubst_flags_t);
 extern tree build_aggr_init_expr   (tree, tree);
diff --git a/gcc/cp/tree.c b/gcc/cp/tree.c
index 5dad0a7..2635736 100644
--- a/gcc/cp/tree.c
+++ b/gcc/cp/tree.c
@@ -2744,6 +2744,70 @@ build_min_non_dep_call_vec (tree non_dep, tree fn, 
vec *argvec)
   return convert_from_reference (t);
 }
 
+/* Similar to build_min_non_dep, but for expressions that have been resolved to
+   a call to an operator overload.  OP is the operator that has been
+   overloaded.  NON_DEP is the non-dependent expression that's been built,
+   which should be a CALL_EXPR or an INDIRECT_REF to a CALL_EXPR.  OVERLOAD is
+   the overload that NON_DEP is calling.  */
+
+tree
+build_min_non_dep_op_overload (enum tree_code op,
+  tree non_dep,
+  tree overload, ...)
+{
+  va_list p;
+  int nargs;
+  tree fn, call;
+  vec *args;
+
+  if (REFERENCE_REF_P (non_dep))
+non_dep = TREE_OPERAND (non_dep, 0);
+
+  nargs = call_expr_nargs (non_dep);
+
+  if (op == PREINCREMENT_EXPR
+  || op == PREDECREMENT_EXPR)
+gcc_assert (nargs == 1);
+  else if (op == MODOP_EXPR)
+gcc_assert (nargs == 2);
+  else
+gcc_assert (nargs == TREE_CODE_LENGTH (op));
+
+  args = make_tree_vector ();
+  va_start (p, overload);
+
+  if (TREE_CODE (TREE_TYPE (overload)) == FUNCTION_TYPE)
+{
+  fn = overload;
+  for (int i = 0; i < nargs; i++)
+   {
+ tree arg = va_arg (p, tree);
+ vec_safe_push (args, arg);
+   }
+}
+  else if (TREE_CODE (TREE_TYPE (overload)) == METHOD_TYPE)
+{
+  tree object = va_arg (p, tree);
+  tree binfo =

Re: [PATCH] gcc: read -fdebug-prefix-map OLD from environment (improved reproducibility)

2015-12-10 Thread Daniel Kahn Gillmor

On Thu 2015-12-10 12:36:18 -0500, Daniel Kahn Gillmor wrote:
> Work on the reproducible-builds project [0] has identified that build
> paths are one cause of output variation between builds.  This
> changeset allows users to avoid this variation when building C objects
> with debug symbols, while leaving the default behavior unchanged.

I've now opened a bugzilla issue about this as well, with patch
attached:

   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68848

   --dkg

Re: [Patch, Fortran] PR68815 - replace '%s' quotes by %< ... %>

2015-12-10 Thread Manuel López-Ibáñez


On 12/09/2015 03:53 PM, Tobias Burnus wrote:

In principle, %<%c%> and %<%d%> should be convertable to %qc and
%qd (as the code is more readable), but the current function
annotation prevent this, telling that the q flag is not valid for
%c and %d. As %< is fine, I didn't dig into it.


You need to edit the gcc_gfc_* variables in c-family/c-format.c. The correct 
way to do that does not seem to be documented anywhere. I only got the current 
support working after painful trial and error.


Joseph, Jakub, could you advise the Fortran devs on how to edit c-format.c to 
handle %qc and %qd as the C/C++ FEs do?


Thanks,

Manuel

Re: [PATCH 4/6] [DJGPP] config/i386/djgpp: Update definitions of signed types


On 12/05/2015 10:19 AM, Andris Pavenis wrote:

Included patch updates typedefs of integer types in
config/i386/djgpp-stdint.h.
Patch is similar but not identical as attached to

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=41557

Andris

2015-12-05 Andris Pavenis 

* config/i386/djgpp-stdint.h: update typedefs for integer types

I'm going to trust these are right.  OK for the trunk.

jeff

[v3 PATCH] PR libstdc++/68139

2015-12-10 Thread Ville Voutilainen

Tested on Linux-PPC64.

2015-12-11  Ville Voutilainen  

PR libstdc++/68139

/libstdc++-v3
* libsupc++/nested_exception.h (_S_rethrow): Use __std::addressof.

/testsuite
* 18_support/nested_exception/68139.cc: New.
diff --git a/libstdc++-v3/libsupc++/nested_exception.h 
b/libstdc++-v3/libsupc++/nested_exception.h
index a716f75..82b95df 100644
--- a/libstdc++-v3/libsupc++/nested_exception.h
+++ b/libstdc++-v3/libsupc++/nested_exception.h
@@ -37,6 +37,7 @@
 #else
 
 #include 
+#include 
 
 #if ATOMIC_INT_LOCK_FREE < 2
 #  error This platform does not support exception propagation.
@@ -142,7 +143,8 @@ namespace std
 {
   static void _S_rethrow(const _Tp& __t)
   {
-   if (auto __tp = dynamic_cast(&__t))
+   if (auto __tp =
+dynamic_cast(std::__addressof(__t)))
  __tp->rethrow_nested();
   }
 };
diff --git a/libstdc++-v3/testsuite/18_support/nested_exception/68139.cc 
b/libstdc++-v3/testsuite/18_support/nested_exception/68139.cc
new file mode 100644
index 000..551f931
--- /dev/null
+++ b/libstdc++-v3/testsuite/18_support/nested_exception/68139.cc
@@ -0,0 +1,31 @@
+// { dg-do compile }
+// { dg-options "-std=gnu++11" }
+// { dg-require-atomic-builtins "" }
+
+// Copyright (C) 2015 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// .
+
+#include 
+
+// libstdc++/68139
+struct C {
+virtual ~C(){}
+void operator&() const = delete;
+};
+
+int main() { std::rethrow_if_nested(C()); }
+

Re: [Patch, Fortran] PR68815 - replace '%s' quotes by %< ... %>

2015-12-10 Thread Joseph Myers

On Thu, 10 Dec 2015, Manuel López-Ibáñez wrote:

> On 12/09/2015 03:53 PM, Tobias Burnus wrote:
> > In principle, %<%c%> and %<%d%> should be convertable to %qc and
> > %qd (as the code is more readable), but the current function
> > annotation prevent this, telling that the q flag is not valid for
> > %c and %d. As %< is fine, I didn't dig into it.
> 
> You need to edit the gcc_gfc_* variables in c-family/c-format.c. The correct
> way to do that does not seem to be documented anywhere. I only got the current
> support working after painful trial and error.
> 
> Joseph, Jakub, could you advise the Fortran devs on how to edit c-format.c to
> handle %qc and %qd as the C/C++ FEs do?

Put "q" in the first flags string for those formats in gcc_gfc_char_table.

-- 
Joseph S. Myers
jos...@codesourcery.com

Re: [PATCH] gcc: read -fdebug-prefix-map OLD from environment (improved reproducibility)

2015-12-10 Thread Daniel Kahn Gillmor

On Thu 2015-12-10 18:59:33 -0500, Joseph Myers wrote:
> On Thu, 10 Dec 2015, Daniel Kahn Gillmor wrote:
>
>> Specifically, if the first character of the "old" argument is a
>> literal $, then gcc will treat it as an environment variable name, and
>> use the value of the env var for prefix mapping.
>
> I don't think a literal $ in option arguments is a good idea; it's far too 
> hard to pass through a sequence of shells and makefiles that you typically 
> get in recursive make.  You end up with things like 
> '-Wl,-rpath,'\''\\\$$\$$\\\$$\$$ORIGIN'\''/../' (part of a process for 
> using $ORIGIN when linking GDB) if you try.

yow, that's truly monstrous!

Is there a different symbol or string you'd be OK using instead for the
same approach?  What about looking for an "ENV:" prefix?

so something like:

 -fdebug-prefix-map=ENV:SOURCE_BUILD_DIR=/usr/src

wdyt?  I could rework the patch pretty easily if that seems acceptable.

--dkg

Transparent alias suport part 11: fix warning and error attributes with LTO

2015-12-10 Thread Jan Hubicka

Hi,
this patch makes lto-symtab to not merge decls where warning and error
attributes mismatch and finally clears up the invalid wanring compiling
testcase from PR 61886.

So it took only 11 patches and year and half to fix this beast...

I am now finished with merging the original transparent alias patchset.  I am
going to do some bigger scale testing with LTO decl and tree merging disabled
and also try to provoke some wrong code from PTA that ought to need an update
(to track global vars correctly) but for some reason nothing I tested so far
cares.  This does not affect the existing uses of warning/error attributes
in glibc, so I think the fix for PR ipa/61886 does not need to wait for that.

Bootstrapped/regtested x86_64-linux, will commit it after lto-bootstrap
converges.

Honza

PR ipa/61886
* lto-symtab.c (lto_symtab_merge_p): Avoid merging across different
values of error and warning attributes.
* gcc.dg/lto/pr61886_0.c: New testcase
Index: lto/lto-symtab.c
===
--- lto/lto-symtab.c(revision 231539)
+++ lto/lto-symtab.c(working copy)
@@ -511,19 +511,59 @@ static bool
 lto_symtab_merge_p (tree prevailing, tree decl)
 {
   if (TREE_CODE (prevailing) != TREE_CODE (decl))
-return false;
+{
+  if (symtab->dump_file)
+   fprintf (symtab->dump_file, "Not merging decls; "
+"TREE_CODE mismatch\n");
+  return false;
+}
   if (TREE_CODE (prevailing) == FUNCTION_DECL)
 {
   if (DECL_BUILT_IN (prevailing) != DECL_BUILT_IN (decl))
-   return false;
+   {
+  if (symtab->dump_file)
+   fprintf (symtab->dump_file, "Not merging decls; "
+"DECL_BUILT_IN mismatch\n");
+ return false;
+   }
   if (DECL_BUILT_IN (prevailing)
  && (DECL_BUILT_IN_CLASS (prevailing) != DECL_BUILT_IN_CLASS (decl)
  || DECL_FUNCTION_CODE (prevailing) != DECL_FUNCTION_CODE (decl)))
-   return false;
+   {
+  if (symtab->dump_file)
+   fprintf (symtab->dump_file, "Not merging decls; "
+"DECL_BUILT_IN_CLASS or CODE mismatch\n");
+ return false;
+   }
+}
+  if (DECL_ATTRIBUTES (prevailing) != DECL_ATTRIBUTES (decl))
+{
+  tree prev_attr = lookup_attribute ("error", DECL_ATTRIBUTES 
(prevailing));
+  tree attr = lookup_attribute ("error", DECL_ATTRIBUTES (decl));
+  if ((prev_attr == NULL) != (attr == NULL)
+ || (prev_attr
+ && TREE_VALUE (TREE_VALUE (prev_attr))
+!= TREE_VALUE (TREE_VALUE (attr
+   {
+  if (symtab->dump_file)
+   fprintf (symtab->dump_file, "Not merging decls; "
+"error attribute mismatch\n");
+ return false;
+   }
+
+  prev_attr = lookup_attribute ("warning", DECL_ATTRIBUTES (prevailing));
+  attr = lookup_attribute ("warning", DECL_ATTRIBUTES (decl));
+  if ((prev_attr == NULL) != (attr == NULL)
+ || (prev_attr
+ && TREE_VALUE (TREE_VALUE (prev_attr))
+!= TREE_VALUE (TREE_VALUE (attr
+   {
+  if (symtab->dump_file)
+   fprintf (symtab->dump_file, "Not merging decls; "
+"warning attribute mismatch\n");
+ return false;
+   }
 }
-  /* There are several other cases where merging can not be done, but until
- aliasing code is fixed to support aliases it we can not really return
- false on non-readonly var, yet.  */
   return true;
 }
 
Index: testsuite/gcc.dg/lto/pr61886_0.c
===
--- testsuite/gcc.dg/lto/pr61886_0.c(revision 0)
+++ testsuite/gcc.dg/lto/pr61886_0.c(revision 0)
@@ -0,0 +1,33 @@
+/* { dg-lto-do link } */
+/* { dg-lto-options { { -flto -O2 -Werror } } } */
+
+typedef __SIZE_TYPE__ size_t;
+typedef struct _IO_FILE FILE;
+
+extern size_t __fread_chk (void *__restrict __ptr, size_t __ptrlen, size_t 
__size, size_t __n, FILE *__restrict __stream) __asm__ ("" "__fread_chk")  
__attribute__ ((__warn_unused_result__));
+extern size_t __fread_chk_warn (void *__restrict __ptr, size_t __ptrlen, 
size_t __size, size_t __n, FILE *__restrict __stream) __asm__ ("" 
"__fread_chk")  __attribute__ ((__warn_unused_result__)) 
__attribute__((__warning__ ("fread called with bigger size * nmemb than length 
" "of destination buffer")));
+
+extern __inline __attribute__ ((__always_inline__)) __attribute__ 
((__gnu_inline__)) __attribute__ ((__artificial__)) __attribute__ 
((__warn_unused_result__))
+size_t
+fread (void *__restrict __ptr, size_t __size, size_t __n,
+   FILE *__restrict __stream)
+{
+  if (__builtin_object_size (__ptr, 0) != (size_t) -1)
+{
+  if (!__builtin_constant_p (__size)
+  || !__builtin_constant_p (__n)
+  || (__size | __n) >= (((size_t) 1) << (8 * sizeof (size_t) / 2)))
+return __fread_chk (__ptr,

Re: [PATCH] rtlanal: Fix bits/bytes confusion in set_noop_p (PR68814)

2015-12-10 Thread Segher Boessenkool

On Thu, Dec 10, 2015 at 01:26:06PM +0100, Eric Botcazou wrote:
> Thanks.  It seems a little odd for the condition to test the POS (operand #2) 
> and not also the LEN (operand #1) of the ZERO_EXTRACT before returning true, 
> but I'm not sure what the test would be given the above example.  Or maybe 
> it's implicitly contained in the POS test because of the little-endian case.

The test looks weird because it only handles the LE case.  What it says
is simply "if copying the rightmost part of a register to itself", and
it then does not matter how many bits are copied.

This really does not belong here I'd say (whatever creates this RTL should
already simplify it), but I'm just fixing a bug ;-)


Segher

Do not decompress functions sections when copying them to ltrans

2015-12-10 Thread Jan Hubicka

Hi,
this patch makes WPA to copy sections w/o decompressing them.  This leads
to a nice /tmp usage for GCC bootstrap (about 70%) and little for Firefox.
In GCC about 5% of the ltrans object file is the global decl section, while
for Firefox it is 85%.  I will try to figure out if there is something
terribly stupid pickled there.

The patch simply adds raw section i/o to lto-section-in.c and lto-section-out.c
which is used by copy_function_or_variable.  The catch is that WPA->ltrans
stremaing is not compressed and this fact is not represented in the object file
at all.  We simply test flag_wpa and flag_ltrans.  Now function sections born
at WPA time are uncompressed, while function sections just copied are
compressed and we do not know how to read them.

I tried to simply turn off the non-compressed path and set compression level
to minimal and then to none (which works despite the apparently outdated FIXME
comments I removed).  Sadly zlib manages to burn about 16% of WPA time
at minimal level and about 7% at none because it computes the checksum. Clealry
next stage1 it is time to switch to better compression backend.

For now I added the information if section is compressed into decl_state.  I am
not thrilled by this but it is only way I found w/o wasting 4 bytes per every
lto section (because the lto header is not really extensible and the stream is
assumed to be aligned).

The whole lowlevel lto streaming code is grand mess, I hope we will clean this
up and get more sane headers in foreseable future. Until that time this
solution does not waste extra space as it is easy to pickle the flag as part of
reference.

The patch saves about 7% of WPA time for firefox:

 phase opt and generate  :  75.66 (39%) usr   1.78 (14%) sys  77.44 (37%) wall  
855644 kB (21%) ggc
 phase stream in :  34.62 (18%) usr   1.95 (16%) sys  36.57 (18%) wall 
3245604 kB (79%) ggc
 phase stream out:  81.89 (42%) usr   8.49 (69%) sys  90.37 (44%) wall  
50 kB ( 0%) ggc
 ipa dead code removal   :   4.33 ( 2%) usr   0.06 ( 0%) sys   4.24 ( 2%) wall  
 0 kB ( 0%) ggc
 ipa virtual call target :  25.15 (13%) usr   0.14 ( 1%) sys  25.42 (12%) wall  
 0 kB ( 0%) ggc
 ipa cp  :   3.92 ( 2%) usr   0.21 ( 2%) sys   4.18 ( 2%) wall  
340698 kB ( 8%) ggc
 ipa inlining heuristics :  24.12 (12%) usr   0.38 ( 3%) sys  24.37 (12%) wall  
500427 kB (12%) ggc
 lto stream inflate  :   7.07 ( 4%) usr   0.38 ( 3%) sys   7.33 ( 4%) wall  
 0 kB ( 0%) ggc
 ipa lto gimple in   :   1.95 ( 1%) usr   0.61 ( 5%) sys   2.42 ( 1%) wall  
324875 kB ( 8%) ggc
 ipa lto gimple out  :   9.16 ( 5%) usr   1.64 (13%) sys  10.49 ( 5%) wall  
50 kB ( 0%) ggc
 ipa lto decl in :  21.25 (11%) usr   1.01 ( 8%) sys  22.37 (11%) wall 
2348869 kB (57%) ggc
 ipa lto decl out:  67.33 (34%) usr   1.66 (13%) sys  68.96 (33%) wall  
 0 kB ( 0%) ggc
 ipa lto constructors out:   1.39 ( 1%) usr   0.38 ( 3%) sys   2.18 ( 1%) wall  
 0 kB ( 0%) ggc
 ipa lto decl merge  :   2.12 ( 2%) usr   0.00 ( 0%) sys   2.12 ( 2%) wall  
 13737 kB ( 0%) ggc
 ipa reference   :   2.14 ( 2%) usr   0.00 ( 0%) sys   2.13 ( 2%) wall  
 0 kB ( 0%) ggc
 ipa pure const  :   2.29 ( 2%) usr   0.01 ( 0%) sys   2.35 ( 2%) wall  
 0 kB ( 0%) ggc
 ipa icf :   9.02 ( 7%) usr   0.18 ( 2%) sys   9.72 ( 7%) wall  
 19203 kB ( 0%) ggc
 TOTAL : 195.2712.37   207.64
4103297 kB


 phase opt and generate  :  79.00 (38%) usr   1.61 (13%) sys  80.61 (36%) wall 
1000597 kB (24%) ggc
 phase stream in :  33.93 (16%) usr   1.91 (15%) sys  35.83 (16%) wall 
3242293 kB (76%) ggc
 phase stream out:  96.90 (46%) usr   9.19 (72%) sys 106.09 (48%) wall  
52 kB ( 0%) ggc
 garbage collection  :   2.94 ( 1%) usr   0.00 ( 0%) sys   2.93 ( 1%) wall  
 0 kB ( 0%) ggc
 ipa dead code removal   :   4.60 ( 2%) usr   0.04 ( 0%) sys   4.53 ( 2%) wall  
 0 kB ( 0%) ggc
 ipa virtual call target :  24.48 (12%) usr   0.14 ( 1%) sys  24.76 (11%) wall  
 0 kB ( 0%) ggc
 ipa cp  :   4.92 ( 2%) usr   0.41 ( 3%) sys   5.31 ( 2%) wall  
502843 kB (12%) ggc
 ipa inlining heuristics :  23.72 (11%) usr   0.23 ( 2%) sys  23.92 (11%) wall  
490927 kB (12%) ggc
 lto stream inflate  :  14.35 ( 7%) usr   0.35 ( 3%) sys  15.22 ( 7%) wall  
 0 kB ( 0%) ggc
 ipa lto gimple in   :   1.79 ( 1%) usr   0.57 ( 4%) sys   2.46 ( 1%) wall  
324857 kB ( 8%) ggc
 ipa lto gimple out  :   9.98 ( 5%) usr   1.45 (11%) sys  11.05 ( 5%) wall  
52 kB ( 0%) ggc
 ipa lto decl in :  21.01 (10%) usr   0.91 ( 7%) sys  21.90 (10%) wall 
2345561 kB (55%) ggc
 ipa lto decl out:  73.55 (35%) usr   2.09 (16%) sys  75.67 (34%) wall  
 0 kB ( 0%) ggc
 ipa lto constructors

Re: [PATCH 5/6] [DJGPP] gcc/config/i386: update DJGPP configuration related files


On 12/05/2015 10:22 AM, Andris Pavenis wrote:

Patch include updates to DJGPP related files in gcc/config/i386.

There are too many changes to gcc/config/i386/djgpp.h and
gcc/config/i386/xm-djgpp.h to list them completely in changelog
entry.

Andris

2015-11-25 Andris Pavenis 

Subject: [PATCH 6/6] [DJGPP] gcc/config/i386: update DJGPP configuration
  related files

* gcc/config/i386/djgpp.h: update
* gcc/config/i386/xm-djgpp.h: update
* gcc/config/i386/djgpp.c: new file
* gcc/config/i386/djgpp.opt: remove obsolete option -mbnu210
* gcc/config/i386/t-djgpp: new file
* gcc/config.gcc: use i386/t-djgpp for DJGPP target

0005-DJGPP-gcc-config-i386-update-DJGPP-configuration-rel.patch


 From 367432d08b4a00f180c3d0710a0f34cda1b6 Mon Sep 17 00:00:00 2001
From: Andris Pavenis
Date: Sat, 5 Dec 2015 08:49:12 +0200
Subject: [PATCH 5/6] [DJGPP] gcc/config/i386: update DJGPP configuration 
related files

* gcc/config/i386/djgpp.h: update
* gcc/config/i386/xm-djgpp.h: update
You should describe what you updated.  SImply saying "update" is 
insufficient here.

diff --git a/gcc/config/i386/djgpp.c b/gcc/config/i386/djgpp.c
new file mode 100644
index 000..29dad33
--- /dev/null
+++ b/gcc/config/i386/djgpp.c
@@ -0,0 +1,50 @@
+/* Subroutines for DJGPP.
+   Contributed by Andris Pavenis
+   Copyright (C) 2013 Free Software Foundation, Inc.

Copyright date needs updating.


+
+void
+i386_djgpp_asm_named_section(const char *name, unsigned int flags,
+tree decl)
+{
+  char flagchars[8], *f = flagchars;
+
+(void)decl; /* silence warning. */
The way to do this now is to not give a name to the parameter.So 
remove "decl" from the function signature (leaving its type though). 
Then you can remove this line.



--- a/gcc/config/i386/djgpp.h
+++ b/gcc/config/i386/djgpp.h
@@ -17,12 +17,27 @@ You should have received a copy of the GNU General Public 
License
  along with GCC; see the file COPYING3.  If not see
  .  */

+#define DBX_DEBUGGING_INFO 1
+#define SDB_DEBUGGING_INFO 1
?  Really,  do you *really* need this.  I'd really like to see DBX and 
SDB debugging info die, not propagate into more targets.




-#define IX86_MAYBE_NO_LIBGCC_TFMODE
+#undef DBX_REGISTER_NUMBER
+#define DBX_REGISTER_NUMBER(n) svr4_dbx_register_map[n]

And if the DBX_DEBUGGING_INFO goes away, can't this go away too?

Jeff

Re: [PATCH] gcc: read -fdebug-prefix-map OLD from environment (improved reproducibility)

2015-12-10 Thread Joseph Myers

On Thu, 10 Dec 2015, Daniel Kahn Gillmor wrote:

> Specifically, if the first character of the "old" argument is a
> literal $, then gcc will treat it as an environment variable name, and
> use the value of the env var for prefix mapping.

I don't think a literal $ in option arguments is a good idea; it's far too 
hard to pass through a sequence of shells and makefiles that you typically 
get in recursive make.  You end up with things like 
'-Wl,-rpath,'\''\\\$$\$$\\\$$\$$ORIGIN'\''/../' (part of a process for 
using $ORIGIN when linking GDB) if you try.

-- 
Joseph S. Myers
jos...@codesourcery.com

Re: [PATCH 1/6] [DJGPP] libstdc++-v3/config/os/djgpp/error_constants.h: Update according to errno codes available for DJGPP

2015-12-10 Thread DJ Delorie


> I can't really judge this one.  Either DJ or Jon would need to some in 
> on this.

Looks OK to me, although in the default configuration (plain DJGPP)
the #ifdefs will always be false (omitted), which is harmless.

[PATCH] Fix PR c++/68831 (superfluous -Waddress warning for C++ delete)

2015-12-10 Thread Patrick Palka

Is this OK to commit if bootstrap + regtest on x86_64 succeeds?

gcc/cp/ChangeLog:

PR c++/68831
* init.c (build_delete): Use a warning sentinel to disable
-Waddress warnings when building the conditional that tests
if the operand is NULL.

gcc/testsuite/ChangeLog:

PR c++/68831
* g++.dg/pr68831.C: New test.
---
 gcc/cp/init.c  |  1 +
 gcc/testsuite/g++.dg/pr68831.C | 10 ++
 2 files changed, 11 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/pr68831.C

diff --git a/gcc/cp/init.c b/gcc/cp/init.c
index 5ecf9fb..2fffc61 100644
--- a/gcc/cp/init.c
+++ b/gcc/cp/init.c
@@ -4439,6 +4439,7 @@ build_delete (tree otype, tree addr, 
special_function_kind auto_delete,
   else
{
  /* Handle deleting a null pointer.  */
+ warning_sentinel s (warn_address);
  ifexp = fold (cp_build_binary_op (input_location,
NE_EXPR, addr, nullptr_node,
complain));
diff --git a/gcc/testsuite/g++.dg/pr68831.C b/gcc/testsuite/g++.dg/pr68831.C
new file mode 100644
index 000..8d32819
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr68831.C
@@ -0,0 +1,10 @@
+// PR c++/68831
+// { dg-options "-Waddress" }
+
+class DenseMap {
+public:
+  ~DenseMap();
+};
+extern const DenseMap 
+void foo() { delete  }
+
-- 
2.6.4.491.gda30757.dirty

Re: [PATCH 3/7][ARM] Add patterns for new instructions

On Mon, Dec 7, 2015 at 4:06 PM, Matthew Wahab
 wrote:
> Ping. Updated patch attached.
> Matthew
>
>
> On 26/11/15 16:00, Matthew Wahab wrote:
>>
>> Hello,
>>
>> This patch adds patterns for the instructions, vqrdmlah and vqrdmlsh,
>> introduced in the ARMv8.1 architecture. The instructions are made
>> available when -march=armv8.1-a is enabled with suitable fpu settings,
>> such as -mfpu=neon-fp-armv8 -mfloat-abi=hard.
>>
>> Tested the series for arm-none-eabi with cross-compiled check-gcc on an
>> ARMv8.1 emulator. Also tested arm-none-linux-gnueabihf with native
>> bootstrap and make check.
>>
>> Ok for trunk?
>> Matthew

OK.

Ramana

>>
>> gcc/
>> 2015-11-26  Matthew Wahab  
>>
>>  * config/arm/iterators.md (VQRDMLH_AS): New.
>>  (neon_rdma_as): New.
>>  * config/arm/neon.md
>>  (neon_vqrdmlh): New.
>>  (neon_vqrdmlh_lane): New.
>>  * config/arm/unspecs.md (UNSPEC_VQRDMLAH): New.
>>  (UNSPEC_VQRDMLSH): New.
>>
>

Re: [PATCH 2/7][ARM] Multilib support for ARMv8.1.

On Mon, Dec 7, 2015 at 4:05 PM, Matthew Wahab
 wrote:
> Ping. Updated patch attached.
> Matthew
>
>
> On 26/11/15 15:58, Matthew Wahab wrote:
>>
>> This patch sets up multilib support for ARMv8.1, treating it as a
>> synonym for ARMv8. Since ARMv8.1 integer, FP or SIMD
>> instructions are only generated for the new, instruction-specific
>> instrinsics, mapping to ARMv8 rather than adding a new multilib variant
>> is sufficient.
>>
>> Tested the series for arm-none-eabi with cross-compiled check-gcc on an
>> ARMv8.1 emulator. Also tested arm-none-linux-gnueabihf with native
>> bootstrap and make check.
>>
>> Ok for trunk?
>> Matthew
>>
>> gcc/
>> 2015-11-26  Matthew Wahab  
>>
>>  * config/arm/t-aprofile: Make "armv8.1-a" and "armv8.1-a+crc"
>>  matches for "armv8-a".
>>


OK.

Ramana
>

Re: [PATCH 1/7][ARM] Add support for ARMv8.1.

On Mon, Dec 7, 2015 at 4:04 PM, Matthew Wahab
 wrote:
> Ping. Updated patch attached.
> Matthew
>
>
> On 26/11/15 15:55, Matthew Wahab wrote:
>>
>> Hello,
>>
>>
>> ARMv8.1 includes an extension to ARM which adds two Adv.SIMD
>> instructions, vqrdmlah and vqrdmlsh. This patch set adds support for
>> ARMv8.1 and for the new instructions, enabling the architecture with
>> --march=armv8.1-a. The new instructions are enabled when both ARMv8.1
>> and a suitable fpu options are set, for instance with -march=armv8.1-a
>> -mfpu=neon-fp-armv8 -mfloat-abi=hard.
>>
>> This patch set adds the command line options and internal feature
>> macros. Following patches
>> - enable multilib support for ARMv8.1,
>> - add patterns for the new instructions,
>> - add the ACLE feature macro for the ARMv8.1 extensions,
>> - extend target support in the testsuite to ARMv8.1,
>> - add the ACLE intrinsics for vqrmdl{as}h and
>> - add the ACLE intrinsics for vqrmdl{as}h_lane.
>>
>> Tested the series for arm-none-eabi with cross-compiled check-gcc on an
>> ARMv8.1 emulator. Also tested arm-none-linux-gnueabihf with native
>> bootstrap and make check.
>>
>> Is this ok for trunk?
>> Matthew
>>
>> gcc/
>> 2015-11-26  Matthew Wahab  
>>
>>  * config/arm/arm-arches.def: Add "armv8.1-a" and "armv8.1-a+crc".
>>  * config/arm/arm-protos.h (FL2_ARCH8_1): New.
>>  (FL2_FOR_ARCH8_1A): New.
>>  * config/arm/arm-tables.opt: Regenerate.
>>  * config/arm/arm.c (arm_arch8_1): New.
>>  (arm_option_override): Set arm_arch8_1.
>>  * config/arm/arm.h (TARGET_NEON_RDMA): New.
>>  (arm_arch8_1): Declare.
>>  * doc/invoke.texi (ARM Options, -march): Add "armv8.1-a" and
>>  "armv8.1-a+crc".
>>  (ARM Options, -mfpu): Fix a typo.
>
>

OK.

Re: [PATCH 5/7][Testsuite] Support ARMv8.1 ARM tests.

On Mon, Dec 7, 2015 at 4:10 PM, Matthew Wahab
 wrote:
> On 27/11/15 17:11, Matthew Wahab wrote:
>>
>> On 27/11/15 13:44, Christophe Lyon wrote:

 On 26/11/15 16:02, Matthew Wahab wrote:
>>
>>
> This patch adds ARMv8.1 support to GCC Dejagnu, to allow ARM
> tests to specify targest and to set up command line options.
> It builds on the ARMv8.1 target support added for AArch64 tests, partly
> reworking that support to take into account the different
> configurations
> that tests may be run under.
>>
>>
>>> I may be mistaken, but -mfpu=neon-fp-armv8 and -mfloat-abi=softfp are not
>>> supported by aarch64-gcc. So it seems to me that
>>> check_effective_target_arm_v8_1a_neon_ok_nocache will not always work
>>> for aarch64 after your patch.
>>
>>
>>> Or does it work because no option is needed and thus "" always
>>> matches and thus the loop always exits after the first iteration
>>> on aarch64?
>>
>>
>> Yes, the idea is that the empty string will make the function first try
>> '-march=armv8.1-a' without any other flag. That will work for AArch64
>> because it
>> doesn't need any other option.
>>
>>> Maybe a more accurate comment would help remembering that, in case
>>> -mfpu option becomes necessary for aarch64.
>>>
>>
>> Agreed, it's worth having a comment to explain what the 'foreach'
>> construct is doing.
>>
>> Matthew
>
>
> I've added a comment to the foreach construct, to make it clearer what
> it's doing.
>
> Matthew
>
> testsuite/
> 2015-12-07  Matthew Wahab  
>
>
> * lib/target-supports.exp (add_options_for_arm_v8_1a_neon): Update
> comment.  Use check_effetive_target_arm_v8_1a_neon_ok to select
> the command line options.
> (check_effective_target_arm_v8_1a_neon_ok_nocache): Update initial
> test to allow ARM targets.  Select and record a working set of
> command line options.
> (check_effective_target_arm_v8_1a_neon_hw): Add tests for ARM
> targets.
>


># Return 1 if the target supports the ARMv8.1 Adv.SIMD extension, 0
>-# otherwise.  The test is valid for AArch64.
>+# otherwise.  The test is valid for AArch64 and ARM.  Record the command
>+# line options that needed.

s/that//

Can you also make sure doc/sourcebuild.texi is updated for this helper
function ? If not documented,it would be good to add the documentation
for the same while you are here.

Ramana

Re: [PATCH, i386] Use scalar mask for 16-byte and 32-byte vectors when possible

2015-12-10 Thread Ilya Enkovich

Ping

2015-11-26 13:33 GMT+03:00 Ilya Enkovich :
> Hi,
>
> This patch allows usage of scalar masks for ymm and xmm registers when target 
> supports it.  Bootstrapped and regtested on x86_64-unknown-linux-gnu.  OK for 
> trunk?
>
> Thanks,
> Ilya
> --
> gcc/
>
> 2015-11-26  Ilya Enkovich  
>
> * config/i386/i386.c (ix86_get_mask_mode): Use scalar
> modes for 32 and 16 byte vectors when possible.
>
> gcc/testsuite/
>
> 2015-11-26  Ilya Enkovich  
>
> * gcc.dg/vect/vect-32-chars.c: New test.
>
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 83749d5..d7c359f 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -53443,7 +53443,8 @@ ix86_get_mask_mode (unsigned nunits, unsigned 
> vector_size)
>unsigned elem_size = vector_size / nunits;
>
>/* Scalar mask case.  */
> -  if (TARGET_AVX512F && vector_size == 64)
> +  if ((TARGET_AVX512F && vector_size == 64)
> +  || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
>  {
>if (elem_size == 4 || elem_size == 8 || TARGET_AVX512BW)
> return smallest_mode_for_size (nunits, MODE_INT);
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-32-chars.c 
> b/gcc/testsuite/gcc.dg/vect/vect-32-chars.c
> new file mode 100644
> index 000..0af5d2d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-32-chars.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-mavx512bw -mavx512vl" { target { i?86-*-* 
> x86_64-*-* } } } */
> +
> +char a[32];
> +char b[32];
> +char c[32];
> +
> +void test()
> +{
> +  int i = 0;
> +  for (i = 0; i < 32; i++)
> +if (b[i] > 0)
> +  a[i] = c[i];
> +}
> +
> +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target { 
> i?86-*-* x86_64-*-* } } } } */

Re: Splitting up gcc/omp-low.c?

On Thu, Dec 10, 2015 at 12:26:10PM +0100, Bernd Schmidt wrote:
> On 12/10/2015 09:08 AM, Jakub Jelinek wrote:
> >On Wed, Dec 09, 2015 at 06:23:22PM +0100, Bernd Schmidt wrote:
> >>On 12/09/2015 05:24 PM, Thomas Schwinge wrote:
> >>>
> >>>In addition to that, how about we split up gcc/omp-low.c into several
> >>>files?  Would it make sense (I have not yet looked in detail) to do so
> >>>along the borders of the several passes defined therein?  Or, can you
> >>>tell already that there would be too many cross-references between the
> >>>several files to make this infeasible?
> >>
> >>It would be nice to get rid of all the code duplication in that file. That
> >>alone could reduce the size by quite a bit, and hopefully make it easier to
> >>read.
> >
> >What exact code duplication do you mean?
> 
> Functions that are near-identical with slight differences, or which have
> large sections of identical code. scan_omp_task vs scan_omp_parallel, or the

Even these two (quite short) have huge number of differences, so I'm not
100% sure it would be more readable if we had just one scan_omp_taskreg that
handled both.

> various expand_omp_for functions are examples.

I'm aware of some duplication in expand_omp_for_* functions, and some of the
obvious duplications were already moved to helper functions.  But in these
cases the number of differences is even significantly bigger too, so having
just one function that would handle all the different schedules would be far
less readable.  Perhaps we can add some small helpers to handle some little
pieces that repeat between the functions.

Jakub

Re: [PATCH] update -Wall and -Wextra documentation


On 12/10/2015 12:07 AM, Martin Sebor wrote:


* invoke.texi (Warning Options): Update -Wall options.  Clarify
when some -Wextra options are enabled.  Add -Wplacement-new example.


I tried to check this list against c.opt - I figure this should contain 
essentially the ones that are have an EnabledBy(Wall), plus whatever 
logic there is in c-opts.



+-Wduplicated-cond  @gol


I don't see this one as enabled by Wall, and

2015-10-02  Marek Polacek  

* c.opt (Wduplicated-cond): Don't enable by -Wall anymore.


+-Wplacement-new @r{(only for C++)} @gol


This one appears to be on by default?


+In C++, this warning is also enabled by @option{-Wall}.  In C, it is also
+enabled by @option{-Wextra}; to get the other warnings of @option{-Wextra}
+without this warning, use @option{-Wextra -Wno-sign-compare}.


Is the last part of the sentence really necessary? It kind of follows 
from the rest of the documentation and we don't spell this out for other 
-Wextra options.



Bernd

Re: [PATCH 6/7][ARM] Add ACLE intrinsics vqrdmlah and vqrdmlsh

On Mon, Dec 7, 2015 at 4:12 PM, Matthew Wahab
 wrote:
> Ping. Updated patch attached.
> Matthew
>
>
> On 26/11/15 16:04, Matthew Wahab wrote:
>>
>> Hello,
>>
>> This patch adds the ACLE intrinsics for the instructions introduced in
>> ARMv8.1. It adds the vqrmdlah and vqrdmlsh forms of the instrinsics to
>> the arm_neon.h header, together with the ARM builtins used to implement
>> them. The intrinsics are available when -march=armv8.1-a is enabled
>> together with appropriate fpu options.
>>
>> Tested the series for arm-none-eabi with cross-compiled check-gcc on an
>> ARMv8.1 emulator. Also tested arm-none-linux-gnueabihf with native
>> bootstrap and make check.
>>
>> Ok for trunk?
>> Matthew
>>
>> gcc/
>> 2015-11-26  Matthew Wahab  
>>
>>  * config/arm/arm_neon.h (vqrdmlah_s16, vqrdmlah_s32): New.
>>  (vqrdmlahq_s16, vqrdmlahq_s32): New.
>>  (vqrdmlsh_s16, vqrdmlsh_s32): New.
>>  (vqrdmlahq_s16, vqrdmlshq_s32): New.
>>  * config/arm/arm_neon_builtins.def: Add "vqrdmlah" and "vqrdmlsh".
>>
>


OK.

Ramana

Re: Splitting up gcc/omp-low.c?


On 12/10/2015 09:08 AM, Jakub Jelinek wrote:

On Wed, Dec 09, 2015 at 06:23:22PM +0100, Bernd Schmidt wrote:

On 12/09/2015 05:24 PM, Thomas Schwinge wrote:


In addition to that, how about we split up gcc/omp-low.c into several
files?  Would it make sense (I have not yet looked in detail) to do so
along the borders of the several passes defined therein?  Or, can you
tell already that there would be too many cross-references between the
several files to make this infeasible?


It would be nice to get rid of all the code duplication in that file. That
alone could reduce the size by quite a bit, and hopefully make it easier to
read.


What exact code duplication do you mean?


Functions that are near-identical with slight differences, or which have 
large sections of identical code. scan_omp_task vs scan_omp_parallel, or 
the various expand_omp_for functions are examples.



Bernd

Re: [testsuite][ARM target attributes] Fix effective_target tests

Hi Christophe,

On 08/12/15 11:18, Christophe Lyon wrote:

On 8 December 2015 at 11:50, Kyrill Tkachov wrote:

Hi Christophe,

On 27/11/15 13:00, Christophe Lyon wrote:

Hi,

After the recent commits from Christian adding target attributes
support for ARM FPU settings, I've noticed that some of the tests
were failing because of incorrect assumptions wrt to the default
cpu/fpu/float-abi of the compiler.

This patch fixes the problems I've noticed in the following way:
- do not force -mfloat-abi=softfp in dg-options, to avoid conflicts
when gcc is configured --with-float=hard

- change arm_vfp_ok such that it tries several -mfpu/-mfloat-abi
flags, checks that __ARM_FP is defined and __ARM_NEON_FP is not
defined

- introduce arm_fp_ok, which is similar but does not enforce fpu setting

- add a new effective_target: arm_crypto_pragma_ok to check that
setting this fpu via a pragma is actually supported by the current
"multilib". This is different from checking the command-line option
because the pragma might conflict with the command-line options in
use.

The updates in the testcases are as follows:
- attr-crypto.c, we have to make sure that the defaut fpu does not
conflict with the one forced by pragma. That's why I use the arm_vfp
options/effective_target. This is needed if gcc has been configured
--with-fpu=neon-fp16, as the pragma fpu=crypto-neon-fp-armv8 would
conflict.

- attr-neon-builtin-fail.c: use arm_fp to force the appropriate
float-abi setting. Enforcing fpu is not needed here.

- attr-neon-fp16.c: similar, I also removed arm_neon_ok since it was
not necessary to make the test pass in my testing. On second thought,
I'm wondering whether I should leave it and make the test unsupported
in more cases (such as when forcing -march=armv5t, although it does
pass with this patch)

- attr-neon2.c: use arm_vfp to force the appropriate float-abi
setting. Enforcing mfpu=vfp is needed to avoid conflict with the
pragma target fpu=neon (for instance if the toolchain default is
neon-fp16)

- attr-neon3.c: similar

Tested on a variety of configurations, see:

http://people.linaro.org/~christophe.lyon/cross-validation/gcc-test-patches/230929-target-attr/report-build-info.html

Note that the regressions reported fall into 3 categories:
- when forcing march=armv5t: tests are now unsupported because I
modified arm_crypto_ok to require arm_v8_neon_ok instead of arm32.

- the warning reported by attr-neon-builtin-fail.c moved from line 12
to 14 and is thus seen as a regression + one improvement

- finally, attr-neon-fp16.c causes an ICE on armeb compilers, for
which I need to post a bugzilla.

TBH, I'm a bit concerned by the complexity of all these multilib-like
conditions. I'm confident that I'm still missing some combinations :-)

And with new target attributes coming, new architectures etc... all
this logic is likely to become even more complex.

That being said, OK for trunk?

I'd like us to clean up and reorganise the gcc.target/arm testsuite
at some point to make it more robust with respect to the tons of multilib
options and configurations we can have for arm. It's becoming very
frustrating
to test feature-specific stuff :(

This is ok in the meantime.
Sorry for the delay.

Thanks for the approval, glad to see I'm not the only one willing to see
improvements in this area :)

Committed as r231403.

With this patch we're seeing legitimate PASSes go to NA.
For example:

gcc.target/arm/vfp-1.c
gcc.target/arm/vfp-ldmdbs.c
and other vfp tests in arm.exp.
This is, for example, for the
variant-mthumb/-march=armv8-a/-mfpu=crypto-neon-fp-armv8/-mfloat-abi=hard

I suspect under your new predicates they'd need to be changed to check for
arm_fp_ok rather than arm_vfp_ok.

We want to avoid removing any PASSes.
Can you please test some more to make sure we don't remove any legitimate
passes with your patch?

Also, Ramana reminded me offline that any new predicates in target-supports.exp
need documenting in sourcebuild.txt.

In light of that, can you please revert your patch and address the issues above
so that we can be confident we don't lose existing legitimate test coverage?

Sorry for not catching these sooner.
Kyrill

Christophe.

Thanks for handling this!
Kyrill

Christophe

2015-11-27 Christophe Lyon

* lib/target-supports.exp
(check_effective_target_arm_vfp_ok_nocache): New.
(check_effective_target_arm_vfp_ok): Call the new
check_effective_target_arm_vfp_ok_nocache function.
(check_effective_target_arm_fp_ok_nocache): New.
(check_effective_target_arm_fp_ok): New.
(add_options_for_arm_fp): New.
(check_effective_target_arm_crypto_ok_nocache): Require
target_arm_v8_neon_ok instead of arm32.
(check_effective_target_arm_crypto_pragma_ok_nocache): New.
(check_effective_target_arm_crypto_pragma_ok): New.
(add_options_for_arm_vfp): New.
* gcc.target/arm/attr-crypto.c: Use

Re: [PATCH, ARM] PR68674 Fix LTO support for neon builtins and error catching



On 10/12/15 10:11, Christian Bruel wrote:



On 12/10/2015 10:59 AM, Kyrill Tkachov wrote:


On 10/12/15 09:26, Christian Bruel wrote:

Hi Kyrill,

On 12/09/2015 06:32 PM, Kyrill Tkachov wrote:

Hi Christian,

On 08/12/15 12:53, Christian Bruel wrote:

Hi,

The order of the NEON builtins construction has led to complications since the 
attribute target support. This was not a problem when driven from the command 
line, but was causing various issues when the builtins was mixed between fpu
configurations or when used with LTO.

Firstly the builtin functions was not initialized before the parsing of 
functions, leading to wrong type initializations.

Then error catching code when a builtin was used without the proper fpu flags 
was incomprehensible for the user, for instance

#include "arm_neon.h"

int8x8_t a, b;
int16x8_t e;

void
main()
{
e = (int16x8_t)__builtin_neon_vaddlsv8qi (a, b);
}

compiled with default options (without -mfpu=neon -mfloat-abi=hard) gave pages 
of

/arm-none-eabi/6.0.0/include/arm_neon.h:39:9: error: unknown type name 
'__simd64_int8_t'
   typedef __simd64_int8_t int8x8_t;
...
...
arm_neon.h:4724:3: error: can't convert a vector of type 'poly64x2_t {aka 
__vector(4) int}' to type 'int' which has different size
 return (poly64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) 
__b, __c);
 ^~
...
... and one for each arm_neon.h lines..

by postponing the check into arm_expand_builtin, we now emit something more 
useful:

testo.c: In function 'main':
testo.c:9:7: error: '__builtin_neon_vaddlsv8qi' neon builtin is not supported 
in this configuration.
 e = (int16x8_t)__builtin_neon_vaddlsv8qi (a, b);
 ^~~

One small side effect to note: The total memory allocated is 370k bigger when 
neon is not used, so this support will have a follow-up to make their 
initialization lazy. But I'd like first to stabilize the stuff for stage3 (or 
get it
pre-approved if the memory is an issue)

tested without new failures with {,-mfpu=vfp,-mfpu=neon}{,-march=armv7-a\}
(a few tests that was fail are now unsupported)



I agree, the vector types (re)initialisation is a tricky part.
I've seen similar issues in the aarch64 work for target attributes

bool
arm_vector_mode_supported_p (machine_mode mode)
{
-  /* Neon also supports V2SImode, etc. listed in the clause below.  */
-  if (TARGET_NEON && (mode == V2SFmode || mode == V4SImode || mode == V8HImode
+  if (mode == V2SFmode || mode == V4SImode || mode == V8HImode
  || mode == V4HFmode || mode == V16QImode || mode == V4SFmode
-  || mode == V2DImode || mode == V8HFmode))
-return true;
-
-  if ((TARGET_NEON || TARGET_IWMMXT)
-  && ((mode == V2SImode)
-  || (mode == V4HImode)
-  || (mode == V8QImode)))
+  || mode == V2DImode || mode == V8HFmode
+  || mode == V2SImode || mode == V4HImode || mode == V8QImode)
return true;


So this allows vector modes unconditionally for all targets/fpu configurations?
I was tempted to do that in aarch64 when I was encountering similar issues.
In the end what worked for me was re-laying out the vector types in 
SET_CURRENT_FUNCTION
if necessary (https://gcc.gnu.org/ml/gcc-patches/2015-08/msg01084.html)


yes my assumption was that arm_init_neon_builtins () is now called for all 
targets, since the check is done at expand time and that the builtins need to 
be known by lto, with the vector type initialization, before they are expanded.
However at that time, lto streaming-in have not yet processed the attributes 
and TARGET_NEON is not set for the function.

I had a look at your re-layout, but I'm not sure. it feels like a hack. I think 
this should be solved first place during the builtin construction. Also 
set_current_function is too late, builtin_expand that will explode because of 
the
unknown modes.

But raise the point. In fact I was not really happy with this 
arm_vector_mode_supported_p neither as I was not sure about other contexts it 
can be called from and I cannot clearly claim that this change is always 
correct.



So the main usage of targetm.vector_mode_supported_p is in stor-layout.c and 
vector_type_mode in particular seems
to have a relevant comment:
   /* Vector types need to re-check the target flags each time we report
  the machine mode.  We need to do this because attribute target can
  change the result of vector_mode_supported_p and have_regs_of_mode
  on a per-function basis.  Thus the TYPE_MODE of a VECTOR_TYPE can
  change on a per-function basis.  */

I think that implies that it expects targetm.vector_mode_supported_p to reject 
vector modes in
contexts that don't support NEON...


yes, thanks for this clarification, that settles it. this part of my patch is 
rubbish :-)




I'd like to think about other way to set the vector modes from 
arm_init_neon_builtins before the target flags are known. I'm thinking about 
the lazy initialization at expand

[PATCH][Testsuite]Cleanup logs from gdb tests by adding newlines

2015-12-10 Thread Alan Lawrence

Runs of the guality testsuite can sometimes end up with gcc.log containing
malformed lines like:

A debugging session is active.PASS: gcc.dg/guality/pr36728-1.c   -O2  line 18 
arg4 == 4
A debugging session is active.PASS: gcc.dg/guality/restrict.c   -O2  line 30 
type:ip == int *
Inferior 1 [process 27054] will be killed.PASS: 
gcc.dg/guality/restrict.c   -O2  line 30 type:cicrp == const int * const 
restrict
Inferior 1 [process 27160] will be killed.PASS: 
gcc.dg/guality/restrict.c   -O2  line 30 type:cvirp == int * const volatile 
restrict

This patch just makes sure the PASS/FAIL comes at the beginning of a line.  (At
the slight cost of adding some extra newlines not in the actual test output.)

I moved the remote_close target calls earlier, to avoid any possible race
condition of extra output being generated after the newline - this may not be
strictly necessary.

Tested on aarch64-none-linux-gnu and x86_64-none-linux-gnu.

I think this is reasonable for stage 3 - OK for trunk?

gcc/testsuite/ChangeLog:
* lib/gcc-gdb-test.exp (gdb-test): call remote_close earlier, and send
newline to log, before calling pass/fail/unsupported.
* lib/gcc-simulate-thread.exp (simulate-thread): Likewise.
---
 gcc/testsuite/lib/gcc-gdb-test.exp| 15 ++-
 gcc/testsuite/lib/gcc-simulate-thread.exp | 10 +++---
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/gcc/testsuite/lib/gcc-gdb-test.exp 
b/gcc/testsuite/lib/gcc-gdb-test.exp
index d3ba6e4..f60cabf 100644
--- a/gcc/testsuite/lib/gcc-gdb-test.exp
+++ b/gcc/testsuite/lib/gcc-gdb-test.exp
@@ -84,8 +84,9 @@ proc gdb-test { args } {
 remote_expect target [timeout_value] {
# Too old GDB
-re "Unhandled dwarf expression|Error in sourced command file|

Re: ipa-cp heuristics fixes

Hi,

thanks for looking into this, I only have one question:

On Thu, Dec 10, 2015 at 08:30:37AM +0100, Jan Hubicka wrote:
> Martin,
> while looking into the ipa-cp dumps for bzip and Firefox I noticed few issues.
> First of all, ipcp_cloning_candidate_p calls
>  optimize_function_for_speed_p (DECL_STRUCT_FUNCTION (node->decl))
> which can not be used at WPA time, becuase we have no DECL_STRUCT_FUNCTION
> around.  I replaced it by node->optimize_for_size_p ().
> 
> Second we perform incredible number of clones because we do obtain some sort 
> of
> polymorphic call context for them.  In wast majority of cases this is useless
> effort, because the functions in question do not contain virtual calls and do
> not pass the parameter further.  For firefox about 40k out of 50k clones
> created are created just because we found some context.
> 
> I changed the code to only clone if this immediately leads to 
> devirtualization.
> This do not cause any noticeable drop in number of devirtualized calls on
> Firefox. I suppose we will miss the case where cloning a caller may allow
> devirtualization in a clone of callee, but I do not think the heuristics for
> context independent values can handle this as implemented right now and it
> simply have way to many false positives.
> 
> What we can do is to devirtualize w/o cloning for local functions and
> speculatively devirtualize in case we would otherwise clone.
> 
> Third problem I noticed is that
> will_be_removed_from_program_if_no_direct_calls_p is used to decide if we can
> ignore the function size when deciding about the code size impact.
> This function is doing some analysis for inliner where it, for example, 
> analyses
> if a comdat which is going to be inlined consistently in the whole program
> will be removed.
> 
> In the cloning case I do not see this to apply: we have no evidence that the
> other units will pass the same constants to the function.  I think you
> basically want to assume that the  function will be removed if it has no
> address taken and it is not externally visibible. This is what local flag
> is for.
> 
> I gathered some stats:
> 
> number of clones for all contexts: 49948->11102
> number of clones: 4376->4383
> 
> good_cloning_opportunity_p is called about 70k times, I wonder if the
> thresholds are not simply set too high.  For example, inliner does about 300k
> inlines at Firefox.
> 
> number of param replacements: 13041-> 13056 + 5383 aggregate replacements (I 
> do not have data on unpatched tree for this)
> number of devirts: 956->933
> number of devirts happening at inline: 781->868
> number of indirect calls promoted: 512->512
> 
> Inliner stats from: Unit growth for small function inlining: 7965701->9130051 
> (14%)
> to: Unit growth for small function inlining: 7965010->9138577
> 
> So it seems that except for large drop in number of clones there is no 
> significant difference.
> 
> I am bootstrapping/regtesting this on x86_64-linux, does it seem OK?
> 
> Honza
> 
>   * ipa-cp.c (ipcp_cloning_candidate_p): Use node->optimize_for_size_p.
>   (good_cloning_opportunity_p): Likewise.
>   (gather_context_independent_values): Do not return true when
>   polymorphic call context is known or when we have known aggregate
>   value of unused parameter.
>   (estimate_local_effects): Try to create clone for all context
>   when either some params are substituted or devirtualization is possible
>   or some params can be removed; use local flag instead of
>   node->will_be_removed_from_program_if_no_direct_calls_p.
>   (identify_dead_nodes): Likewise.
> Index: ipa-cp.c
> ===
> --- ipa-cp.c  (revision 231477)
> +++ ipa-cp.c  (working copy)
> @@ -613,7 +613,7 @@ ipcp_cloning_candidate_p (struct cgraph_
>return false;
>  }
>  
> -  if (!optimize_function_for_speed_p (DECL_STRUCT_FUNCTION (node->decl)))
> +  if (node->optimize_for_size_p ())
>  {
>if (dump_file)
>  fprintf (dump_file, "Not considering %s for cloning; "
> @@ -2267,7 +2267,7 @@ good_cloning_opportunity_p (struct cgrap
>  {
>if (time_benefit == 0
>|| !opt_for_fn (node->decl, flag_ipa_cp_clone)
> -  || !optimize_function_for_speed_p (DECL_STRUCT_FUNCTION (node->decl)))
> +  || node->optimize_for_size_p ())
>  return false;
>  
>gcc_assert (size_cost > 0);
> @@ -2387,12 +2387,14 @@ gather_context_independent_values (struc
>   *removable_params_cost
> += ipa_get_param_move_cost (info, i);
>  
> +  if (!ipa_is_param_used (info, i))
> + continue;
> +

Is this really necessary, is it not enough to remove the assignment to
ret below?  If the parameter is not used, devirtualization time bonus,
which you then rely on estimate_local_effects, should be zero for it.

It is a very minor point, I suppose, but if the function gets cloned
for a different reason, it might still be beneficial to have as much

Re: [PATCH v4] Fix shrink-wrapping bug (PR67778, PR68634)


On 12/09/2015 09:47 PM, Segher Boessenkool wrote:

After shrink-wrapping has found the "tightest fit" for where to place
the prologue, it tries move it earlier (so that frame saves are run
earlier) -- but without copying any more basic blocks.

Unfortunately a candidate block we select can be inside a loop, and we
will still allow it (because the loop always exits via our previously
chosen block).  We can do that just fine if we make a duplicate of the
block, but we do not want to here.

So we need to detect this situation.  We can place the prologue at a
previous block PRE only if PRE dominates every block reachable from
it, because then we will never need to duplicate that block (it will
always be executed with prologue).

v4: Fixed all the stupid mistakes you noticed.  Also, the previous
version stopped looking when the previous try didn't work out.  This
version doesn't: it is simpler, more in line with the rest of the
algorithm, potentially useful, and doesn't really cost more.

Tested on the two testcases from the PRs.  Also regression checked
on powerpc64-linux.

Is this okay for trunk?


Ok. This seems like a safe way to test things. For gcc-7 it might be 
worthwhile to try to have loop information available and use that for 
efficiency.



Bernd

Re: [PATCH][ARM] PR 49526: Add support for smmul,smmla,smmls instructions



On 25/11/15 10:36, Kyrill Tkachov wrote:

Hi Yvan,

On 24/11/15 15:05, Yvan Roux wrote:

Hi Kyrill,

On 24 November 2015 at 14:31, Kyrill Tkachov  wrote:

Ping.

Thanks,
Kyrill



On 13/11/15 11:50, Kyrill Tkachov wrote:

Ping.
https://gcc.gnu.org/ml/gcc-patches/2015-11/msg00686.html

Thanks,
Kyrill

On 06/11/15 17:05, Kyrill Tkachov wrote:

Hi all,

This patch introduces support for the smmul, smmla and smmls instructions
that appear in armv6 architecture levels
and higher. To quote the SMMUL description from the ARMARM:
"Signed Most Significant Word Multiply multiplies two signed 32-bit
values, extracts the most significant 32 bits of
the result, and writes those bits to the destination register."

The smmla and smmls are the multiply-accumulate and multiply-subtract
extensions of that multiply.
There also exists an smmulr variant that rounds the result rather than
truncating it.
However, when I tried adding patterns for those forms I got an LRA ICE
that I was not able to figure out.
I'll try to find a testcase for that, but in the meantime there's no
reason to not have patterns for the
non-rounding variants.

I agree, but would be interested to give a look at the LRA issue when
you have a testcase.


I've filed PR 68536 for the ICE I've been seeing. Any insight is, of course, 
welcome.


Bootstrapped and tested on arm-none-linux-gnueabihf.
I've seen this trigger in quite a few places in SPEC2006 where it always
made the code better.

Ok for trunk?

Patch looks good to me (but can't approved it), I just wonder if
adding a subreg_highpart_p predicate in emit-rtl.c would be useful,
looking at other usage of subreg_highpart_offset it doesn't seem to be
the case, but maybe for consistency with the existing
subreg_lowpart_p.


Perhaps, we can put it there if there's much scope for its reuse in other
parts of the compiler. I didn't put it there in the first place so as not to
extend the review scope of the patch to the midend...



I have since discovered a bug in the proposed patterns for smmla and smmls.
The instructions are more involved than I thought, so it's not easy to write
a pattern for them.
I'll propose an updated patch adding just the smmul pattern in next stage1.

So, I'm withdrawing this patch.
thanks,
Kyrill


Thanks for looking at this,
Kyrill



Cheers,
Yvan


Thanks,
Kyrill

2015-11-06  Kyrylo Tkachov  

 PR target/49526
 * config/arm/arm.md (*mulsidi3si_v6): New pattern.
 (*mulsidi3siaddsi_v6): Likewise.
 (*mulsidi3sisubsi_v6): Likewise.
 * config/arm/predicates.md (subreg_highpart_operator):
 New predicate.

2015-11-06  Kyrylo Tkachov  

 PR target/49526
 * gcc.target/arm/pr49526_1.c: New test.
 * gcc.target/arm/pr49526_2.c: Likewise.
 * gcc.target/arm/pr49526_3.c: Likewise.

Re: [RFA] Transparent alias suport part 10: Fix base+offset alias analysis oracle WRT aliases

>   PR ipa/61886
>   PR middle-end/25140
>   * tree-ssa-alias.c (ptr_deref_may_alias_decl_p): Usecompare_base_decls
>   (nonoverlapping_component_refs_of_decl_p): Update sanity check.
>   (decl_refs_may_alias_p): Use compare_base_decls.
>   * alias.c: Include cgraph.h
>   (rtx_equal_for_memref_p): Use rtx_equal_for_memref_p.
>   (compare_base_decls): New function.
>   (base_alias_check): Likewise.
>   (memrefs_conflict_p): Likewise.
>   (nonoverlapping_memrefs_p): Likewise.
>   * alias.h (compare_base_decls): Declare.

The installed patch also contains the cut-off for the Ada testcases, so I have 
added the following line to the ChangeLong:

(get_alias_set): Add cut-off for recursion.

and installed the Ada testcases:

* gnat.dg/specs/access1.ads: New test.
* gnat.dg/specs/access2.ads: Likewise.

Thanks for quickly fixing the issue!

-- 
Eric Botcazou

Re: [PATCH 4/7][ARM] Add ACLE feature macro for ARMv8.1 instructions.

On Tue, Dec 8, 2015 at 7:45 AM, Christian Bruel  wrote:
> Hi Matthew,
>
>
> On 12/07/2015 05:07 PM, Matthew Wahab wrote:
>>
>> Ping. Updated patch attached.
>> Matthew
>>
>>
>> On 26/11/15 16:01, Matthew Wahab wrote:
>>>
>>> Hello,
>>>
>>> This patch adds the feature macro __ARM_FEATURE_QRDMX to indicate the
>>> presence of the ARMv8.1 instructions vqrdmlah and vqrdmlsh. It is
>>> defined when the instructions are available, as it is when
>>> -march=armv8.1-a is enabled with suitable fpu options.
>>>
>>> Tested the series for arm-none-eabi with cross-compiled check-gcc on an
>>> ARMv8.1 emulator. Also tested arm-none-linux-gnueabihf with native
>>> bootstrap and make check.
>>>
>>> Ok for trunk?
>>> Matthew
>>>
>>> gcc/
>>> 2015-11-26  Matthew Wahab  
>>>
>>>   * config/arm/arm-c.c (arm_cpu_builtins): Define
>>> __ARM_FEATURE_QRDMX.
>>>
>>
>
> +  if (TARGET_NEON_RDMA)
> +builtin_define ("__ARM_FEATURE_QRDMX");
> +
>
> Since it depends on TARGET_NEON, could you please use
>
>   def_or_undef_macro (pfile, "__ARM_FEATURE_QRDMX", TARGET_NEON_RDMA);
>
> instead ?

I think that's what it should be -

OK with that fixed.

Ramana


>
> thanks
>
> Christian

Re: [PATCH 1/7][ARM] Add support for ARMv8.1.

On Thu, Dec 10, 2015 at 10:43 AM, Ramana Radhakrishnan
 wrote:
> On Mon, Dec 7, 2015 at 4:04 PM, Matthew Wahab
>  wrote:
>> Ping. Updated patch attached.
>> Matthew
>>
>>
>> On 26/11/15 15:55, Matthew Wahab wrote:
>>>
>>> Hello,
>>>
>>>
>>> ARMv8.1 includes an extension to ARM which adds two Adv.SIMD
>>> instructions, vqrdmlah and vqrdmlsh. This patch set adds support for
>>> ARMv8.1 and for the new instructions, enabling the architecture with
>>> --march=armv8.1-a. The new instructions are enabled when both ARMv8.1
>>> and a suitable fpu options are set, for instance with -march=armv8.1-a
>>> -mfpu=neon-fp-armv8 -mfloat-abi=hard.
>>>
>>> This patch set adds the command line options and internal feature
>>> macros. Following patches
>>> - enable multilib support for ARMv8.1,
>>> - add patterns for the new instructions,
>>> - add the ACLE feature macro for the ARMv8.1 extensions,
>>> - extend target support in the testsuite to ARMv8.1,
>>> - add the ACLE intrinsics for vqrmdl{as}h and
>>> - add the ACLE intrinsics for vqrmdl{as}h_lane.
>>>
>>> Tested the series for arm-none-eabi with cross-compiled check-gcc on an
>>> ARMv8.1 emulator. Also tested arm-none-linux-gnueabihf with native
>>> bootstrap and make check.
>>>
>>> Is this ok for trunk?
>>> Matthew
>>>
>>> gcc/
>>> 2015-11-26  Matthew Wahab  
>>>
>>>  * config/arm/arm-arches.def: Add "armv8.1-a" and "armv8.1-a+crc".
>>>  * config/arm/arm-protos.h (FL2_ARCH8_1): New.
>>>  (FL2_FOR_ARCH8_1A): New.
>>>  * config/arm/arm-tables.opt: Regenerate.
>>>  * config/arm/arm.c (arm_arch8_1): New.
>>>  (arm_option_override): Set arm_arch8_1.
>>>  * config/arm/arm.h (TARGET_NEON_RDMA): New.
>>>  (arm_arch8_1): Declare.
>>>  * doc/invoke.texi (ARM Options, -march): Add "armv8.1-a" and
>>>  "armv8.1-a+crc".
>>>  (ARM Options, -mfpu): Fix a typo.
>>
>>
>
> OK.

I couldn't find 0/7 but in addition here you need to update the output
for TAG_FP_SIMD_Arch to be 4.

regards
Ramana

Re: [PATCH] Avoid false vector mask conversion

On Fri, Dec 4, 2015 at 4:00 PM, Ilya Enkovich  wrote:
> On 02 Dec 16:27, Richard Biener wrote:
>> On Wed, Dec 2, 2015 at 4:24 PM, Ilya Enkovich  wrote:
>> >
>> > The problem is that conversion is supposed to be handled by
>> > vectorizable_conversion,
>> > but it fails to because it is not actually a conversion. I suppose it
>> > may be handled
>> > in vectorizable_assignment but I chose this pattern because it's meant
>> > to handle mask
>> > conversion issues.
>>
>> I think it's always better to avoid patterns if you can.
>>
>> Richard.
>>
>
> Here is a variant with vectorizable_assignment change.  Bootstrapped and 
> regtested on x86_64-unknown-linux-gnu.  Does it look better?

Yes.

Thanks,
Richard.

> Thanks,
> Ilya
> --
> gcc/
>
> 2015-12-04  Ilya Enkovich  
>
> * tree-vect-stmts.c (vectorizable_assignment): Support
> useless boolean conversion.
>
>
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 3b078da..2cdbb04 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -4229,7 +4229,12 @@ vectorizable_assignment (gimple *stmt, 
> gimple_stmt_iterator *gsi,
>/* But a conversion that does not change the bit-pattern is ok.  */
>&& !((TYPE_PRECISION (TREE_TYPE (scalar_dest))
> > TYPE_PRECISION (TREE_TYPE (op)))
> -  && TYPE_UNSIGNED (TREE_TYPE (op
> +  && TYPE_UNSIGNED (TREE_TYPE (op)))
> +  /* Conversion between boolean types of different sizes is
> +a simple assignment in case their vectypes are same
> +boolean vectors.  */
> +  && (!VECTOR_BOOLEAN_TYPE_P (vectype)
> + || !VECTOR_BOOLEAN_TYPE_P (vectype_in)))
>  {
>if (dump_enabled_p ())
>  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,

Re: PING^1: [PATCH] Add TYPE_EMPTY_RECORD for C++ empty class

On Wed, Dec 9, 2015 at 10:31 PM, Markus Trippelsdorf
 wrote:
> On 2015.12.09 at 10:53 -0800, H.J. Lu wrote:
>>
>> Empty C++ class is a corner case which isn't covered in psABI nor C++ ABI.
>> There is no mention of "empty record" in GCC documentation.  But there are
>> plenty of "empty class" in gcc/cp.  This change affects all targets.  C++ ABI
>> should specify how it should be passed.
>
> There is a C++ ABI mailinglist, where you could discuss this issue:
> http://sourcerytools.com/cgi-bin/mailman/listinfo/cxx-abi-dev

Yep.  As long as the ABI doesn't state how to pass those I'd rather _not_ change
GCCs way.

Richard.

> --
> Markus

Re: [PATCH] rtlanal: Fix bits/bytes confusion in set_noop_p (PR68814)

> My case comes from the gcc.dg/pr65492-2.c, the "test1int2" case.
> combine has made an insn
>   modifying insn i321: zero_extract(%3:DI,0x20,0)=%3:DI
> which is splatting the SImode parameter to both the high and low halves
> of the dest reg.  Then, it tries to combine it with the USE of that reg
> at the end of the function, giving
> 
> Failed to match this instruction:
> (parallel [
> (use (ior:DI (and:DI (reg/i:DI 3 3)
> (const_int 4294967295 [0x]))
> (ashift:DI (reg:DI 3 3 [ c ])
> (const_int 32 [0x20]
> (set (zero_extract:DI (reg/i:DI 3 3)
> (const_int 32 [0x20])
> (const_int 0 [0]))
> (reg:DI 3 3 [ c ]))
> ])
> 
> and if it has a parallel of two which doesn't match, it sees if it
> just needs one arm because the other is a noop set, and that ends
> up with
>   deleting noop move 21
> because of the wrong test, making the testcase fail.
> (powerpc64le has BITS_BIG_ENDIAN set, a bit unusual).

Thanks.  It seems a little odd for the condition to test the POS (operand #2) 
and not also the LEN (operand #1) of the ZERO_EXTRACT before returning true, 
but I'm not sure what the test would be given the above example.  Or maybe 
it's implicitly contained in the POS test because of the little-endian case.

The patch is OK for mainline.

-- 
Eric Botcazou

Re: [AArch64] Emit square root using the Newton series



On 09/12/15 18:50, Evandro Menezes wrote:

On 12/09/2015 11:16 AM, Kyrill Tkachov wrote:


On 09/12/15 17:02, Kyrill Tkachov wrote:


On 09/12/15 16:59, Evandro Menezes wrote:

On 12/09/2015 10:52 AM, Kyrill Tkachov wrote:

Hi Evandro,

On 08/12/15 21:35, Evandro Menezes wrote:

Emit square root using the Newton series

   2015-12-03  Evandro Menezes 

   gcc/
* config/aarch64/aarch64-protos.h (aarch64_emit_swsqrt):
   Declare new
function.
* config/aarch64/aarch64-simd.md (sqrt2): New
   expansion and
insn definitions.
* config/aarch64/aarch64-tuning-flags.def
(AARCH64_EXTRA_TUNE_FAST_SQRT): New tuning macro.
* config/aarch64/aarch64.c (aarch64_emit_swsqrt): Define
   new function.
* config/aarch64/aarch64.md (sqrt2): New expansion
   and insn
definitions.
* config/aarch64/aarch64.opt (mlow-precision-recip-sqrt):
   Expand option
description.
* doc/invoke.texi (mlow-precision-recip-sqrt): Likewise.

This patch extends the patch that added support for implementing x^-1/2 using 
the Newton series by adding support for x^1/2 as well.

Is it OK at this point of stage 3?



A comment on the patch itself from me...

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def 
b/gcc/config/aarch64/aarch64-tuning-flags.def
index 6f7dbce..11c6c9a 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -30,4 +30,4 @@

 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
 AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
-
+AARCH64_EXTRA_TUNING_OPTION ("fast_sqrt", FAST_SQRT)

That seems like a misleading name to me.
If we're doing this, that means that the sqrt instruction is not faster
than doing the inverse sqrt estimation followed by a multiply.
I think a name like "synth_sqrt" or "estimate_sqrt" or something along those 
lines
is more appropriate.


Unfortunately, this is the case on Exynos M1: the series is faster than the instruction. 
:-(  So, other targets when this is also true, using the "fast_sqrt" option 
might make sense.



Sure, but the way your patch is written, we will emit the series when 
"fast_sqrt" is set, rather
than the other way around, unless I'm misreading the logic in:



Sorry, what I meant to say is it would be clearer, IMO, to describe the 
compiler action that is being taken
(e.g. the rename_fma_regs tuning flag), in this case it's estimating sqrt using 
a series.

Kyrill


diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 030a101..f6d2da4 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4280,7 +4280,23 @@

 ;; sqrt

-(define_insn "sqrt2"
+(define_expand "sqrt2"
+  [(set (match_operand:VDQF 0 "register_operand")
+(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if ((AARCH64_EXTRA_TUNE_FAST_SQRT & aarch64_tune_params.extra_tuning_flags)
+  && !optimize_function_for_size_p (cfun)
+  && flag_finite_math_only
+  && !flag_trapping_math
+  && flag_unsafe_math_optimizations)
+{
+  aarch64_emit_swsqrt (operands[0], operands[1]);
+  DONE;
+}
+})



Kyrill,

How about "approx_sqrt" for, you guessed it, approximate square root?  The same adjective 
would perhaps describe "recip_sqrt" better too.



Sounds good to me.
Sorry for the bikeshedding.

Kyrill


Thanks,

Re: [PATCH 7/7][ARM] Add ACLE intrinsics vqrdmlah_lane and vqrdmlsh_lane

On Thu, Nov 26, 2015 at 4:04 PM, Matthew Wahab
 wrote:
> Hello,
>
> This patch adds the ACLE intrinsics for the instructions introduced in
> ARMv8.1. It adds the vqrmdlah_lane and vqrdmlsh_lane forms of the
> instrinsics to the arm_neon.h header, together with the ARM builtins
> used to implement them. The intrinsics are available when
> -march=armv8.1-a is enabled together with appropriate fpu options.
>
> Tested the series for arm-none-eabi with cross-compiled check-gcc on an
> ARMv8.1 emulator. Also tested arm-none-linux-gnueabihf with native
> bootstrap and make check.
>
> Ok for trunk?
> Matthew
>
> gcc/
> 2015-11-26  Matthew Wahab  
>
> * config/arm/arm_neon.h (vqrdmlahq_lane_s16): New.
> (vqrdmlahq_lane_s32): New.
> (vqrdmlah_lane_s16): New.
> (vqrdmlah_lane_s32): New.
> (vqrdmlshq_lane_s16): New.
> (vqrdmlshq_lane_s32): New.
> (vqrdmlsh_lane_s16): New.
> (vqrdmlsh_lane_s32): New.
> * config/arm/arm_neon_builtins.def: Add "vqrdmlah_lane" and
> "vqrdmlsh_lane".
>

OK.

Ramana

Re: [hsa 3/10] HSA libgomp plugin [part 2/2]

2015-12-10 Thread Martin Liška

On 12/09/2015 01:15 PM, Jakub Jelinek wrote:
> On Mon, Dec 07, 2015 at 12:20:49PM +0100, Martin Jambor wrote:
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include "libgomp-plugin.h"
>> +#include "gomp-constants.h"
>> +#include "hsa.h"
>> +#include "hsa_ext_finalize.h"
> 
> If these 2 headers are coming from outside of gcc, better use <> for them
> instead of "", and better place them above the libgomp specific ones.
> 
>> +#include "dlfcn.h"
> 
> Ditto.
> 
>> +/* Flag to decide whether print to stderr information about what is going 
>> on.
>> +   Set in init_debug depending on environment variables.  */
>> +
>> +static bool debug;
>> +
>> +/* Flag to decide if the runtime should suppress a possible fallback to host
>> +   execution.  */
>> +
>> +static bool suppress_host_fallback;
>> +
>> +/* Initialize debug and suppress_host_fallback according to the 
>> environment.  */
>> +
>> +static void
>> +init_enviroment_variables (void)
>> +{
>> +  if (getenv ("HSA_DEBUG"))
>> +debug = true;
>> +  else
>> +debug = false;
>> +
>> +  if (getenv ("HSA_SUPPRESS_HOST_FALLBACK"))
>> +suppress_host_fallback = true;
>> +  else
>> +suppress_host_fallback = false;
> 
> Wonder whether we want these custom env vars in suid programs, allowing
> users to change behavior might be undesirable.  So perhaps use secure_getenv
> instead of getenv if found by configure?
>> +
>> +  const char* hsa_error;
> 
> Space before * instead of after it (multiple spots).
> 
>> +  hsa_status_string (status, _error);
>> +
>> +  unsigned l = strlen (hsa_error);
>> +
>> +  char *err = GOMP_PLUGIN_malloc (sizeof (char) * l);
> 
> sizeof (char) is 1, please don't multiply by it, that is just confusing.
> It makes sense in macros where you don't know what the type really is, but
> not here.
> 
>> +  memcpy (err, hsa_error, l - 1);
>> +  err[l] = '\0';
>> +
>> +  fprintf (stderr, "HSA warning: %s (%s)\n", str, err);
> 
> Why do you allocate err at all, if you free it right away?  Can't you use
> hsa_error directly instead?
>> +
>> +  free (err);
> 
>> +static void
>> +hsa_fatal (const char *str, hsa_status_t status)
>> +{
>> +  const char* hsa_error;
>> +  hsa_status_string (status, _error);
>> +  GOMP_PLUGIN_fatal ("HSA fatal error: %s (%s)", str, hsa_error);
> 
> Like you do e.g. here?
> 
>> +/* Callback of dispatch queues to report errors.  */
>> +
>> +static void
>> +queue_callback(hsa_status_t status, hsa_queue_t* queue __attribute__ 
>> ((unused)),
> 
> Missing space before (.
>> +/* Callback of hsa_agent_iterate_regions.  Determine if a memory REGION can 
>> be
>> +   used for kernarg allocations and if so write it to the memory pointed to 
>> by
>> +   DATA and break the query.  */
>> +
>> +static hsa_status_t get_kernarg_memory_region (hsa_region_t region, void* 
>> data)
> 
> Missing newline between return type and function name.
> 
>> +  hsa_region_t* ret = (hsa_region_t*) data;
> 
> Two spots with wrong formatting above.
>> +{
>> +  if (agent->first_module)
>> +  agent->first_module->prev = module;
> 
> Wrong indentation.
> 
>> +  unsigned size = end - start;
>> +  char *buf = (char *) malloc (size);
>> +  memcpy (buf, start, size);
> 
> malloc could return NULL and you crash.  Should this use GOMP_PLUGIN_malloc
> instead?
> 
>> +  struct GOMP_hsa_kernel_dispatch *shadow = GOMP_PLUGIN_malloc_cleared
>> +(sizeof (struct GOMP_hsa_kernel_dispatch));
> 
> Formatting, = should go on the next line, and if even that doesn't help,
> maybe use sizeof (*shadow)?
>> +
>> +  shadow->queue = agent->command_q;
>> +  shadow->omp_data_memory = omp_data_size > 0
>> +? GOMP_PLUGIN_malloc (omp_data_size) : NULL;
> 
> = should go on the next line.
> 
>> +  unsigned dispatch_count = kernel->dependencies_count;
>> +  shadow->kernel_dispatch_count = dispatch_count;
>> +
>> +  shadow->children_dispatches = GOMP_PLUGIN_malloc
>> +(dispatch_count * sizeof (struct GOMP_hsa_kernel_dispatch *));
> 
> Likewise.
>> +indent_stream (FILE *f, unsigned indent)
>> +{
>> +  for (int i = 0; i < indent; i++)
>> +fputc (' ', f);
> 
> Perhaps fprintf (f, "%*s", indent, "");
> instead?
> 
>> +  struct GOMP_hsa_kernel_dispatch *shadow = create_single_kernel_dispatch
>> +(kernel, omp_data_size);
> 
> Formatting issues again.
> 
>> +  shadow->omp_num_threads = 64;
>> +  shadow->debug = 0;
>> +  shadow->omp_level = kernel->gridified_kernel_p ? 1 : 0;
>> +
>> +  /* Create kernel dispatch data structures.  We do not allow to have
>> + a kernel dispatch with depth bigger than one.  */
>> +  for (unsigned i = 0; i < kernel->dependencies_count; i++)
>> +{
>> +  struct kernel_info *dependency = get_kernel_for_agent
>> +(kernel->agent, kernel->dependencies[i]);
>> +  shadow->children_dispatches[i] = create_single_kernel_dispatch
>> +(dependency, omp_data_size);
>> +  shadow->children_dispatches[i]->queue =
> 
> Never put = at the end of line.
>> +kernel->agent->kernel_dispatch_command_q;
> 
>

Re: [hsa 3/10] HSA libgomp plugin [part 1/2]

2015-12-10 Thread Martin Liška

On 12/09/2015 01:15 PM, Jakub Jelinek wrote:
> On Mon, Dec 07, 2015 at 12:20:49PM +0100, Martin Jambor wrote:
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +#include "libgomp-plugin.h"
>> +#include "gomp-constants.h"
>> +#include "hsa.h"
>> +#include "hsa_ext_finalize.h"
> 
> If these 2 headers are coming from outside of gcc, better use <> for them
> instead of "", and better place them above the libgomp specific ones.
> 
>> +#include "dlfcn.h"
> 
> Ditto.
> 
>> +/* Flag to decide whether print to stderr information about what is going 
>> on.
>> +   Set in init_debug depending on environment variables.  */
>> +
>> +static bool debug;
>> +
>> +/* Flag to decide if the runtime should suppress a possible fallback to host
>> +   execution.  */
>> +
>> +static bool suppress_host_fallback;
>> +
>> +/* Initialize debug and suppress_host_fallback according to the 
>> environment.  */
>> +
>> +static void
>> +init_enviroment_variables (void)
>> +{
>> +  if (getenv ("HSA_DEBUG"))
>> +debug = true;
>> +  else
>> +debug = false;
>> +
>> +  if (getenv ("HSA_SUPPRESS_HOST_FALLBACK"))
>> +suppress_host_fallback = true;
>> +  else
>> +suppress_host_fallback = false;
> 
> Wonder whether we want these custom env vars in suid programs, allowing
> users to change behavior might be undesirable.  So perhaps use secure_getenv
> instead of getenv if found by configure?
>> +
>> +  const char* hsa_error;
> 
> Space before * instead of after it (multiple spots).
> 
>> +  hsa_status_string (status, _error);
>> +
>> +  unsigned l = strlen (hsa_error);
>> +
>> +  char *err = GOMP_PLUGIN_malloc (sizeof (char) * l);
> 
> sizeof (char) is 1, please don't multiply by it, that is just confusing.
> It makes sense in macros where you don't know what the type really is, but
> not here.
> 
>> +  memcpy (err, hsa_error, l - 1);
>> +  err[l] = '\0';
>> +
>> +  fprintf (stderr, "HSA warning: %s (%s)\n", str, err);
> 
> Why do you allocate err at all, if you free it right away?  Can't you use
> hsa_error directly instead?
>> +
>> +  free (err);
> 
>> +static void
>> +hsa_fatal (const char *str, hsa_status_t status)
>> +{
>> +  const char* hsa_error;
>> +  hsa_status_string (status, _error);
>> +  GOMP_PLUGIN_fatal ("HSA fatal error: %s (%s)", str, hsa_error);
> 
> Like you do e.g. here?
> 
>> +/* Callback of dispatch queues to report errors.  */
>> +
>> +static void
>> +queue_callback(hsa_status_t status, hsa_queue_t* queue __attribute__ 
>> ((unused)),
> 
> Missing space before (.
>> +/* Callback of hsa_agent_iterate_regions.  Determine if a memory REGION can 
>> be
>> +   used for kernarg allocations and if so write it to the memory pointed to 
>> by
>> +   DATA and break the query.  */
>> +
>> +static hsa_status_t get_kernarg_memory_region (hsa_region_t region, void* 
>> data)
> 
> Missing newline between return type and function name.
> 
>> +  hsa_region_t* ret = (hsa_region_t*) data;
> 
> Two spots with wrong formatting above.
>> +{
>> +  if (agent->first_module)
>> +  agent->first_module->prev = module;
> 
> Wrong indentation.
> 
>> +  unsigned size = end - start;
>> +  char *buf = (char *) malloc (size);
>> +  memcpy (buf, start, size);
> 
> malloc could return NULL and you crash.  Should this use GOMP_PLUGIN_malloc
> instead?
> 
>> +  struct GOMP_hsa_kernel_dispatch *shadow = GOMP_PLUGIN_malloc_cleared
>> +(sizeof (struct GOMP_hsa_kernel_dispatch));
> 
> Formatting, = should go on the next line, and if even that doesn't help,
> maybe use sizeof (*shadow)?
>> +
>> +  shadow->queue = agent->command_q;
>> +  shadow->omp_data_memory = omp_data_size > 0
>> +? GOMP_PLUGIN_malloc (omp_data_size) : NULL;
> 
> = should go on the next line.
> 
>> +  unsigned dispatch_count = kernel->dependencies_count;
>> +  shadow->kernel_dispatch_count = dispatch_count;
>> +
>> +  shadow->children_dispatches = GOMP_PLUGIN_malloc
>> +(dispatch_count * sizeof (struct GOMP_hsa_kernel_dispatch *));
> 
> Likewise.
>> +indent_stream (FILE *f, unsigned indent)
>> +{
>> +  for (int i = 0; i < indent; i++)
>> +fputc (' ', f);
> 
> Perhaps fprintf (f, "%*s", indent, "");
> instead?
> 
>> +  struct GOMP_hsa_kernel_dispatch *shadow = create_single_kernel_dispatch
>> +(kernel, omp_data_size);
> 
> Formatting issues again.
> 
>> +  shadow->omp_num_threads = 64;
>> +  shadow->debug = 0;
>> +  shadow->omp_level = kernel->gridified_kernel_p ? 1 : 0;
>> +
>> +  /* Create kernel dispatch data structures.  We do not allow to have
>> + a kernel dispatch with depth bigger than one.  */
>> +  for (unsigned i = 0; i < kernel->dependencies_count; i++)
>> +{
>> +  struct kernel_info *dependency = get_kernel_for_agent
>> +(kernel->agent, kernel->dependencies[i]);
>> +  shadow->children_dispatches[i] = create_single_kernel_dispatch
>> +(dependency, omp_data_size);
>> +  shadow->children_dispatches[i]->queue =
> 
> Never put = at the end of line.
>> +kernel->agent->kernel_dispatch_command_q;
> 
>

[PATCH] gcc: read -fdebug-prefix-map OLD from environment (improved reproducibility)

2015-12-10 Thread Daniel Kahn Gillmor

Work on the reproducible-builds project [0] has identified that build
paths are one cause of output variation between builds.  This
changeset allows users to avoid this variation when building C objects
with debug symbols, while leaving the default behavior unchanged.

Background
--

gcc includes the build path in any generated DWARF debugging symbols,
specifically in DW_AT_comp_dir, but allows the embedded path to be
changed via -fdebug-prefix-map.

When -fdebug-prefix-map is used with the current build path, it
removes the build path from DW_AT_comp_dir but places it instead in
DW_AT_producer, so the reproducibility problem isn't resolved.

When building software for binary redistribution, the actual build
path on the build machine is irrelevant, and doesn't need to be
exposed in the debug symbols.

Resolution
--

This patch extends the first argument to -fdebug-prefix-map ("old") to
be able to read from the environment, which allows a packager to avoid
embedded build paths in the debugging symbols with something like:

  export SOURCE_BUILD_DIR="$(pwd)"
  gcc -fdebug-prefix-map=\$SOURCE_BUILD_DIR=/usr/src

Details
---

Specifically, if the first character of the "old" argument is a
literal $, then gcc will treat it as an environment variable name, and
use the value of the env var for prefix mapping.

As a result, DW_AT_producer contains the literal envvar name,
DW_AT_comp_dir contains the transformed build path, and the actual
build path is not at all present in the generated object file.

This has been tested successfully on amd64 machines, and i see no
reason why it would be platform-specific.

More discussion of alternate approaches considered and discarded in
the development of this change can be found at [1] for those
interested.

Feedback welcome!

[0] https://reproducible-builds.org
[1] 
https://lists.alioth.debian.org/pipermail/reproducible-builds/Week-of-Mon-20151130/004051.html
---
 gcc/doc/invoke.texi |  4 +++-
 gcc/final.c | 27 +--
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5256031..234432f 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -6440,7 +6440,9 @@ link processing time.  Merging is enabled by default.
 @item -fdebug-prefix-map=@var{old}=@var{new}
 @opindex fdebug-prefix-map
 When compiling files in directory @file{@var{old}}, record debugging
-information describing them as in @file{@var{new}} instead.
+information describing them as in @file{@var{new}} instead.  If
+@file{@var{old}} starts with a @samp{$}, the corresponding environment
+variable will be dereferenced, and its value will be used instead.
 
 @item -fno-dwarf2-cfi-asm
 @opindex fdwarf2-cfi-asm
diff --git a/gcc/final.c b/gcc/final.c
index 8cb5533..bc43b61 100644
--- a/gcc/final.c
+++ b/gcc/final.c
@@ -1525,6 +1525,9 @@ add_debug_prefix_map (const char *arg)
 {
   debug_prefix_map *map;
   const char *p;
+  char *env;
+  const char *old;
+  size_t oldlen;
 
   p = strchr (arg, '=');
   if (!p)
@@ -1532,9 +1535,29 @@ add_debug_prefix_map (const char *arg)
   error ("invalid argument %qs to -fdebug-prefix-map", arg);
   return;
 }
+  if (*arg == '$')
+{
+  env = xstrndup (arg+1, p - (arg+1));
+  old = getenv(env);
+  if (!old)
+   {
+ warning (0, "environment variable %qs not set in argument to "
+  "-fdebug-prefix-map", env);
+ free(env);
+ return;
+   }
+  oldlen = strlen(old);
+  free(env);
+}
+  else
+{
+  old = xstrndup (arg, p - arg);
+  oldlen = p - arg;
+}
+
   map = XNEW (debug_prefix_map);
-  map->old_prefix = xstrndup (arg, p - arg);
-  map->old_len = p - arg;
+  map->old_prefix = old;
+  map->old_len = oldlen;
   p++;
   map->new_prefix = xstrdup (p);
   map->new_len = strlen (p);
-- 
2.6.2

Re: [RFA] [PATCH] [PR tree-optimization/68619] Avoid direct cfg cleanups in tree-ssa-dom.c [4/3] v2


On 12/10/2015 10:05 AM, Uros Bizjak wrote:

Finally the mechanical changes necessary due to the API change in the walker.


You forgot to change the graphite part, as in the attached patch.

2015-12-10  Uros Bizjak  

 PR tree-optimization/68619
 * graphite-scop-detection.c (gather_bbs::before_dom_children):
 Change return type to an edge.  Always return NULL.

OK for mainline after successful bootstrap and regtest?

Yes.

Sorry about that.

jeff

[google][gcc-4_9] update hardreg costs only when conflict_costs[] < 0

2015-12-10 Thread Wei Mi

Hi,

The patch is for google branch only.

In r216697, when a hardreg is assigned to an allocno, a positive cost
will be added to those conflict allocnos to reflect the disfavor of
the hardreg.

However, the fact that conflict allocno disfavors a hard_regno doesn't
necessarily mean current allocno should prefer the hard_regno, so it
is incorrect to update the costs of an allocno directly according to
its conflict allocnos. The patch changes the code to update costs[i]
of an allocno only when conflict_costs[i] < 0, .i.e, when conflict
allocno prefer hardreg i.

Another issue is the costs of an allocno is updated only when the
conflict allocno is not marked as may_be_spilled_p. However, even if a
conflict allocno is marked as may_be_spilled_p right now, it still has
high probablity to get colored later. It is not right to ignore the
preferences from those conflict allocnos marked as may_be_spilled_p.
The patch changes it.

Test:
Gcc unit tests ok.
Minor improvement for google internal benchmarks.

Thanks,
Wei.
gcc/ChangeLog:

2015-12-10  Wei Mi  

* ira-color.c (restore_costs_from_conflicts): Don't record the
cost change.
(update_conflict_hard_regno_costs): Update costs[i] only when
conflict_costs[i] < 0.
(assign_hard_reg): Ditto.


Index: gcc/ira-color.c
===
--- gcc/ira-color.c (revision 231143)
+++ gcc/ira-color.c (working copy)
@@ -1588,9 +1588,11 @@ restore_costs_from_conflicts (ira_allocn
prev = curr;
}
  /* Propagate the disfavor of hardreg from conflict_a to the
-allocnos connecting with conflict_a via copies.  */
+allocnos connecting with conflict_a via copies.
+Note: once the hardreg is assigned to a, it will not be
+changed, so we don't need to record this change. */
  update_costs_from_allocno (conflict_a, hardreg,
-1, false, true, true);
+1, false, false, true);
}
 }
 }
@@ -1601,7 +1603,7 @@ restore_costs_from_conflicts (ira_allocn
update increases chances to remove some copies.  */
 static void
 update_conflict_hard_regno_costs (int *costs, enum reg_class aclass,
- bool decr_p)
+ bool decr_p, bool conflict)
 {
   int i, cost, class_size, freq, mult, div, divisor;
   int index, hard_regno;
@@ -1682,7 +1684,16 @@ update_conflict_hard_regno_costs (int *c
cont_p = true;
if (decr_p)
  cost = -cost;
-   costs[index] += cost;
+   /* conflict being true indicates this is updating costs[]
+  according to preferences of allocnos connected by copies
+  to the conflict allocnos.
+  The fact conflict allocno disfavors hard_regno doesn't
+  necessarily mean current allocno should prefer hard_regno
+  (actually only a little), so we update costs[] only
+  when conflict allocno prefers hard_regno, .i.e, when
+  conflict_costs[i] < 0. */
+   if (conflict && conflict_costs [i] < 0)
+ costs[index] += cost;
  }
  }
/* Probably 5 hops will be enough.  */
@@ -1934,7 +1945,6 @@ assign_hard_reg (ira_allocno_t a, bool r
}
}
  else if (! retry_p
-  && ! ALLOCNO_COLOR_DATA (conflict_a)->may_be_spilled_p
   /* Don't process the conflict allocno twice.  */
   && (ALLOCNO_COLOR_DATA (conflict_a)->last_process
   != curr_allocno_process))
@@ -1967,7 +1977,13 @@ assign_hard_reg (ira_allocno_t a, bool r
  ->new_conflict_hard_regs,
  hard_regno))
  continue;
-   full_costs[j] -= conflict_costs[k];
+   /* The fact conflict_a disfavors hard_regno doesn't
+  necessarily mean current allocno should prefer
+  hard_regno so much (only a little), so we only
+  update full_costs[] when conflict_a prefers
+  hard_regno, .i.e, when conflict_costs[k] < 0. */
+   if (conflict_costs[k] < 0)
+ full_costs[j] -= conflict_costs[k];
  }
  queue_update_cost (conflict_a, NULL, COST_HOP_DIVISOR);
 
@@ -1977,7 +1993,7 @@ assign_hard_reg (ira_allocno_t a, bool r
   if (! retry_p)
 /* Take into account preferences of allocnos connected by copies to
the conflict allocnos.  */
-update_conflict_hard_regno_costs (full_costs, aclass, true);
+update_conflict_hard_regno_costs (full_costs, aclass, true, true);
 
   /* Take preferences of allocnos connected

Re: [v3] avoid alignment of static variables affecting stack's


On 12/10/2015 05:07 PM, Jan Beulich wrote:

If not reaching

   if (TREE_CODE (origvar) == SSA_NAME)
 {
   gcc_assert (TREE_CODE (var) != VAR_DECL
  || (!DECL_EXTERNAL (var)
  && !DECL_HAS_VALUE_EXPR_P (var)
  && !TREE_STATIC (var)
  && TREE_TYPE (var) != error_mark_node
  && !DECL_HARD_REGISTER (var)
  && really_expand));
 }

in that case is not a problem, this would make for even simpler a
change. Please let me know.


I think that's fine. BTW there's a is_global_var predicate that 
{c,sh}ould be used for your test.



Bernd

Re: [hsa 1/10] Configury changes and new options

Hi,

On Tue, Dec 08, 2015 at 10:43:15PM +, Richard Sandiford wrote:
> [Sorry for the low-quality review, was just reading out of interest...]
> 
> Martin Jambor  writes:
> > +If you configure GCC with HSA offloading but do not have the HSA
> > +run-time library installed in a standard location then you can
> > +explicitely specify the directory where they are installed.  The
> 
> typo: explicitly

oops.  For some reason, my spell-checker accepts this typo.  I will
fix it.

> 
> > diff --git a/gcc/lto-wrapper.c b/gcc/lto-wrapper.c
> > index e4772d1..5609207 100644
> > --- a/gcc/lto-wrapper.c
> > +++ b/gcc/lto-wrapper.c
> > @@ -745,6 +745,11 @@ compile_images_for_offload_targets (unsigned in_argc, 
> > char *in_argv[],
> >offload_names = XCNEWVEC (char *, num_targets + 1);
> >for (unsigned i = 0; i < num_targets; i++)
> >  {
> > +  /* HSA does not use LTO-like streaming and a different compiler, skip
> > +it. */
> > +  if (strncmp(names[i], "hsa", 3) == 0)
> > +   continue;
> > +
> >offload_names[i]
> > = compile_offload_image (names[i], compiler_path, in_argc, in_argv,
> >  compiler_opts, compiler_opt_count,
> 
> Looks like this would cause the caller loop:
> 
>   if (offload_names)
>   {
> find_offloadbeginend ();
> for (i = 0; offload_names[i]; i++)
>   printf ("%s\n", offload_names[i]);
> free_array_of_ptrs ((void **) offload_names, i);
>   }
> 
> to terminate early if there was another target after hsa.
> 

Good catch.  I have modified this code so that it never leaves any
holes in offload_names[i].

> names[i] is null-terminated, so it looks like you're deliberately
> allowing anything that starts with "hsa" here, but:

Right, and that was probably a mistake, I have changed the check to
simple strcmp.

> 
> > diff --git a/gcc/opts.c b/gcc/opts.c
> > index 874c84f..5647f0c 100644
> > --- a/gcc/opts.c
> > +++ b/gcc/opts.c
> > @@ -1906,8 +1906,35 @@ common_handle_option (struct gcc_options *opts,
> >break;
> >  
> >  case OPT_foffload_:
> > -  /* Deferred.  */
> > -  break;
> > +  {
> > +   const char *p = arg;
> > +   opts->x_flag_disable_hsa = true;
> > +   while (*p != 0)
> > + {
> > +   const char *comma = strchr (p, ',');
> > +
> > +   if ((strncmp (p, "disable", 7) == 0)
> > +   && (p[7] == ',' || p[7] == '\0'))
> > + {
> > +   opts->x_flag_disable_hsa = true;
> > +   break;
> > + }
> > +
> > +   if ((strncmp (p, "hsa", 3) == 0)
> > +   && (p[3] == ',' || p[3] == '\0'))
> > + {
> > +#ifdef ENABLE_HSA
> > +   opts->x_flag_disable_hsa = false;
> > +#else
> > +   sorry ("HSA has not been enabled during configuration");
> > +#endif
> 
> ...here you only allow "hsa" itself.
> 
> (Not your fault, but: do we have any documentation for -foffload
> and -foffload-abi?  Couldn't see any in the texi files.)

Yes, that is actually PR 67300.  However, I do not understand the more
complex forms the parameter can take enough to attempt to fix it.

In order to address all for you concerns, I am going to install the
following on the branch.

Thanks for the feedback,

Martin


2015-12-09  Martin Jambor  

* lto-wrapper.c (compile_images_for_offload_targets): Do not leave
holes in offload_names.  Use strcmp instead strncmp.
* doc/install.texi (--with-hsa-runtime): Fix typo.
---
 gcc/doc/install.texi | 2 +-
 gcc/lto-wrapper.c| 8 +---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/gcc/doc/install.texi b/gcc/doc/install.texi
index afd891c..a85a063 100644
--- a/gcc/doc/install.texi
+++ b/gcc/doc/install.texi
@@ -1993,7 +1993,7 @@ compiler will emit the accelerator code, no path should 
be specified.
 
 If you configure GCC with HSA offloading but do not have the HSA
 run-time library installed in a standard location then you can
-explicitely specify the directory where they are installed.  The
+explicitly specify the directory where they are installed.  The
 @option{--with-hsa-runtime=@/@var{hsainstalldir}} option is a
 shorthand for
 @option{--with-hsa-runtime-lib=@/@var{hsainstalldir}/lib} and
diff --git a/gcc/lto-wrapper.c b/gcc/lto-wrapper.c
index 5609207..5b58fd6 100644
--- a/gcc/lto-wrapper.c
+++ b/gcc/lto-wrapper.c
@@ -736,6 +736,7 @@ compile_images_for_offload_targets (unsigned in_argc, char 
*in_argv[],
 return;
   unsigned num_targets = parse_env_var (target_names, , NULL);
 
+  int next_name_entry = 0;
   const char *compiler_path = getenv ("COMPILER_PATH");
   if (!compiler_path)
 goto out;
@@ -747,16 +748,17 @@ compile_images_for_offload_targets (unsigned in_argc, 
char *in_argv[],
 {
   /* HSA does not use LTO-like streaming and a different compiler, skip
 it. */
-  if (strncmp(names[i], "hsa", 3) == 0)
+  if (strcmp (names[i], "hsa") == 0)
continue;
 
-

Re: [hsa 1/10] Configury changes and new options

Hi,

On Mon, Dec 07, 2015 at 12:19:08PM +0100, Martin Jambor wrote:
> Hi,
> 
> this patch contains changes to the configuration mechanism and offload
> bits, so that users can build compilers with HSA support.

when writing up how to build an HSA-enabled GCC for the wiki page, and
checking the process actually works I realized that AMD no longer
ships libhsakmt as a part of the run time.  So we either have to tell
users to copy the library over to the same directory where the
run-time is (what I did on my machines and then forgot about it) or
provide one more configuration option, otherwise configure libgomp
fails.  The patch below does the second.  I have verified that the
configuration works as intended fo freshly downloaded/built HSA
run-time and libhsakmt.  I am going to commit it to the branch shortly
and of course would need it as part of hsa merge,

Sorry for realizing this so late,

Martin

2015-12-09  Martin Jambor  

libgomp/
* plugin/configfrag.ac (hsa-kmt-lib): New.

gcc/
* doc/install.texi (Configuration): Document --with-hsa-kmt-lib.
---
 gcc/doc/install.texi |  5 +
 libgomp/configure| 17 +++--
 libgomp/plugin/configfrag.ac |  8 
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/gcc/doc/install.texi b/gcc/doc/install.texi
index 232586d..afd891c 100644
--- a/gcc/doc/install.texi
+++ b/gcc/doc/install.texi
@@ -1999,6 +1999,11 @@ shorthand for
 @option{--with-hsa-runtime-lib=@/@var{hsainstalldir}/lib} and
 @option{--with-hsa-runtime-include=@/@var{hsainstalldir}/include}.

+@item --with-hsa-kmt-lib=@var{pathname}
+
+If you configure GCC with HSA offloading but do not have the HSA
+KMT library installed in a standard location then you can
+explicitly specify the directory where it resides.
 @end table

 @subheading Cross-Compiler-Specific Options
diff --git a/libgomp/plugin/configfrag.ac b/libgomp/plugin/configfrag.ac
index c50e5cb..fd77429 100644
--- a/libgomp/plugin/configfrag.ac
+++ b/libgomp/plugin/configfrag.ac
@@ -118,6 +118,14 @@ if test "x$HSA_RUNTIME_LIB" != x; then
   HSA_RUNTIME_LDFLAGS=-L$HSA_RUNTIME_LIB
 fi

+AC_ARG_WITH(hsa-kmt-lib,
+   [AS_HELP_STRING([--with-hsa-kmt-lib=PATH],
+   [specify directory for installed HSA KMT library.])])
+if test "x$with_hsa_kmt_lib" != x; then
+  HSA_RUNTIME_LDFLAGS="$HSA_RUNTIME_LDFLAGS -L$with_hsa_kmt_lib"
+  HSA_RUNTIME_LIB=
+fi
+
 PLUGIN_HSA=0
 PLUGIN_HSA_CPPFLAGS=
 PLUGIN_HSA_LDFLAGS=
-- 
2.6.3

Re: [PATCH][combine] Don't create LSHIFTRT of zero bits in change_zero_ext

2015-12-10 Thread Segher Boessenkool

On Thu, Dec 10, 2015 at 05:05:12PM +0100, Bernd Schmidt wrote:
> On 12/10/2015 03:36 PM, Kyrill Tkachov wrote:
> >I'm okay with delaying this for next stage 1 if people prefer, though I
> >think it's
> >pretty low risk.
> 
> I think this is something we should fix now.

I agree.

> >+  x = XEXP (x, 0);
> >+  if (start > 0)
> >+x = gen_rtx_LSHIFTRT (mode, x, GEN_INT (start));
> 
> I think this should just use simplify_shift_const. gen_rtx_FOO should be 
> avoided.

A lot of combine does that, it really is stuck in the 80's.  I wouldn't
use simplify_shift_const here, but simply simplify_gen_binary.

The patch is okay with or without that change.


Segher

Reduce global decl stream

2015-12-10 Thread Jan Hubicka

Hi,
this patch saves about 30% of global decl stream size in firefox.  While
implementing the lto sections for initializers I put very stupid heursitcs
to get_symbol_initial_value deciding whether the initializer is better streamed
inline or offline.  This ignores strings and may get bit out of hand.

With this patch and the compression, the largest ltrans unit is 
118479156 bytes and 103584016 out of that is a global decl stream.

Bootstrapped/regtested x86_64-linux OK?

Honza

* lto-streamer-out.c (subtract_estimated_size): New function
(get_symbol_initial_value): Use it.
Index: lto-streamer-out.c
===
--- lto-streamer-out.c  (revision 231546)
+++ lto-streamer-out.c  (working copy)
@@ -309,6 +309,33 @@ lto_is_streamable (tree expr)
 || TREE_CODE_CLASS (code) != tcc_statement);
 }
 
+/* Very rough estimate of streaming size of the initializer.  If we ignored
+   presence of strings, we could simply just count number of non-indexable
+   tree nodes and number of references to indexable nodes.  Strings however
+   may be very large and we do not want to dump them int othe global stream.
+
+   Count the size of initializer until the size in DATA is positive.  */
+
+static tree
+subtract_estimated_size (tree *tp, int *ws, void *data)
+{
+  long *sum = (long *)data;
+  if (tree_is_indexable (*tp))
+{
+  *sum -= 4;
+  *ws = 0;
+}
+  if (TREE_CODE (*tp) == STRING_CST)
+*sum -= TREE_STRING_LENGTH (*tp) + 8;
+  if (TREE_CODE (*tp) == IDENTIFIER_NODE)
+*sum -= IDENTIFIER_LENGTH (*tp) + 8;
+  else
+*sum -= 16;
+  if (*sum < 0)
+return *tp;
+  return NULL_TREE;
+}
+
 
 /* For EXPR lookup and return what we want to stream to OB as DECL_INITIAL.  */
 
@@ -329,10 +356,16 @@ get_symbol_initial_value (lto_symtab_enc
   varpool_node *vnode;
   /* Extra section needs about 30 bytes; do not produce it for simple
 scalar values.  */
-  if (TREE_CODE (DECL_INITIAL (expr)) == CONSTRUCTOR
- || !(vnode = varpool_node::get (expr))
+  if (!(vnode = varpool_node::get (expr))
  || !lto_symtab_encoder_encode_initializer_p (encoder, vnode))
 initial = error_mark_node;
+  if (initial != error_mark_node)
+   {
+ long max_size = 30;
+ if (walk_tree (, subtract_estimated_size, (void *)_size,
+NULL))
+   initial = error_mark_node;
+   }
 }
 
   return initial;

Register pressure aware loop unrolling

2015-12-10 Thread Shiva Chen

Hi all,

Loop unrolling would decrease performance in some case due to unrolling high 
register pressure
loops and produce lots of spill code.

I try to implement register pressure aware loop 
unrolling(-funroll-loops-pressure-aware)
to avoid unrolling high register pressure loops.

The idea is to calculate live register number for each basic block to estimate 
register pressure.
If the loop contain the basic block with high register pressure, don't 
unrolling it.

Register pressure estimate to high if (live_reg_num > available_hard_reg_num)
like ira and gcse did.

However, loop unrolling may increase the register pressure.
Therefore, when live register number near to available hard register number,
it's not profitable to unroll it.
E.g. live_reg_num = 12, available_hard_reg_num = 14

To estimate the register pressure increment after unrolling, adding parameter 
PARAM_LOOP_UNROLL_PRESSURE_INCREMENT.

The equation become 

high_reg_pressure_p = true if (live_reg_num + 
PARAM_LOOP_UNROLL_PRESSURE_INCREMENT > available_hard_reg_num)

Bootstrapped and tested on x86-64.

Any suggestion ?

Thanks,
Shiva


2015-12-11  Shiva Chen  

* cfgloop.h (struct loop): Add high_reg_pressure_p field.
(UAP_UNROLL_PRESSURE_AWARE): New enums.
* reg-pressure.h (struct bb_data): Data stored for each basic block.
* common.opt: Add new flag -funroll-loops-pressure-aware.
* params.def: Add parameter PARAM_LOOP_UNROLL_PRESSURE_INCREMENT.
* loop-init.c(pass_rtl_unroll_loops:gate):
Enable unrolling while -funroll-loops-pressure-aware enable.
(pass_rtl_unroll_loops:execute): Likewise.
* toplev.c (process_options): Likewise.
* loop-unroll.c (decide_unroll_constant_iterations):
Not unroll the loop with high register pressure 
if -funroll-loops-pressure-aware enable.
(decide_unroll_runtime_iterations): Likewise.
(decide_unroll_stupid): Likewise.
* reg-pressure.c (get_regno_pressure_class): Get pressure class.
(change_pressure): Change register pressure while scan basic blocks.
(calculate_bb_reg_pressure): Calculate register pressure
for each basic block.
(check_register_pressure): Determine high_reg_pressure_p for each loop.
(calculate_loop_pressure): Calculate register pressure for each loop.

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index d2d09f6..6c62b63 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1384,6 +1384,7 @@ OBJS = \
reginfo.o \
regrename.o \
regstat.o \
+   reg-pressure.o \
reload.o \
reload1.o \
reorg.o \
diff --git a/gcc/cfgloop.h b/gcc/cfgloop.h
index ee73bf9..c4a06e7 100644
--- a/gcc/cfgloop.h
+++ b/gcc/cfgloop.h
@@ -191,6 +191,9 @@ struct GTY ((chain_next ("%h.next"))) loop {
   /* True if we should try harder to vectorize this loop.  */
   bool force_vectorize;

+  /* True if the loop have high register pressure.  */
+  bool high_reg_pressure_p;
+
   /* True if the loop is part of an oacc kernels region.  */
   bool in_oacc_kernels_region;

@@ -742,7 +745,8 @@ loop_optimizer_finalize ()
 enum
 {
   UAP_UNROLL = 1,  /* Enables unrolling of loops if it seems profitable.  
*/
-  UAP_UNROLL_ALL = 2   /* Enables unrolling of all loops.  */
+  UAP_UNROLL_ALL = 2,  /* Enables unrolling of all loops.  */
+  UAP_UNROLL_PRESSURE_AWARE = 4/* Enables unrolling of loops with 
pressure aware.  */
 };

 extern void doloop_optimize_loops (void);
diff --git a/gcc/common.opt b/gcc/common.opt
index e593631..a44c7dc 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2425,6 +2425,10 @@ funroll-all-loops
 Common Report Var(flag_unroll_all_loops) Optimization
 Perform loop unrolling for all loops.

+funroll-loops-pressure-aware
+Common Report Var(flag_unroll_loops_pressure_aware) Optimization
+Perform loop unrolling for low register pressure loops
+
 ; Nonzero means that loop optimizer may assume that the induction variables
 ; that control loops do not overflow and that the loops with nontrivial
 ; exit condition are not infinite
diff --git a/gcc/loop-init.c b/gcc/loop-init.c
index e32c94a..232465c 100644
--- a/gcc/loop-init.c
+++ b/gcc/loop-init.c
@@ -560,7 +560,8 @@ public:
   /* opt_pass methods: */
   virtual bool gate (function *)
 {
-  return (flag_peel_loops || flag_unroll_loops || flag_unroll_all_loops);
+  return (flag_peel_loops || flag_unroll_loops || flag_unroll_all_loops
+ || flag_unroll_loops_pressure_aware);
 }

   virtual unsigned int execute (function *);
@@ -580,6 +581,8 @@ pass_rtl_unroll_loops::execute (function *fun)
flags |= UAP_UNROLL;
   if (flag_unroll_all_loops)
flags |= UAP_UNROLL_ALL;
+  if (flag_unroll_loops_pressure_aware)
+   flags |= UAP_UNROLL_PRESSURE_AWARE;

   unroll_loops (flags);
 }
diff --git a/gcc/loop-unroll.c b/gcc/loop-unroll.c
index 3e26b21..cffd9140 100644
---

[v3] avoid alignment of static variables affecting stack's

2015-12-10 Thread Jan Beulich

Function (or more narrow) scope static variables (as well as others not
placed on the stack) should also not have any effect on the stack
alignment. I noticed the issue first with Linux'es dynamic_pr_debug()
construct using an 8-byte aligned sub-file-scope local variable.

According to my checking bad behavior started with 4.6.x (4.5.3 was
still okay), but generated code got quite a bit worse as of 4.9.0.

[v3: Re-base to current trunk.]
[v2: Drop inclusion of hard register variables, as requested by
 Jakub and Richard.]

gcc/
2015-12-10  Jan Beulich  

* cfgexpand.c (expand_one_var): Exclude static and external
variables when adjusting stack alignment related state.

gcc/testsuite/
2015-12-10  Jan Beulich  

* gcc.c-torture/execute/stkalign.c: New.

--- 2015-12-09/gcc/cfgexpand.c
+++ 2015-12-09/gcc/cfgexpand.c
@@ -1544,12 +1544,15 @@ static HOST_WIDE_INT
 expand_one_var (tree var, bool toplevel, bool really_expand)
 {
   unsigned int align = BITS_PER_UNIT;
+  bool stack = true;
   tree origvar = var;
 
   var = SSAVAR (var);
 
   if (TREE_TYPE (var) != error_mark_node && TREE_CODE (var) == VAR_DECL)
 {
+  stack = !TREE_STATIC (var) && !DECL_EXTERNAL (var);
+
   /* Because we don't know if VAR will be in register or on stack,
 we conservatively assume it will be on stack even if VAR is
 eventually put into register after RA pass.  For non-automatic
@@ -1578,7 +1581,8 @@ expand_one_var (tree var, bool toplevel,
align = POINTER_SIZE;
 }
 
-  record_alignment_for_reg_var (align);
+  if (stack)
+record_alignment_for_reg_var (align);
 
   if (TREE_CODE (origvar) == SSA_NAME)
 {
--- 2015-12-09/gcc/testsuite/gcc.c-torture/execute/stkalign.c
+++ 2015-12-09/gcc/testsuite/gcc.c-torture/execute/stkalign.c
@@ -0,0 +1,26 @@
+/* { dg-options "-fno-inline" } */
+
+#include 
+
+#define ALIGNMENT 64
+
+unsigned test(unsigned n, unsigned p)
+{
+  static struct { char __attribute__((__aligned__(ALIGNMENT))) c; } s;
+  unsigned x;
+
+  assert(__alignof__(s) == ALIGNMENT);
+  asm ("" : "=g" (x), "+m" (s) : "0" ());
+
+  return n ? test(n - 1, x) : (x ^ p);
+}
+
+int main (int argc, char *argv[] __attribute__((unused)))
+{
+  unsigned int x = test(argc, 0);
+
+  x |= test(argc + 1, 0);
+  x |= test(argc + 2, 0);
+
+  return !(x & (ALIGNMENT - 1));
+}



avoid alignment of static variables affecting stack's

Function (or more narrow) scope static variables (as well as others not
placed on the stack) should also not have any effect on the stack
alignment. I noticed the issue first with Linux'es dynamic_pr_debug()
construct using an 8-byte aligned sub-file-scope local variable.

According to my checking bad behavior started with 4.6.x (4.5.3 was
still okay), but generated code got quite a bit worse as of 4.9.0.

[v3: Re-base to current trunk.]
[v2: Drop inclusion of hard register variables, as requested by
 Jakub and Richard.]

gcc/
2015-12-10  Jan Beulich  

* cfgexpand.c (expand_one_var): Exclude static and external
variables when adjusting stack alignment related state.

gcc/testsuite/
2015-12-10  Jan Beulich  

* gcc.c-torture/execute/stkalign.c: New.

--- 2015-12-09/gcc/cfgexpand.c
+++ 2015-12-09/gcc/cfgexpand.c
@@ -1544,12 +1544,15 @@ static HOST_WIDE_INT
 expand_one_var (tree var, bool toplevel, bool really_expand)
 {
   unsigned int align = BITS_PER_UNIT;
+  bool stack = true;
   tree origvar = var;
 
   var = SSAVAR (var);
 
   if (TREE_TYPE (var) != error_mark_node && TREE_CODE (var) == VAR_DECL)
 {
+  stack = !TREE_STATIC (var) && !DECL_EXTERNAL (var);
+
   /* Because we don't know if VAR will be in register or on stack,
 we conservatively assume it will be on stack even if VAR is
 eventually put into register after RA pass.  For non-automatic
@@ -1578,7 +1581,8 @@ expand_one_var (tree var, bool toplevel,
align = POINTER_SIZE;
 }
 
-  record_alignment_for_reg_var (align);
+  if (stack)
+record_alignment_for_reg_var (align);
 
   if (TREE_CODE (origvar) == SSA_NAME)
 {
--- 2015-12-09/gcc/testsuite/gcc.c-torture/execute/stkalign.c
+++ 2015-12-09/gcc/testsuite/gcc.c-torture/execute/stkalign.c
@@ -0,0 +1,26 @@
+/* { dg-options "-fno-inline" } */
+
+#include 
+
+#define ALIGNMENT 64
+
+unsigned test(unsigned n, unsigned p)
+{
+  static struct { char __attribute__((__aligned__(ALIGNMENT))) c; } s;
+  unsigned x;
+
+  assert(__alignof__(s) == ALIGNMENT);
+  asm ("" : "=g" (x), "+m" (s) : "0" ());
+
+  return n ? test(n - 1, x) : (x ^ p);
+}
+
+int main (int argc, char *argv[] __attribute__((unused)))
+{
+  unsigned int x = test(argc, 0);
+
+  x |= test(argc + 1, 0);
+  x |= test(argc + 2, 0);
+
+  return !(x & (ALIGNMENT - 1));
+}

Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta

2015-12-10 Thread Tom de Vries

[ copy-pasting-with-quote from 
https://gcc.gnu.org/ml/gcc-patches/2015-12/msg00420.html , for some 
reason I didn't get this email ]



On Thu, 3 Dec 2015, Tom de Vries wrote:

The flag is set here in expand_omp_target:
...
12682 /* Prevent IPA from removing child_fn as unreachable,
 since there are no
12683refs from the parent function to child_fn in offload
 LTO mode.  */
12684 if (ENABLE_OFFLOADING)
12685   cgraph_node::get (child_fn)->mark_force_output ();
...



How are there no refs from the "parent"?  Are there not refs from
some kind of descriptor that maps fallback CPU and offloaded variants?


That descriptor is the offload table, which is emitted in 
omp_finish_file. The function iterates over vectors offload_vars and 
offload_funcs.


[ I would guess there's a one-on-one correspondance between 
symtab_node::offloadable and membership of either offload_vars or 
offload_funcs. ]



I think the above needs sorting out in somw way, making the refs
explicit rather than implicit via force_output.


I've tried an approach where I add a test for node->offloadable next to 
each test for node->force_output, except for the test in the nonlocal_p 
def in ipa_pta_execute. But I didn't (yet) manage to make that work.



I guess setting forced_by_abi instead would also mean child_fn is not removed
as unreachable, while still allowing optimizations:
...
  /* Like FORCE_OUTPUT, but in the case it is ABI requiring the symbol
 to be exported.  Unlike FORCE_OUTPUT this flag gets cleared to
 symbols promoted to static and it does not inhibit
 optimization.  */
  unsigned forced_by_abi : 1;
...

But I suspect that other optimizations (than ipa-pta) might break things.


How so?


Probably it's more accurate to say that I do not understand the 
difference very well between force_output and force_by_abi, and what is 
the class of optimizations enabled by using forced_by_abi instead of 
force_output.'



Essentially we have two situations:
- in the host compiler, there is no need for the forced_output flag,
  and it inhibits optimization
- in the accelerator compiler, it (or some equivalent) is needed


Actually, things are slightly more complicated, I realize now. There's 
also the distinction between:

- symbols declared as offloadable in the source code, and
- symbols create by the compiler and marked offloadable


I wonder if setting the force_output flag only when streaming the bytecode for
offloading would work. That way, it wouldn't be set in the host compiler,
while being set in the accelerator compiler.


Yeah, that was my original thinking btw.


FTR, I've tried that approach, as attached. It fixed the 
goacc/kernels-alias-ipa-pta*.c failures. And I ran target-libgomp (also 
using an accelerator configuration) without any regressions.


Thanks,
- Tom

Set force_output in offload stream if offloadable

---
 gcc/lto-cgraph.c | 2 +-
 gcc/omp-low.c| 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
index 62e5454..c862b19 100644
--- a/gcc/lto-cgraph.c
+++ b/gcc/lto-cgraph.c
@@ -511,7 +511,7 @@ lto_output_node (struct lto_simple_output_block *ob, struct cgraph_node *node,
   bp_pack_value (, node->local.versionable, 1);
   bp_pack_value (, node->local.can_change_signature, 1);
   bp_pack_value (, node->local.redefined_extern_inline, 1);
-  bp_pack_value (, node->force_output, 1);
+  bp_pack_value (, node->force_output || node->offloadable, 1);
   bp_pack_value (, node->forced_by_abi, 1);
   bp_pack_value (, node->unique_name, 1);
   bp_pack_value (, node->body_removed, 1);
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 5643480..569cfd7 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -12770,10 +12770,12 @@ expand_omp_target (struct omp_region *region)
 	assign_assembler_name_if_neeeded (child_fn);
   cgraph_edge::rebuild_edges ();
 
+#if 0
   /* Prevent IPA from removing child_fn as unreachable, since there are no
 	 refs from the parent function to child_fn in offload LTO mode.  */
   if (ENABLE_OFFLOADING)
 	cgraph_node::get (child_fn)->mark_force_output ();
+#endif
 
   /* Some EH regions might become dead, see PR34608.  If
 	 pass_cleanup_cfg isn't the first pass to happen with the

[PATCH] Fix PR68817



Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2015-12-10  Richard Biener  

PR tree-optimization/68817
* tree-vect-slp.c (vect_attempt_slp_rearrange_stmts): Detect
gaps early.

* gfortran.dg/pr68817.f90: New testcase.

Index: gcc/tree-vect-slp.c
===
--- gcc/tree-vect-slp.c (revision 231499)
+++ gcc/tree-vect-slp.c (working copy)
@@ -1237,6 +1237,8 @@ vect_attempt_slp_rearrange_stmts (slp_in
   bitmap_clear (load_index);
   FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
 {
+  if (lidx >= group_size)
+   return false;
   if (bitmap_bit_p (load_index, lidx))
{
  sbitmap_free (load_index);
Index: gcc/testsuite/gfortran.dg/pr68817.f90
===
--- gcc/testsuite/gfortran.dg/pr68817.f90   (revision 0)
+++ gcc/testsuite/gfortran.dg/pr68817.f90   (working copy)
@@ -0,0 +1,11 @@
+! { dg-do compile }
+! { dg-options "-O3 -ffast-math" }
+  SUBROUTINE TEST(A,B,C)
+  DIMENSION B(3),C(1000,10)
+  DO I = 1,3
+ I3=I*3
+ B(1) = B(1) + (C(K,I3+1)-A)
+ B(3) = B(3) + (C(K,I3+3)-A)
+  ENDDO
+  END
+

RE: [PATCH] [ARC] Add support for atomic memory built-in.

2015-12-10 Thread Claudiu Zissulescu

Patch applied:  Committed r231509

Thanks,
Claudiu

> -Original Message-
> From: Joern Wolfgang Rennecke [mailto:g...@amylaar.uk]
> Sent: Wednesday, December 09, 2015 6:11 AM
> To: Claudiu Zissulescu; gcc-patches@gcc.gnu.org
> Cc: francois.bed...@synopsys.com; jeremy.benn...@embecosm.com
> Subject: Re: [PATCH] [ARC] Add support for atomic memory built-in.
> 
> 
> 
> On 07/12/15 13:25, Claudiu Zissulescu wrote:
> >
> > Tested with dg.exp (when passing -matomic to gcc compiler line, the
> atomic tests are also successfully executed).
> The comment before "*memory_barrier" could use some elaboration on
> what it does for TARGET_HS.
> Otherwise, this is OK.

[Ada] Comment fixes

Tested on x86-64/Linux, applied on the mainline.


2015-12-10  Eric Botcazou  

* gcc-interface/gigi.h (create_var_decl): Adjust comment.
(create_subprog_decl): Likewise.
* gcc-interface/utils.c (create_var_decl): Likewise.
(create_subprog_decl): Likewise.

-- 
Eric BotcazouIndex: gcc-interface/gigi.h
===
--- gcc-interface/gigi.h	(revision 231498)
+++ gcc-interface/gigi.h	(working copy)
@@ -678,11 +678,11 @@ extern tree create_type_decl (tree name,
CONST_FLAG is true if this variable is constant, in which case we might
return a CONST_DECL node unless CONST_DECL_ALLOWED_P is false.
 
-   PUBLIC_FLAG is true if this definition is to be made visible outside of
-   the current compilation unit. This flag should be set when processing the
-   variable definitions in a package specification.
+   PUBLIC_FLAG is true if this is for a reference to a public entity or for a
+   definition to be made visible outside of the current compilation unit, for
+   instance variable definitions in a package specification.
 
-   EXTERN_FLAG is nonzero when processing an external variable declaration (as
+   EXTERN_FLAG is true when processing an external variable declaration (as
opposed to a definition: no storage is to be allocated for the variable).
 
STATIC_FLAG is only relevant when not at top level and indicates whether
@@ -694,6 +694,8 @@ extern tree create_type_decl (tree name,
 
DEBUG_INFO_P is true if we need to write debug information for it.
 
+   ATTR_LIST is the list of attributes to be attached to the variable.
+
GNAT_NODE is used for the position of the decl.  */
 extern tree create_var_decl (tree name, tree asm_name, tree type, tree init,
 			 bool const_flag, bool public_flag,
@@ -728,13 +730,17 @@ extern tree create_label_decl (tree name
the list of its parameters (a list of PARM_DECL nodes chained through the
DECL_CHAIN field).
 
-   INLINE_STATUS, CONST_FLAG, PUBLIC_FLAG, EXTERN_FLAG, VOLATILE_FLAG as well
-   as ATTR_LIST are used to set the appropriate fields in the FUNCTION_DECL.
+   INLINE_STATUS describes the inline flags to be set on the FUNCTION_DECL.
+
+   CONST_FLAG, PUBLIC_FLAG, EXTERN_FLAG, VOLATILE_FLAG are used to set the
+   appropriate flags on the FUNCTION_DECL.
 
ARTIFICIAL_P is true if the subprogram was generated by the compiler.
 
DEBUG_INFO_P is true if we need to write debug information for it.
 
+   ATTR_LIST is the list of attributes to be attached to the subprogram.
+
GNAT_NODE is used for the position of the decl.  */
 extern tree create_subprog_decl (tree name, tree asm_name, tree type,
  tree param_decl_list,
Index: gcc-interface/utils.c
===
--- gcc-interface/utils.c	(revision 231498)
+++ gcc-interface/utils.c	(working copy)
@@ -2344,6 +2344,8 @@ create_type_decl (tree name, tree type,
 
DEBUG_INFO_P is true if we need to write debug information for it.
 
+   ATTR_LIST is the list of attributes to be attached to the variable.
+
GNAT_NODE is used for the position of the decl.  */
 
 tree
@@ -3048,13 +3050,17 @@ create_label_decl (tree name, Node_Id gn
the list of its parameters (a list of PARM_DECL nodes chained through the
DECL_CHAIN field).
 
-   INLINE_STATUS, CONST_FLAG, PUBLIC_FLAG, EXTERN_FLAG, VOLATILE_FLAG as well
-   as ATTR_LIST are used to set the appropriate fields in the FUNCTION_DECL.
+   INLINE_STATUS describes the inline flags to be set on the FUNCTION_DECL.
+
+   CONST_FLAG, PUBLIC_FLAG, EXTERN_FLAG, VOLATILE_FLAG are used to set the
+   appropriate flags on the FUNCTION_DECL.
 
ARTIFICIAL_P is true if the subprogram was generated by the compiler.
 
DEBUG_INFO_P is true if we need to write debug information for it.
 
+   ATTR_LIST is the list of attributes to be attached to the subprogram.
+
GNAT_NODE is used for the position of the decl.  */
 
 tree

Re: [v3] avoid alignment of static variables affecting stack's


On 12/10/2015 01:38 PM, Jan Beulich wrote:


* cfgexpand.c (expand_one_var): Exclude static and external
variables when adjusting stack alignment related state.

gcc/testsuite/
2015-12-10  Jan Beulich  

* gcc.c-torture/execute/stkalign.c: New.

--- 2015-12-09/gcc/cfgexpand.c
+++ 2015-12-09/gcc/cfgexpand.c
@@ -1544,12 +1544,15 @@ static HOST_WIDE_INT
  expand_one_var (tree var, bool toplevel, bool really_expand)
  {
unsigned int align = BITS_PER_UNIT;
+  bool stack = true;
tree origvar = var;

var = SSAVAR (var);

if (TREE_TYPE (var) != error_mark_node && TREE_CODE (var) == VAR_DECL)
  {
+  stack = !TREE_STATIC (var) && !DECL_EXTERNAL (var);
+
/* Because we don't know if VAR will be in register or on stack,
 we conservatively assume it will be on stack even if VAR is
 eventually put into register after RA pass.  For non-automatic
@@ -1578,7 +1581,8 @@ expand_one_var (tree var, bool toplevel,
align = POINTER_SIZE;
  }

-  record_alignment_for_reg_var (align);
+  if (stack)
+record_alignment_for_reg_var (align);


A bit further down we have

  else if (DECL_EXTERNAL (var))
;
  else if (DECL_HAS_VALUE_EXPR_P (var))
;
  else if (TREE_STATIC (var))
;
[]
  return 0;

so I'm thinking the function doesn't do anything for DECL_EXTERNAL or 
TREE_STATIC vars. You're still computing alignment for them but not 
using it. I suggest just doing an early return for non-stack vars.



Bernd

[Ada] Fix internal error on call taking empty array Out parameter

This is a regression present on the mainline only, for a pathological case 
where an empty array is passed as Out parameter to a procedure, but the code 
is actually never executed because it's inside an empty loop...

Tested on x86-64/Linux, applied on the mainline.


2015-12-10  Eric Botcazou  

* gcc-interface/trans.c (Call_to_gnu): Remove guard for NULL_EXPR.
* gcc-interface/utils2.c (gnat_rewrite_reference) : Return
the reference unmodified.
: New case.  Likewise.


2015-12-10  Eric Botcazou  

* gnat.dg/array25.adb: New test.
* gnat.dg/array25_pkg.ad[sb]: New helper.


-- 
Eric BotcazouIndex: gcc-interface/trans.c
===
--- gcc-interface/trans.c	(revision 231498)
+++ gcc-interface/trans.c	(working copy)
@@ -4407,9 +4407,7 @@ Call_to_gnu (Node_Id gnat_node, tree *gn
   /* If it's possible we may need to use this expression twice, make sure
 	 that any side-effects are handled via SAVE_EXPRs; likewise if we need
 	 to force side-effects before the call.  */
-  if (Ekind (gnat_formal) != E_In_Parameter
-	  && !is_by_ref_formal_parm
-	  && TREE_CODE (gnu_name) != NULL_EXPR)
+  if (Ekind (gnat_formal) != E_In_Parameter && !is_by_ref_formal_parm)
 	{
 	  tree init = NULL_TREE;
 	  gnu_name = gnat_stabilize_reference (gnu_name, true, );
Index: gcc-interface/utils2.c
===
--- gcc-interface/utils2.c	(revision 231498)
+++ gcc-interface/utils2.c	(working copy)
@@ -2733,7 +2733,8 @@ gnat_rewrite_reference (tree ref, rewrit
   break;
 
 case ERROR_MARK:
-  return error_mark_node;
+case NULL_EXPR:
+  return ref;
 
 default:
   gcc_unreachable ();
-- { dg-do compile }

with Array25_Pkg;

procedure Array25 is

   package My_Pkg is new Array25_Pkg (0, 0);

begin
   null;
end;
package body Array25_Pkg is

   procedure Get_Inner (A : out Arr1) is
   begin
  null;
   end;

   procedure Get (A : out Arr2) is
   begin
  for I in Arr2'Range loop
 Get_Inner (A (I).Data);
  end loop;
   end;

end Array25_Pkg;
generic

   UB1 : Natural;

   UB2 : Natural;

package Array25_Pkg is

   type Arr1 is array (1 .. UB1) of Integer;

   type Rec is record
  Data : Arr1;
   end record;

   type Arr2  is array (1 .. UB2) of Rec;

   procedure Get (A : out Arr2);

end Array25_Pkg;

Re: [RFA] [PATCH] [PR tree-optimization/68619] Avoid direct cfg cleanups in tree-ssa-dom.c [0/3] v2

On Thu, Dec 10, 2015 at 12:23 AM, Jeff Law  wrote:
> Richi and I have been discussing revamping slightly how DOM handles
> conditionals which it detects are always true or always false.
>
> During gcc6 stage1 I added code to allow DOM to clean them up
> immediately, primarily to avoid the waste of having the threader handle
> those cases.  It was also believed that by cleaning things up during the
> DOM walk we could realize some secondary benefits (certain PHIs become
> more likely to collapse down to a const/copy which can then be propagated).
>
> That code causes an interesting problem as shown by 68619.  Essentially
> the CFG has 3 loops, one is a natural loop, the other two are irreducible.
>
> DOM finds conditionals which it can optimize to true/false.  It removes
> the unreachable edges and everything seems perfect.  Except that removal
> of those edges causes the irreducible loops become reducible.  This is a
> good thing, except
>
> Now we have two new natural loops, which triggers a checking failure
> because we haven't set up loop structures for the newly exposed natural
> loops.
>
> Richi's suggestion (before this problem was reported) was to have DOM
> leave the CFG alone, but otherwise optimize as-if the edges had been
> removed.  Final removal of the edges would be left to cfg_cleanup.  He
> also pointed me at SCCVN which does something similar.
>
> Further iteration with Richi has led us to extending the dom walker
> interface to directly support propagation of unexecutable properties for
> clients that want that improvement.
>
> To use the new capability, the client passes a new parameter to the walker
> constructor and in its before_dom_children callback the client returns
> either the taken edge when a COND, SWITCH or GOTO collapse to a single
> compile-time destination or NULL otherwise.
>
> The walker will take that information and determine which edges are no
> longer executable.  If that in turn causes blocks to become unreachable the
> walker can then mark further edges as non-executable.
>
> The walker clears EDGE_EXECUTABLE on any edges that are deemed not
> executable.  The client can use this to further refine analysis and
> optimizations.
>
> The walker will no longer call the before/after_dom_children callbacks for
> unreachable blocks.
>
> The change is broken into 4 parts for ease of review.  They must be
> committed together due to the API change in the walker.  They have been
> bootstrapped and regression tested as a unit on x86_64-linux-gnu.
>
>
> 1. Refactor the code from tree-ssa-sccvn.c into domwalk.c.  Essentially the
> constructor is no longer trivial as it may need to initialize
> EDGE_EXECUTABLE.  The return type of the before_dom_children callback
> changes from void to an edge.  Two additional private member functions are
> added to check a block for reachability and to propagate unexecutable
> properties.  Included are the changes to sccvn to use the new capabilities.
>
> 2. Use the new capabilities in tree-ssa-dom.c.
>
> 3. New tests.  One is the actual 68619 testcase.  Two ICEs for minor
> bugs found during development/testing, one case where we optimize better
> now than before, one for a missed optimization during development.
>
> 4. Mechanical changes to the other dom walkers. The return type of the
> before_dom_children callback changes from void to edge. For the callbacks
> changed in this patch, we always return NULL.
>
>
> How does this look now?

The series looks good to me.

Thanks,
Richard.

>
> Jeff

Re: [PATCH] Fix up noce_try_abs again (PR rtl-optimization/68670)

On Thu, Dec 10, 2015 at 02:20:04PM +0100, Bernd Schmidt wrote:
> This turns into a much simpler if statement, doesn't it? Ok with that
> change.

Ok, will do, thanks.

> >+/* PR rtl-optimization/68670 */
> >+/* { dg-do run } */
> >+/* { dg-options "-O2 -ftracer" } */
> 
> Curious what effect -ftracer has here?

In one case it was seeing all test_bb/then_bb/else_bb/join_bb,
where one of then_bb/else_bb (don't remember which one) contained
y = ~x and another one y = x, while the other case had no else_bb,
with already test_bb doing y = x and then_bb then doing y = ~y.
Correspondingly different condition, different negate value.

Jakub

Re: [PATCH] Fix detection of setrlimit in libstdc++ testsuite

2015-12-10 Thread Maxim Kuvyrkov

> On Nov 11, 2015, at 7:56 PM, Maxim Kuvyrkov  wrote:
> 
> Hi,
> 
> This patch fixes an obscure cross-testing problem that crashed (OOMed) our 
> boards at Linaro.  Several tests in libstdc++ (e.g., [1]) limit themselves to 
> some reasonable amount of RAM and then try to allocate 32 gigs.  
> Unfortunately, the configure test that checks presence of setrlimit is rather 
> strange: if target is native, then try compile file with call to setrlimit -- 
> if compilation succeeds, then use setrlimit, otherwise, ignore setrlimit.  
> The strange part is that the compilation check is done only for native 
> targets, as if cross-toolchains can't generate working executables.  [This is 
> rather odd, and I might be missing some underlaying caveat.]
> 
> Therefore, when testing a cross toolchain, the test [1] still tries to 
> allocate 32GB of RAM with no setrlimit restrictions.  On most targets that 
> people use for cross-testing this is not an issue because either
> - the target is 32-bit, so there is no 32GB user-space to speak of, or
> - the target board has small amount of RAM and no swap, so allocation 
> immediately fails, or
> - the target board has plenty of RAM, so allocating 32GB is not an issue.
> 
> However, if one is testing on a 64-bit board with 16GB or RAM and 16GB of 
> swap, then one gets into an obscure near-OOM swapping condition.  This is 
> exactly the case with cross-testing aarch64-linux-gnu toolchains on APM 
> Mustang.
> 
> The attached patch removes "native" restriction from configure test for 
> setrlimit.  This enables setrlimit restrictions on the testsuite, and the 
> test [1] expectedly fails to allocate 32GB due to setrlimit restriction.
> 
> I have tested it on x86_64-linux-gnu and i686-linux-gnu native toolchains, 
> and aarch64-linux-gnu and arm-linux-gnueabi[hf] cross-toolchains with no 
> regressions [*].
> 
> OK to commit?
> 
> I didn't go as far as enabling setenv/locale tests when cross-testing 
> libstdc++ because I remember of issues with generating locales in cross-built 
> glibc.  In any case, locale tests are unlikely to OOM the test board the way 
> that absence of setrlimit does.
> 
> [1] 27_io/ios_base/storage/2.cc
> 
> [*] Cross-testing using user-mode QEMU made 27_io/fpos/14775.cc execution 
> test to FAIL.  This test uses setrlimit set max file size, and is misbehaving 
> only under QEMU.  I believe this a QEMU issue with not handling setrlimit 
> correctly.
> 

Ping.

--
Maxim Kuvyrkov
www.linaro.org

[PATCH][RX] v2 instructions support.

2015-12-10 Thread Yoshinori Sato

Add RX v2 enhancement instructions support.

gcc/ChangeLog
2015/12/10  Yoshinori Sato  

* config/rx/constraints.md: Add constraint "q".
* doc/md.texi: Likewise.
* config/rx/rx-opts.h(rx_cpu_types): Add type RXV2.
* config/rx/rx.c(rx_print_operand): Add a0, a1 and extb.
(rx_expand_prologue): Use v2 instraction.
(rx_expand_epilogue): Likewise.
(rx_builtin): Add v2 instruction.
(rx_expand_builtin): Likewise.
(rx_expand_builtin_mac): 3 operand support.
(rx_expand_int_builtin_1_arg): Likewise.
(rx_expand_int_builtin_0_arg): Delete.
(rx_expand_builtin_mac2): New function.
(rx_expand_builtin_fromacc): Likewise.
(rx_expand_builtin_fromacc2): Likewise.
(rx_expand_builtin_toacc): Likewise.
(rx_expand_builtin_toacc2): Likewise.
(rx_expand_builtin_rac): Likewise.
* config/rx/rx.h(TARGET_CPU_CPP_BUILTINS): Add v2 define.
(ALLOW_RXV2_INSNS): New.
* config/rx/rx.md(constants): Add new builtin enum.
(addsf3): 3 operands support.
(mulsf3): Likewise.
(subsf3): Likewise.
(fixuns_truncsfsi2): New.
(floatunssisf2): Likewise.
(sqrtsf2): Likewise.
(machi): 3 operands support.
(maclo): Likewise.
(mulhi): Likewise.
(mullo): Likewise.
(mvfachi): Likewise.
(mvfacmi): Likewise.
(mvtachi): Likewise.
(mvtaclo): Likewise.
(racw): Likewise.
(mvfacgu): New.
(mvfaclo): Likewise.
(racl): Likewise.
(rdacl): Likewise.
(rdacw): Likewise.
(emaca): Likewise.
(emsba): Likewise.
(maclh): Likewise.
(msbhi): Likewise.
(msblo): Likewise.
(msblh): Likewise.
* config/rx/sync.md: New file.
* config/rx/rx.opt: Add rxv2 for mcpu option.
* config/rx/t-rx: Add v2 for multilib

---
 gcc/config/rx/constraints.md |   7 +
 gcc/config/rx/rx-opts.h  |   3 +-
 gcc/config/rx/rx.c   | 425 +--
 gcc/config/rx/rx.h   |   7 +
 gcc/config/rx/rx.md  | 363 +---
 gcc/config/rx/rx.opt |   3 +
 gcc/config/rx/sync.md| 191 +++
 gcc/config/rx/t-rx   |   2 +
 gcc/doc/md.texi  |   2 +
 9 files changed, 915 insertions(+), 88 deletions(-)
 create mode 100644 gcc/config/rx/sync.md

diff --git a/gcc/config/rx/constraints.md b/gcc/config/rx/constraints.md
index b41c232..e388350 100644
--- a/gcc/config/rx/constraints.md
+++ b/gcc/config/rx/constraints.md
@@ -106,3 +106,10 @@
)
   )
 )
+
+(define_memory_constraint "q"
+  "A MEM which only uses REG addressing."
+  (and (match_code "mem")
+(match_code "reg" "0")
+  )
+)
diff --git a/gcc/config/rx/rx-opts.h b/gcc/config/rx/rx-opts.h
index fa83e91..52d4dce 100644
--- a/gcc/config/rx/rx-opts.h
+++ b/gcc/config/rx/rx-opts.h
@@ -25,7 +25,8 @@ enum rx_cpu_types
   RX600,
   RX610,
   RX200,
-  RX100
+  RX100,
+  RXV2
 };
 
 #endif
diff --git a/gcc/config/rx/rx.c b/gcc/config/rx/rx.c
index 781b6b1..7b35308 100644
--- a/gcc/config/rx/rx.c
+++ b/gcc/config/rx/rx.c
@@ -639,6 +639,19 @@ rx_print_operand (FILE * file, rtx op, int letter)
case 0xa: fprintf (file, "isp"); break;
case 0xb: fprintf (file, "fintv"); break;
case 0xc: fprintf (file, "intb"); break;
+   case 0xd:
+ if (ALLOW_RXV2_INSNS)
+   {
+ fprintf (file, "extb"); break;
+   }
+   goto invalid_register;
+   case 0x40:
+   case 0x41:
+ if (ALLOW_RXV2_INSNS)
+   {
+ fprintf (file, "a%ld", INTVAL(op) - 0x40); break;
+   } /* RXv1 fall through */
+   invalid_register:
default:
  warning (0, "unrecognized control register number: %d - using 'psw'",
   (int) INTVAL (op));
@@ -731,6 +744,11 @@ rx_print_operand (FILE * file, rtx op, int letter)
   fprintf (file, "%s", reg_names [rx_pid_base_regnum ()]);
   break;
 
+case 'V':
+  gcc_assert (CONST_INT_P (op));
+  fprintf (file, "a%ld", INTVAL(op));
+  break;
+
 case 'R':
   gcc_assert (GET_MODE_SIZE (GET_MODE (op)) <= 4);
   unsigned_load = true;
@@ -1772,12 +1790,32 @@ rx_expand_prologue (void)
  /* We have assumed that there are at least two registers pushed... */
  gcc_assert (acc_high != 0);
 
- /* Note - the bottom 16 bits of the accumulator are inaccessible.
-We just assume that they are zero.  */
- emit_insn (gen_mvfacmi (gen_rtx_REG (SImode, acc_low)));
- emit_insn (gen_mvfachi (gen_rtx_REG (SImode, acc_high)));
- emit_insn (gen_stack_push (gen_rtx_REG (SImode, acc_low)));
- emit_insn (gen_stack_push (gen_rtx_REG (SImode, acc_high)));
+ if (!ALLOW_RXV2_INSNS)
+   {
+ /* Note - the

[PATCH] Fix IPA PTA wrong-code bug with LTO


The following patch fixes a latent issue in IPA PTA with LTO where
it doesn't properly honor partition boundaries.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2015-12-10  Richard Biener  

* tree-ssa-structalias.c (struct variable_info): Add
is_ipa_escape_point flag.
(new_var_info): Initialize to false.
(find_func_aliases): Generate escape constraints for stores
properly in IPA mode.
(ipa_pta_execute): Compute is_ipa_escape_point for globals.

Index: gcc/tree-ssa-structalias.c
===
--- gcc/tree-ssa-structalias.c  (revision 231498)
+++ gcc/tree-ssa-structalias.c  (working copy)
@@ -160,6 +160,8 @@
The is_global_var bit which marks escape points is overly conservative
in IPA mode.  Split it to is_escape_point and is_global_var - only
externally visible globals are escape points in IPA mode.
+   There is now is_ipa_escape_point but this is only used in a few
+   selected places.
 
The way we introduce DECL_PT_UID to avoid fixing up all points-to
sets in the translation unit when we copy a DECL during inlining
@@ -265,6 +267,9 @@ struct variable_info
   /* True if this represents a global variable.  */
   unsigned int is_global_var : 1;
 
+  /* True if this represents a module escape point for IPA analysis.  */
+  unsigned int is_ipa_escape_point : 1;
+
   /* True if this represents a IPA function info.  */
   unsigned int is_fn_info : 1;
 
@@ -374,6 +379,7 @@ new_var_info (tree t, const char *name,
   ret->is_restrict_var = false;
   ret->ruid = 0;
   ret->is_global_var = (t == NULL_TREE);
+  ret->is_ipa_escape_point = false;
   ret->is_fn_info = false;
   if (t && DECL_P (t))
 ret->is_global_var = (is_global_var (t)
@@ -4779,11 +4785,13 @@ find_func_aliases (struct function *fn,
}
   /* If there is a store to a global variable the rhs escapes.  */
   if ((lhsop = get_base_address (lhsop)) != NULL_TREE
- && DECL_P (lhsop)
- && is_global_var (lhsop)
- && (!in_ipa_mode
- || DECL_EXTERNAL (lhsop) || TREE_PUBLIC (lhsop)))
-   make_escape_constraint (rhsop);
+ && DECL_P (lhsop))
+   {
+ varinfo_t vi = get_vi_for_tree (lhsop);
+ if ((! in_ipa_mode && vi->is_global_var)
+ || vi->is_ipa_escape_point)
+   make_escape_constraint (rhsop);
+   }
 }
   /* Handle escapes through return.  */
   else if (gimple_code (t) == GIMPLE_RETURN
@@ -4794,8 +4802,7 @@ find_func_aliases (struct function *fn,
   if (!in_ipa_mode
  || !(fi = get_vi_for_tree (fn->decl)))
make_escape_constraint (gimple_return_retval (return_stmt));
-  else if (in_ipa_mode
-  && fi != NULL)
+  else if (in_ipa_mode)
{
  struct constraint_expr lhs ;
  struct constraint_expr *rhsp;
@@ -7498,7 +7505,15 @@ ipa_pta_execute (void)
   if (var->alias && var->analyzed)
continue;
 
-  get_vi_for_tree (var->decl);
+  varinfo_t vi = get_vi_for_tree (var->decl);
+
+  /* For the purpose of IPA PTA unit-local globals are not
+ escape points.  */
+  bool nonlocal_p = (var->used_from_other_partition
+|| var->externally_visible
+|| var->force_output);
+  if (nonlocal_p)
+   vi->is_ipa_escape_point = true;
 }
 
   if (dump_file

Re: [testsuite][ARM target attributes] Fix effective_target tests

2015-12-10 Thread Christophe Lyon

On 10 December 2015 at 13:30, Kyrill Tkachov  wrote:
> Hi Christophe,
>
>
> On 08/12/15 11:18, Christophe Lyon wrote:
>>
>> On 8 December 2015 at 11:50, Kyrill Tkachov 
>> wrote:
>>>
>>> Hi Christophe,
>>>
>>>
>>> On 27/11/15 13:00, Christophe Lyon wrote:

 Hi,

 After the recent commits from Christian adding target attributes
 support for ARM FPU settings,  I've noticed that some of the tests
 were failing because of incorrect assumptions wrt to the default
 cpu/fpu/float-abi of the compiler.

 This patch fixes the problems I've noticed in the following way:
 - do not force -mfloat-abi=softfp in dg-options, to avoid conflicts
 when gcc is configured --with-float=hard

 - change arm_vfp_ok such that it tries several -mfpu/-mfloat-abi
 flags, checks that __ARM_FP is defined and __ARM_NEON_FP is not
 defined

 - introduce arm_fp_ok, which is similar but does not enforce fpu setting

 - add a new effective_target: arm_crypto_pragma_ok to check that
 setting this fpu via a pragma is actually supported by the current
 "multilib". This is different from checking the command-line option
 because the pragma might conflict with the command-line options in
 use.

 The updates in the testcases are as follows:
 - attr-crypto.c, we have to make sure that the defaut fpu does not
 conflict with the one forced by pragma. That's why I use the arm_vfp
 options/effective_target. This is needed if gcc has been configured
 --with-fpu=neon-fp16, as the pragma fpu=crypto-neon-fp-armv8 would
 conflict.

 - attr-neon-builtin-fail.c: use arm_fp to force the appropriate
 float-abi setting. Enforcing fpu is not needed here.

 - attr-neon-fp16.c: similar, I also removed arm_neon_ok since it was
 not necessary to make the test pass in my testing. On second thought,
 I'm wondering whether I should leave it and make the test unsupported
 in more cases (such as when forcing -march=armv5t, although it does
 pass with this patch)

 - attr-neon2.c: use arm_vfp to force the appropriate float-abi
 setting. Enforcing mfpu=vfp is needed to avoid conflict with the
 pragma target fpu=neon (for instance if the toolchain default is
 neon-fp16)

 - attr-neon3.c: similar

 Tested on a variety of configurations, see:


 http://people.linaro.org/~christophe.lyon/cross-validation/gcc-test-patches/230929-target-attr/report-build-info.html

 Note that the regressions reported fall into 3 categories:
 - when forcing march=armv5t: tests are now unsupported because I
 modified arm_crypto_ok to require arm_v8_neon_ok instead of arm32.

 - the warning reported by attr-neon-builtin-fail.c moved from line 12
 to 14 and is thus seen as a regression + one improvement

 - finally, attr-neon-fp16.c causes an ICE on armeb compilers, for
 which I need to post a bugzilla.


 TBH, I'm a bit concerned by the complexity of all these multilib-like
 conditions. I'm confident that I'm still missing some combinations :-)

 And with new target attributes coming, new architectures etc... all
 this logic is likely to become even more complex.

 That being said, OK for trunk?
>>>
>>>
>>> I'd like us to clean up and reorganise the gcc.target/arm testsuite
>>> at some point to make it more robust with respect to the tons of multilib
>>> options and configurations we can have for arm. It's becoming very
>>> frustrating
>>> to test feature-specific stuff :(
>>>
>>> This is ok in the meantime.
>>> Sorry for the delay.
>>>
>> Thanks for the approval, glad to see I'm not the only one willing to see
>> improvements in this area :)
>>
>> Committed as r231403.
>
>
> With this patch we're seeing legitimate PASSes go to NA.
> For example:
>
> gcc.target/arm/vfp-1.c
> gcc.target/arm/vfp-ldmdbs.c
> and other vfp tests in arm.exp.
> This is, for example, for the
> variant-mthumb/-march=armv8-a/-mfpu=crypto-neon-fp-armv8/-mfloat-abi=hard
>
Hmm I'm attempting to cover such a configuration in my matrix of validations:
http://people.linaro.org/~christophe.lyon/cross-validation/gcc/trunk/231403/report-build-info.html

The difference is that I use similar flags at GCC configure time, while you
override them when running the testsuite:
--target arm-none-linux-gnueabihf --with-mode=thum --with-cpu=cortex-a57
--with-fpu=crypto-neon-fp-armv8

I this case, I do not see the regressions.

> I suspect under your new predicates they'd need to be changed to check for
> arm_fp_ok rather than arm_vfp_ok.
Probably, yes.

> We want to avoid removing any PASSes.
> Can you please test some more to make sure we don't remove any legitimate
> passes with your patch?
Is that the only combination in which you saw less PASSes?

> Also, Ramana reminded me offline that any new predicates in
>

Re: [testsuite][ARM target attributes] Fix effective_target tests

On 10/12/15 13:04, Christophe Lyon wrote:

On 10 December 2015 at 13:30, Kyrill Tkachov wrote:

Hi Christophe,

On 08/12/15 11:18, Christophe Lyon wrote:

On 8 December 2015 at 11:50, Kyrill Tkachov
wrote:

Hi Christophe,

On 27/11/15 13:00, Christophe Lyon wrote:

Hi,

This patch fixes the problems I've noticed in the following way:
- do not force -mfloat-abi=softfp in dg-options, to avoid conflicts
when gcc is configured --with-float=hard

- change arm_vfp_ok such that it tries several -mfpu/-mfloat-abi
flags, checks that __ARM_FP is defined and __ARM_NEON_FP is not
defined

- introduce arm_fp_ok, which is similar but does not enforce fpu setting

- attr-neon-builtin-fail.c: use arm_fp to force the appropriate
float-abi setting. Enforcing fpu is not needed here.

- attr-neon3.c: similar

Tested on a variety of configurations, see:

http://people.linaro.org/~christophe.lyon/cross-validation/gcc-test-patches/230929-target-attr/report-build-info.html

Note that the regressions reported fall into 3 categories:
- when forcing march=armv5t: tests are now unsupported because I
modified arm_crypto_ok to require arm_v8_neon_ok instead of arm32.

- the warning reported by attr-neon-builtin-fail.c moved from line 12
to 14 and is thus seen as a regression + one improvement

- finally, attr-neon-fp16.c causes an ICE on armeb compilers, for
which I need to post a bugzilla.

TBH, I'm a bit concerned by the complexity of all these multilib-like
conditions. I'm confident that I'm still missing some combinations :-)

And with new target attributes coming, new architectures etc... all
this logic is likely to become even more complex.

That being said, OK for trunk?

This is ok in the meantime.
Sorry for the delay.

Thanks for the approval, glad to see I'm not the only one willing to see
improvements in this area :)

Committed as r231403.

With this patch we're seeing legitimate PASSes go to NA.
For example:

gcc.target/arm/vfp-1.c
gcc.target/arm/vfp-ldmdbs.c
and other vfp tests in arm.exp.
This is, for example, for the
variant-mthumb/-march=armv8-a/-mfpu=crypto-neon-fp-armv8/-mfloat-abi=hard

Hmm I'm attempting to cover such a configuration in my matrix of validations:
http://people.linaro.org/~christophe.lyon/cross-validation/gcc/trunk/231403/report-build-info.html

The difference is that I use similar flags at GCC configure time, while you
override them when running the testsuite:
--target arm-none-linux-gnueabihf --with-mode=thum --with-cpu=cortex-a57
--with-fpu=crypto-neon-fp-armv8

I this case, I do not see the regressions.

My gcc is arm-none-eabi configured with
--with-float=hard --with-cpu=cortex-a15 --with-fpu=neon-vfpv4 --with-float=hard

I suspect under your new predicates they'd need to be changed to check for
arm_fp_ok rather than arm_vfp_ok.

Probably, yes.

In the test log where it checks the effective target it fails due to:
arm_vfp_ok27268.c:6:4: error: #error __ARM_NEON_FP defined
it's compiling the check with
-mthumb -march=armv8-a -mfpu=crypto-neon-fp-armv8 -mfloat-abi=hard -o
arm_vfp_ok27268.o arm_vfp_ok27268.c

We want to avoid removing any PASSes.
Can you please test some more to make sure we don't remove any legitimate
passes with your patch?

Is that the only combination in which you saw less PASSes?

That's the one that was brought to my attention, but I think the

Re: [PATCH] Fix up noce_try_abs again (PR rtl-optimization/68670)


On 12/09/2015 11:37 PM, Jakub Jelinek wrote:

Not sure what I've been thinking when writing the previous noce_try_abs fix.
I thought that the optimization can be applied for all the conditions,
and whether it can be applied depends on if it is cond ? ~x : x or
cond ? x : ~x.  But that is not the case, the optimization can be only
applied to a subset of conditions, and when it can be applied, it can be
applied to both the cond ? ~x : x and cond ? x : ~x cases (depending on
the condition one is one_cmpl_abs (x) and the other ~one_cmpl_abs (x)).


Odd, I thought you made a good argument last time :-( Sorry for not 
catching the problem.



switch (GET_CODE (cond))
  {
- case GT:
-   if (!negate)
- return FALSE;
-   break;
  case GE:
-   /* >= 0 is the same case as above > -1.  */
-   if (negate)
- return FALSE;
-   break;
  case LT:
-   if (negate)
- return FALSE;
-   break;
- case LE:
-   /* <= 0 is the same case as above < 1.  */
-   if (!negate)
- return FALSE;
break;
  default:
return FALSE;


This turns into a much simpler if statement, doesn't it? Ok with that 
change.



+/* PR rtl-optimization/68670 */
+/* { dg-do run } */
+/* { dg-options "-O2 -ftracer" } */


Curious what effect -ftracer has here?


Bernd

[patch] Improve generated libstdc++ API docs

2015-12-10 Thread Jonathan Wakely


This adjusts some Doxygen comments and updates the Doxygen config file
to ensure all headers are processed (previously doxygen was ignoring
filenames without an extension, which is a lot of the library!)

There's a workaround in include/std/bitset for the bug 
https://bugzilla.gnome.org/show_bug.cgi?id=759242

and another in include/ext/pb_ds/detail/bin_search_tree_/traits.hpp
for https://bugzilla.gnome.org/show_bug.cgi?id=759241

Tested with 'make doc-man-doxygen doc-html-doxygen' and committed to
trunk.


commit a35e6c19ef0d933e0eb6f182a78dfa2624a25eb7
Author: Jonathan Wakely 
Date:   Wed Dec 9 17:20:26 2015 +

Improve generated libstdc++ API docs

	* doc/doxygen/user.cfg.in: Use EXTENSION_MAPPING tag. Add new headers
	to INPUT. Remove obsolete XML_SCHEMA and XML_DTD tags. Update
	PREDEFINED macros. Set BRIEF_MEMBER_DESC for man-pages.
	* include/backward/strstream: Correct @file comment.
	* include/bits/forward_list.h: Improve Doxygen comments.
	* include/bits/locale_facets_nonio.h: Likewise.
	* include/bits/mutex.h: Likewise.
	* include/bits/shared_ptr.h: Likewise.
	* include/bits/stl_deque.h: Likewise.
	* include/debug/vector (_Safe_vector): Add @brief section to comment.
	* include/experimental/bits/fs_dir.h: Correct @file comment.
	* include/experimental/bits/fs_fwd.h: Likewise.
	* include/experimental/bits/fs_ops.h: Likewise.
	* include/experimental/bits/fs_path.h: Likewise.
	* include/experimental/bits/string_view.tcc: Likewise.
	* include/experimental/optional: Document experimental status.
	* include/experimental/string_view: Correct @file comment.
	* include/ext/pb_ds/detail/bin_search_tree_/traits.hpp: Reduce
	whitespace to avoid Doxygen bug.
	* include/std/bitset: Remove redundant @class Doxygen command. Add
	parentheses to avoid Doxygen bug.
	* include/std/mutex: Improve Doxygen comments.
	* include/tr2/dynamic_bitset: Add missing @param documentation.
	* scripts/run_doxygen: Rename man pages for std::experimental types.

diff --git a/libstdc++-v3/doc/doxygen/user.cfg.in b/libstdc++-v3/doc/doxygen/user.cfg.in
index ff2db48..a62eeff 100644
--- a/libstdc++-v3/doc/doxygen/user.cfg.in
+++ b/libstdc++-v3/doc/doxygen/user.cfg.in
@@ -90,7 +90,7 @@ OUTPUT_LANGUAGE= English
 # documentation (similar to Javadoc). Set to NO to disable this.
 # The default value is: YES.
 
-BRIEF_MEMBER_DESC  = NO
+BRIEF_MEMBER_DESC  = @do_man@
 
 # If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief
 # description of a member or function before the detailed description
@@ -272,7 +272,7 @@ OPTIMIZE_OUTPUT_VHDL   = NO
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
 # the files are not read by doxygen.
 
-EXTENSION_MAPPING  =
+EXTENSION_MAPPING  = no_extension=C++ .h=C++ .tcc=C++ .hpp=C++
 
 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
@@ -757,6 +757,7 @@ INPUT  = @srcdir@/doc/doxygen/doxygroups.cc \
  include/bitset \
  include/chrono \
  include/complex \
+ include/codecvt \
  include/condition_variable \
  include/deque \
  include/forward_list \
@@ -812,6 +813,7 @@ INPUT  = @srcdir@/doc/doxygen/doxygroups.cc \
  include/cmath \
  include/csetjmp \
  include/csignal \
+ include/cstdalign \
  include/cstdarg \
  include/cstdbool \
  include/cstddef \
@@ -822,6 +824,7 @@ INPUT  = @srcdir@/doc/doxygen/doxygroups.cc \
  include/ctgmath \
  include/ctime \
  include/cwchar \
+ include/cuchar \
  include/cwctype \
  include/ \
  include/bits \
@@ -831,6 +834,7 @@ INPUT  = @srcdir@/doc/doxygen/doxygroups.cc \
  include/backward/hash_set \
  include/backward/strstream \
  include/debug \
+ include/debug/array \
  include/debug/bitset \
  include/debug/deque \
  include/debug/forward_list \
@@ -853,6 +857,7 @@ INPUT  = @srcdir@/doc/doxygen/doxygroups.cc \
  include/profile/unordered_set \
  include/profile/vector \
  include/ext/algorithm \
+ include/ext/cmath \

Re: [PATCH PR68542]