http://gcc.gnu.org/bugzilla/show_bug.cgi?id=47895
Summary: usage of __attribute__ ((__target__ ("xyz"))) with builtins
Product: gcc
Version: 4.6.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: vincenzo.innoce...@cern.ch

I would like to generate code for multiple targets from the same source when using builtins (I think that this issue has been discussed before, for instance in http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39840).

I have code as in the example below that compiles only with -mavx. In such a case it will use AVX instructions for all functions, including the one "targeted" for sse3, while I would like to obtain an object file that I can run on multiple platforms.

This problem occurs only when builtins are used: standard C code is correctly emitted according to the target, provided that the minimal -m is used.

Is there any preprocessor flag to "activate" all intrinsics and builtins in x86intrin.h?

----------------------------- example

#include <x86intrin.h>

float __attribute__ ((__target__ ("sse3"))) sum3(float const * __restrict__ x, float const * __restrict__ y, float const * __restrict__ z) {
  __m128 sum = _mm_setzero_ps();
  for (int i=0; i!=1024; i+=4)
    sum += _mm_add_ps(_mm_loadu_ps(z+i), _mm_mul_ps(_mm_loadu_ps(x+i),_mm_loadu_ps(y+i)) );
  sum = _mm_hadd_ps(sum,sum);
  sum = _mm_hadd_ps(sum,sum);
  float ret;
  _mm_store_ss(&ret,sum);
  return ret;
}

float __attribute__ ((__target__ ("avx"))) sumv(float const * __restrict__ x, float const * __restrict__ y, float const * __restrict__ z) {
  __m256 sum = _mm256_setzero_ps();
  for (int i=0; i!=1024; i+=8)
    sum += _mm256_add_ps(_mm256_loadu_ps(z+i), _mm256_mul_ps(_mm256_loadu_ps(x+i),_mm256_loadu_ps(y+i)) );
  sum = _mm256_hadd_ps(sum,sum);
  sum = _mm256_hadd_ps(sum,sum);
  sum = _mm256_hadd_ps(sum,sum);
  float ret[8];
  _mm256_store_ps(ret,sum);
  return ret[0];
}