Re: [PATCH] [i386]Add missing BMI function to align with clang
On Tue, Dec 21, 2021 at 6:22 AM Haochen Jiang wrote: > > Hi all, > > This patch adds missing BMI function _tzcnt_u16, _andn_u32, _andn_u64 to > align with clang. > > Regtested on x86_64-pc-linux-gnu. Ok for trunk? > > BRs, > Haochen > > gcc/ChangeLog: > > * config/i386/bmiintrin.h (_tzcnt_u16): New define function. > (_andn_u32): Ditto. > (_andn_u64): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/bmi-1.c: Add new test for new define function. > * gcc.target/i386/bmi-2.c: Ditto. > * gcc.target/i386/bmi-3.c: Ditto. OK. Thanks, Uros. > --- > gcc/config/i386/bmiintrin.h | 18 ++ > gcc/testsuite/gcc.target/i386/bmi-1.c | 8 +++- > gcc/testsuite/gcc.target/i386/bmi-2.c | 8 +++- > gcc/testsuite/gcc.target/i386/bmi-3.c | 8 +++- > 4 files changed, 39 insertions(+), 3 deletions(-) > > diff --git a/gcc/config/i386/bmiintrin.h b/gcc/config/i386/bmiintrin.h > index 439d81cba11..92450a644eb 100644 > --- a/gcc/config/i386/bmiintrin.h > +++ b/gcc/config/i386/bmiintrin.h > @@ -40,12 +40,24 @@ __tzcnt_u16 (unsigned short __X) >return __builtin_ia32_tzcnt_u16 (__X); > } > > +extern __inline unsigned short __attribute__((__gnu_inline__, > __always_inline__, __artificial__)) > +_tzcnt_u16 (unsigned short __X) > +{ > + return __builtin_ia32_tzcnt_u16 (__X); > +} > + > extern __inline unsigned int __attribute__((__gnu_inline__, > __always_inline__, __artificial__)) > __andn_u32 (unsigned int __X, unsigned int __Y) > { >return ~__X & __Y; > } > > +extern __inline unsigned int __attribute__((__gnu_inline__, > __always_inline__, __artificial__)) > +_andn_u32 (unsigned int __X, unsigned int __Y) > +{ > + return __andn_u32 (__X, __Y); > +} > + > extern __inline unsigned int __attribute__((__gnu_inline__, > __always_inline__, __artificial__)) > __bextr_u32 (unsigned int __X, unsigned int __Y) > { > @@ -114,6 +126,12 @@ __andn_u64 (unsigned long long __X, unsigned long long > __Y) >return ~__X & __Y; > } > > +extern __inline unsigned long long __attribute__((__gnu_inline__, > __always_inline__, __artificial__)) > +_andn_u64 (unsigned long long __X, unsigned long long __Y) > +{ > + return __andn_u64 (__X, __Y); > +} > + > extern __inline unsigned long long __attribute__((__gnu_inline__, > __always_inline__, __artificial__)) > __bextr_u64 (unsigned long long __X, unsigned long long __Y) > { > diff --git a/gcc/testsuite/gcc.target/i386/bmi-1.c > b/gcc/testsuite/gcc.target/i386/bmi-1.c > index 738705e29d8..141adaac016 100644 > --- a/gcc/testsuite/gcc.target/i386/bmi-1.c > +++ b/gcc/testsuite/gcc.target/i386/bmi-1.c > @@ -1,6 +1,6 @@ > /* { dg-do compile } */ > /* { dg-options "-O2 -fno-ipa-icf -mbmi " } */ > -/* { dg-final { scan-assembler "andn\[^\\n]*eax" } } */ > +/* { dg-final { scan-assembler-times "andn\[^\\n]*eax" 2 } } */ > /* { dg-final { scan-assembler-times "bextr\[ \\t]+\[^\\n]*eax" 2 } } */ > /* { dg-final { scan-assembler-times "blsi\[^\\n]*eax" 2 } } */ > /* { dg-final { scan-assembler-times "blsmsk\[^\\n]*eax" 2 } } */ > @@ -15,6 +15,12 @@ func_andn32 (unsigned int X, unsigned int Y) >return __andn_u32(X, Y); > } > > +unsigned int > +func_andn32_2 (unsigned int X, unsigned int Y) > +{ > + return _andn_u32(X, Y); > +} > + > unsigned int > func_bextr32 (unsigned int X, unsigned int Y) > { > diff --git a/gcc/testsuite/gcc.target/i386/bmi-2.c > b/gcc/testsuite/gcc.target/i386/bmi-2.c > index 6b8595eb9e1..3f9052a4991 100644 > --- a/gcc/testsuite/gcc.target/i386/bmi-2.c > +++ b/gcc/testsuite/gcc.target/i386/bmi-2.c > @@ -1,6 +1,6 @@ > /* { dg-do compile { target { ! ia32 } } } */ > /* { dg-options "-O2 -fno-ipa-icf -mbmi " } */ > -/* { dg-final { scan-assembler "andn\[^\\n]*rax" } } */ > +/* { dg-final { scan-assembler-times "andn\[^\\n]*rax" 2 } } */ > /* { dg-final { scan-assembler-times "bextr\[ \\t]+\[^\\n]*rax" 2 } } */ > /* { dg-final { scan-assembler-times "blsi\[^\\n]*rax" 2 } } */ > /* { dg-final { scan-assembler-times "blsmsk\[^\\n]*rax" 2 } } */ > @@ -15,6 +15,12 @@ func_andn64 (unsigned long long X, unsigned long long Y) >return __andn_u64 (X, Y); > } > > +unsigned long long > +func_andn64_2 (unsigned long long X, unsigned long long Y) > +{ > + return _andn_u64 (X, Y); > +} > + > unsigned long long > func_bextr64 (unsigned long long X, unsigned long long Y) > { > diff --git a/gcc/testsuite/gcc.target/i386/bmi-3.c > b/gcc/testsuite/gcc.target/i386/bmi-3.c > index ddc5e0f66e2..0b91bc25bf8 100644 > --- a/gcc/testsuite/gcc.target/i386/bmi-3.c > +++ b/gcc/testsuite/gcc.target/i386/bmi-3.c > @@ -1,6 +1,6 @@ > /* { dg-do compile } */ > /* { dg-options "-O2 -mbmi " } */ > -/* { dg-final { scan-assembler "tzcntw\[^\\n]*(%|)ax" } } */ > +/* { dg-final { scan-assembler-times "tzcntw\[^\\n]*%?ax" 2 } } */ > > #include > > @@ -9,3 +9,9 @@ func_tzcnt16 (unsigned short X) > { >return __tzcnt_u16(X); > } > + > +unsigned short > +func_tzcnt16_2
Re: [PATCH][Hashtable 6/6] PR 68303 small size optimization
Am Di., 21. Dez. 2021 um 07:08 Uhr schrieb François Dumont via Libstdc++ : > > Hi > > Is there a chance for this patch to be integrated for next gcc > release ? > > François > No counterargument for the acceptance, but: Shouldn't __small_size_threshold() be a noexcept function? - Daniel
[PATCH] [i386] Add define_insn_and_split for vpcmp{b, w, d, q} vpcmp{ph, ps, pd}.
The purpose of those define_insn_and_split: 1. Combine vpcmpuw and zero_extend into vpcmpuw. 2. Canonicalize vpcmpuw pattern so CSE can replace duplicate vpcmpuw to just kmov 3. Use DImode as dest of zero_extend so cprop_hardreg can eliminate redundant kmov. It should partially fix the issue in PR. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk. gcc/ChangeLog: PR target/103750 * config/i386/sse.md (*_cmp3_zero_extend): New define_insn_and_split. (*_cmp3): Ditto. (*_cmp3_zero_extenddi): New define_insn. (*_cmp3_zero_extend): New define_insn_and_split. (*_ucmp3_zero_extend): Ditto. (*_ucmp3): Ditto. (*_ucmp3_zero_extenddi): New define_insn. (*_ucmp3_zero_extend): New define_insn_and_split. gcc/testsuite/ChangeLog: * gcc.target/i386/bitwise_mask_op-3.c: Adjust test/ * g++.target/i386/pr103750-1.C: New test. --- gcc/config/i386/sse.md| 267 ++ gcc/testsuite/g++.target/i386/pr103750-1.C| 50 .../gcc.target/i386/bitwise_mask_op-3.c | 6 +- 3 files changed, 320 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/g++.target/i386/pr103750-1.C diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5196149ee32..fb885d58272 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -3702,6 +3702,75 @@ (define_insn "_cmp3" (set_attr "prefix" "evex") (set_attr "mode" "")]) +;; Those Splitters are used to canonicalize vpcmpuw pattern, so that CSE can transfrom +;; duplicated vpcmpuw to vpcmpuw and kmov +;; Choose biggest mode(DImode) as dest, so kmov can be optimized by cprop_hardreg. +(define_insn_and_split "*_cmp3_zero_extend" + [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec: + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") +(match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") +(match_operand:SI 3 "" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW + && (GET_MODE_NUNITS (mode) + < GET_MODE_PRECISION (mode))" + "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "&& mode != E_DImode" + [(set (match_dup 0) + (zero_extend:DI + (unspec: + [(match_dup 1) +(match_dup 2) +(match_dup 3)] + UNSPEC_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +(define_insn_and_split "*_cmp3" + [(set (match_operand: 0 "register_operand" "=k") + (unspec: + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") + (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "" "n")] + UNSPEC_PCMP))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (mode) < 64" + "#" + "&& 1" + [(set (match_dup 0) + (zero_extend:DI + (unspec: + [(match_dup 1) +(match_dup 2) +(match_dup 3)] + UNSPEC_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], mode);" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +(define_insn "*_cmp3_zero_extenddi" + [(set (match_operand:DI 0 "register_operand" "=k") + (zero_extend:DI + (unspec: + [(match_operand:V48H_AVX512VL 1 "register_operand" "v") +(match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") +(match_operand:SI 3 "" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW + && GET_MODE_NUNITS (mode) < 64" + "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + (define_insn_and_split "*_cmp3" [(set (match_operand: 0 "register_operand") (not: @@ -3735,6 +3804,72 @@ (define_insn "_cmp3" (set_attr "prefix" "evex") (set_attr "mode" "")]) +(define_insn_and_split "*_cmp3_zero_extend" + [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec: + [(match_operand:VI12_AVX512VL 1 "register_operand" "v") +(match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") +(match_operand:SI 3 "" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW + && (GET_MODE_NUNITS (mode) + < GET_MODE_PRECISION (mode))" + "vpcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + "&& mode != E_DImode" + [(set (match_dup 0) + (zero_extend:DI + (unspec: + [(match_dup 1) +(match_dup 2) +(match_dup 3)] + UNSPEC_PCMP)))] + "operands[0] = lowpart_subreg (DImode, operands[0], mode);" + [(set_attr "type" "ssecmp") + (set_attr
Re: [PATCH] elf: Add __libc_get_static_tls_bounds [BZ #16291]
* Fāng-ruì Sòng: >> I *think* you can get what you need via existing GLIBC_PRIVATE >> interfaces. But in order to describe how to caox the data out of glibc, >> I need to know what you need. > > Unfortunate no, not reliably. Currently _dl_get_tls_static_info > (https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp#L207) > is used. It has the size information but not the start address, so the > sanitizer runtime is doing very ugly/error-prone probing of the start > address by reading the thread pointer and hard coding the `struct > pthread` size for various glibc versions. > Every time `struct pthreads` increases in glibc, sanitizer runtime has to > adapt. > > __libc_get_static_tls_bounds if supported by glibc, can replace > _dl_get_tls_static_info and the existing glibc specific GetTls hacks, > and provide a bit more compatibility when glibc struct pthreads > increases. We already export the size of struct pthread, though, as a GLIBC_PRIVATE symbol. const unsigned int *psize = dlvsym("_thread_db_sizeof_pthread", "GLIBC_PRIVATE"); if (psize != nullptr) { val = *psize; atomic_store_relaxed(_descriptor_size, val); return val; } in ThreadDescriptorSize() in compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp should work with any further glibc changes today. It definitely beats hard-coding the architecture-specific struct pthread size. Using _thread_db_sizeof_pthread does not remove all magic constants from the code, and there is of course the _dl_get_tls_static_info symbol usage. This is a fairly recent change (glibc 2.34, I believe) that was introduced to improve thread debugging without debuginfo present in the glibc objects (which is why it's a GLIBC_PRIVATE symbol). libthread_db is bundled with glibc, so the _thread_db_* symbols are not gurantueed to be stable, but there are no immediate plans to change this interface. Thanks, Florian
Re: [PATCH][Hashtable 6/6] PR 68303 small size optimization
Hi Is there a chance for this patch to be integrated for next gcc release ? François On 23/09/21 6:36 am, François Dumont wrote: Ping ? On 16/08/21 9:03 pm, François Dumont wrote: On 17/07/20 2:58 pm, Jonathan Wakely wrote: On 17/11/19 22:31 +0100, François Dumont wrote: This is an implementation of PR 68303. I try to use this idea as much as possible to avoid computation of hash codes. Note that tests are not showing any gain. I guess hash computation must be quite bad to get a benefit from it. So I am only activating it when hash code is not cached and/or when computation is not fast. If the tests don't show any benefit, why bother making the change? I eventually managed to demonstrate this optimization through a performance test case. Does it help the example in the PR? No, the code attached to the PR just show what the user has done to put in place this optim on his side. What I needed was a slow hash code computation compared to the equal operation. I realized that I had to use longer string to achieve this. Moreover making this optim dependant on _Hashtable_traits::__hash_cached was just wrong as we cannot use the cached hash code here as the input is a key instance, not a node. I introduce _Hashtable_hash_traits<_Hash> to offer a single customization point as this optim depends highly on the difference between a hash code computation and a comparison. Maybe I should put it at std namespace scope to ease partial specialization ? Performance test results before the patch: unordered_small_size.cc std::unordered_set: 1st insert 40r 32u 8s 264000112mem 0pf unordered_small_size.cc std::unordered_set: find/erase 22r 22u 0s -191999808mem 0pf unordered_small_size.cc std::unordered_set: 2nd insert 36r 36u 0s 191999776mem 0pf unordered_small_size.cc std::unordered_set: erase key 25r 25u 0s -191999808mem 0pf unordered_small_size.cc std::unordered_set: 1st insert 404r 244u 156s -1989936256mem 0pf unordered_small_size.cc std::unordered_set: find/erase 315r 315u 0s 2061942272mem 0pf unordered_small_size.cc std::unordered_set: 2nd insert 233r 233u 0s -2061942208mem 0pf unordered_small_size.cc std::unordered_set: erase key 299r 298u 0s 2061942208mem 0pf after the patch: unordered_small_size.cc std::unordered_set: 1st insert 41r 33u 7s 264000112mem 0pf unordered_small_size.cc std::unordered_set: find/erase 24r 25u 1s -191999808mem 0pf unordered_small_size.cc std::unordered_set: 2nd insert 34r 34u 0s 191999776mem 0pf unordered_small_size.cc std::unordered_set: erase key 25r 25u 0s -191999808mem 0pf unordered_small_size.cc std::unordered_set: 1st insert 399r 232u 165s -1989936256mem 0pf unordered_small_size.cc std::unordered_set: find/erase 196r 197u 0s 2061942272mem 0pf unordered_small_size.cc std::unordered_set: 2nd insert 221r 222u 0s -2061942208mem 0pf unordered_small_size.cc std::unordered_set: erase key 178r 178u 0s 2061942208mem 0pf libstdc++: Optimize operations on small size hashtable [PR 68303] When hasher is identified as slow and the number of elements is limited in the container use a brute-force loop on those elements to look for a given key using the key_equal functor. For the moment the default threshold below which the container is considered as small is 20. libstdc++-v3/ChangeLog: PR libstdc++/68303 * include/bits/hashtable_policy.h (_Hashtable_hash_traits<_Hash>): New. (_Hash_code_base<>::_M_hash_code(const _Hash_node_value<>&)): New. (_Hashtable_base<>::_M_key_equals): New. (_Hashtable_base<>::_M_equals): Use latter. (_Hashtable_base<>::_M_key_equals_tr): New. (_Hashtable_base<>::_M_equals_tr): Use latter. * include/bits/hashtable.h (_Hashtable<>::__small_size_threshold()): New, use _Hashtable_hash_traits. (_Hashtable<>::find): Loop through elements to look for key if size is lower than __small_size_threshold(). (_Hashtable<>::_M_emplace(true_type, _Args&&...)): Likewise. (_Hashtable<>::_M_insert_unique(_Kt&&, _Args&&, const _NodeGenerator&)): Likewise. (_Hashtable<>::_M_compute_hash_code(const_iterator, const key_type&)): New. (_Hashtable<>::_M_emplace(const_iterator, false_type, _Args&&...)): Use latter. (_Hashtable<>::_M_find_before_node(const key_type&)): New. (_Hashtable<>::_M_erase(true_type, const key_type&)): Use latter. (_Hashtable<>::_M_erase(false_type, const key_type&)): Likewise. * src/c++11/hashtable_c++0x.cc: Include . * testsuite/util/testsuite_performane.h
[PATCH] [i386]Add missing BMI function to align with clang
Hi all, This patch adds missing BMI function _tzcnt_u16, _andn_u32, _andn_u64 to align with clang. Regtested on x86_64-pc-linux-gnu. Ok for trunk? BRs, Haochen gcc/ChangeLog: * config/i386/bmiintrin.h (_tzcnt_u16): New define function. (_andn_u32): Ditto. (_andn_u64): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/bmi-1.c: Add new test for new define function. * gcc.target/i386/bmi-2.c: Ditto. * gcc.target/i386/bmi-3.c: Ditto. --- gcc/config/i386/bmiintrin.h | 18 ++ gcc/testsuite/gcc.target/i386/bmi-1.c | 8 +++- gcc/testsuite/gcc.target/i386/bmi-2.c | 8 +++- gcc/testsuite/gcc.target/i386/bmi-3.c | 8 +++- 4 files changed, 39 insertions(+), 3 deletions(-) diff --git a/gcc/config/i386/bmiintrin.h b/gcc/config/i386/bmiintrin.h index 439d81cba11..92450a644eb 100644 --- a/gcc/config/i386/bmiintrin.h +++ b/gcc/config/i386/bmiintrin.h @@ -40,12 +40,24 @@ __tzcnt_u16 (unsigned short __X) return __builtin_ia32_tzcnt_u16 (__X); } +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tzcnt_u16 (unsigned short __X) +{ + return __builtin_ia32_tzcnt_u16 (__X); +} + extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __andn_u32 (unsigned int __X, unsigned int __Y) { return ~__X & __Y; } +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_andn_u32 (unsigned int __X, unsigned int __Y) +{ + return __andn_u32 (__X, __Y); +} + extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __bextr_u32 (unsigned int __X, unsigned int __Y) { @@ -114,6 +126,12 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y) return ~__X & __Y; } +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_andn_u64 (unsigned long long __X, unsigned long long __Y) +{ + return __andn_u64 (__X, __Y); +} + extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __bextr_u64 (unsigned long long __X, unsigned long long __Y) { diff --git a/gcc/testsuite/gcc.target/i386/bmi-1.c b/gcc/testsuite/gcc.target/i386/bmi-1.c index 738705e29d8..141adaac016 100644 --- a/gcc/testsuite/gcc.target/i386/bmi-1.c +++ b/gcc/testsuite/gcc.target/i386/bmi-1.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-O2 -fno-ipa-icf -mbmi " } */ -/* { dg-final { scan-assembler "andn\[^\\n]*eax" } } */ +/* { dg-final { scan-assembler-times "andn\[^\\n]*eax" 2 } } */ /* { dg-final { scan-assembler-times "bextr\[ \\t]+\[^\\n]*eax" 2 } } */ /* { dg-final { scan-assembler-times "blsi\[^\\n]*eax" 2 } } */ /* { dg-final { scan-assembler-times "blsmsk\[^\\n]*eax" 2 } } */ @@ -15,6 +15,12 @@ func_andn32 (unsigned int X, unsigned int Y) return __andn_u32(X, Y); } +unsigned int +func_andn32_2 (unsigned int X, unsigned int Y) +{ + return _andn_u32(X, Y); +} + unsigned int func_bextr32 (unsigned int X, unsigned int Y) { diff --git a/gcc/testsuite/gcc.target/i386/bmi-2.c b/gcc/testsuite/gcc.target/i386/bmi-2.c index 6b8595eb9e1..3f9052a4991 100644 --- a/gcc/testsuite/gcc.target/i386/bmi-2.c +++ b/gcc/testsuite/gcc.target/i386/bmi-2.c @@ -1,6 +1,6 @@ /* { dg-do compile { target { ! ia32 } } } */ /* { dg-options "-O2 -fno-ipa-icf -mbmi " } */ -/* { dg-final { scan-assembler "andn\[^\\n]*rax" } } */ +/* { dg-final { scan-assembler-times "andn\[^\\n]*rax" 2 } } */ /* { dg-final { scan-assembler-times "bextr\[ \\t]+\[^\\n]*rax" 2 } } */ /* { dg-final { scan-assembler-times "blsi\[^\\n]*rax" 2 } } */ /* { dg-final { scan-assembler-times "blsmsk\[^\\n]*rax" 2 } } */ @@ -15,6 +15,12 @@ func_andn64 (unsigned long long X, unsigned long long Y) return __andn_u64 (X, Y); } +unsigned long long +func_andn64_2 (unsigned long long X, unsigned long long Y) +{ + return _andn_u64 (X, Y); +} + unsigned long long func_bextr64 (unsigned long long X, unsigned long long Y) { diff --git a/gcc/testsuite/gcc.target/i386/bmi-3.c b/gcc/testsuite/gcc.target/i386/bmi-3.c index ddc5e0f66e2..0b91bc25bf8 100644 --- a/gcc/testsuite/gcc.target/i386/bmi-3.c +++ b/gcc/testsuite/gcc.target/i386/bmi-3.c @@ -1,6 +1,6 @@ /* { dg-do compile } */ /* { dg-options "-O2 -mbmi " } */ -/* { dg-final { scan-assembler "tzcntw\[^\\n]*(%|)ax" } } */ +/* { dg-final { scan-assembler-times "tzcntw\[^\\n]*%?ax" 2 } } */ #include @@ -9,3 +9,9 @@ func_tzcnt16 (unsigned short X) { return __tzcnt_u16(X); } + +unsigned short +func_tzcnt16_2 (unsigned short X) +{ + return _tzcnt_u16(X); +} -- 2.18.1
Re: [PATCH v8 2/2] Don't move cold code out of loop by checking bb count
On 2021/12/20 15:29, Richard Biener wrote: > On Wed, Dec 8, 2021 at 7:32 AM Xionghu Luo wrote: >> >> >> >> On 2021/12/7 20:17, Richard Biener wrote: > + class loop *coldest_loop = coldest_outermost_loop[loop->num]; > + if (loop_depth (coldest_loop) < loop_depth (outermost_loop)) > +{ > + class loop *hotter_loop = hotter_than_inner_loop[loop->num]; > + if (!hotter_loop > + || loop_depth (hotter_loop) < loop_depth (outermost_loop)) > + return outermost_loop; > + > + /* hotter_loop is between OUTERMOST_LOOP and LOOP like: > + [loop tree root, ..., coldest_loop, ..., outermost_loop, ..., > + hotter_loop, second_coldest_loop, ..., loop] > + return second_coldest_loop to be the hoist target. */ > + class loop *aloop; > + for (aloop = hotter_loop->inner; aloop; aloop = aloop->next) > + if (flow_loop_nested_p (aloop, loop)) should be: if (aloop == loop || flow_loop_nested_p (aloop, loop)) >>> OK with that fixed. >>> >>> Are necessary prerequesites committed to avoid regressions? >>> I guess we need to keep a watchful eye and eventually revert >>> (or gate with a --param disabled by default) the new behavior if >>> severe regressions are discovered. >>> >>> Thanks and sorry for the repeated delays. >>> Richard. >>> >> >> Thanks for your review, I learned quite a lot and gained very useful >> comments & help through the period :) There are still 3 patches required >> to avoid regression or so, I've reorganized them and sent it out. >> >> https://gcc.gnu.org/pipermail/gcc-patches/2021-December/586371.html >> >> >> In addition, cooked the patch to add option for disable/enable it. >> Is it OK to merge it to current patch? > > Hmm, let's go without this flag for now, we can add something like this > when we see a testcase where that makes sense (and where profile info > is not wrecked by other bugs). > > Adding a --param would be a no brainer, but for new user-facing options > we should be more careful. > Thanks. Committed to r12-6087. -- Thanks, Xionghu
Re: [PATCH 3/3] Fix loop split incorrect count and probability
On 2021/12/13 16:57, Xionghu Luo via Gcc-patches wrote: > > > On 2021/12/9 07:47, Jeff Law wrote: >>> diff --git a/gcc/tree-ssa-loop-split.c b/gcc/tree-ssa-loop-split.c >>> index 3f6ad046623..33128061aab 100644 >>> --- a/gcc/tree-ssa-loop-split.c >>> +++ b/gcc/tree-ssa-loop-split.c >>> >>> @@ -607,6 +610,38 @@ split_loop (class loop *loop1) >>> tree guard_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge >>> (loop1)); >>> patch_loop_exit (loop1, guard_stmt, guard_next, newend, >>> initial_true); >>> + update_ssa (TODO_update_ssa); >>> + >>> + /* Proportion first loop's bb counts except those dominated by true >>> + branch to avoid drop 1s down. */ >>> + basic_block *bbs1, *bbs2; >>> + bbs1 = get_loop_body (loop1); >>> + unsigned j; >>> + for (j = 0; j < loop1->num_nodes; j++) >>> + if (bbs1[j] == loop1->latch >>> + || !dominated_by_p (CDI_DOMINATORS, bbs1[j], true_edge->dest)) >>> + bbs1[j]->count >>> + = bbs1[j]->count.apply_probability (true_edge->probability); >>> + free (bbs1); >> It looks like there's two copies of this code in this patch, one in >> split_loop and the other in do_split_loop_on_cond. Would it make sense >> to factor it out into its own little function? >> >> >>> + >>> + /* Proportion second loop's bb counts except those dominated by >>> false >>> + branch to avoid drop 1s down. */ >>> + basic_block bbi_copy = get_bb_copy (false_edge->dest); >>> + bbs2 = get_loop_body (loop2); >>> + for (j = 0; j < loop2->num_nodes; j++) >>> + if (bbs2[j] == loop2->latch >>> + || !dominated_by_p (CDI_DOMINATORS, bbs2[j], bbi_copy)) >>> + bbs2[j]->count = bbs2[j]->count.apply_probability ( >>> + true_edge->probability.invert ()); >>> + free (bbs2); >> Similarly for this block of code. >> >> If those can be reasonably factored out into two helper functions to be >> called from split_loop and do_split_loop_on_cond, then this is OK with >> the refactoring. >> >> jeff > > > Thanks for the comments, updated as below. Will commit this patchset and the > approved patch for LIM if there are no objections: This patch is committed to r12-6086. > > > [PATCH v2 3/3] Fix loop split incorrect count and probability > > In tree-ssa-loop-split.c, split_loop and split_loop_on_cond does two > kind of split. split_loop only works for single loop and insert edge at > exit when split, while split_loop_on_cond is not limited to single loop > and insert edge at latch when split. Both split behavior should consider > loop count and probability update. For split_loop, loop split condition > is moved in front of loop1 and loop2; But split_loop_on_cond moves the > condition between loop1 and loop2, this patch does: > 1) profile count proportion for both original loop and copied loop > without dropping down the true branch's count; > 2) probability update in the two loops and between the two loops. > > Regression tested pass, OK for master? > > Changes diff for split_loop and split_loop_on_cond cases: > > 1) diff base/loop-split.c.151t.lsplit patched/loop-split.c.152t.lsplit > ... > [local count: 118111600]: >_1 = end_6(D) - beg_7(D); >j_9 = _1 + beg2_8(D); >if (end_6(D) > beg_7(D)) > goto ; [89.00%] >else > goto ; [11.00%] > > [local count: 105119324]: >if (j_9 >= c_11(D)) > -goto ; [100.00%] > +goto ; [33.00%] >else > -goto ; [100.00%] > +goto ; [67.00%] > > - [local count: 105119324]: > + [local count: 34689377]: >_27 = end_6(D) + -1; >_28 = beg_7(D) - end_6(D); >_29 = j_9 + _28; >_30 = _29 + 1; >_31 = MAX_EXPR ; > > - [local count: 955630225]: > + [local count: 315357973]: ># i_18 = PHI ># j_19 = PHI >printf ("a: %d %d\n", i_18, j_19); >i_13 = i_18 + -1; >j_14 = j_19 + -1; >if (j_14 >= _31) > -goto ; [89.00%] > +goto ; [29.37%] >else > -goto ; [11.00%] > +goto ; [70.63%] > > - [local count: 850510901]: > + [local count: 280668596]: >goto ; [100.00%] > > - [local count: 105119324]: > + [local count: 70429947]: ># i_24 = PHI ># j_25 = PHI > > [local count: 955630225]: ># i_3 = PHI ># j_2 = PHI >i_22 = i_3 + -1; >j_23 = j_2 + -1; >if (beg_7(D) < i_22) > goto ; [89.00%] >else > goto ; [11.00%] > > - [local count: 850510901]: > + [local count: 569842305]: >goto ; [100.00%] > > [local count: 105119324]: ># i_32 = PHI ># j_33 = PHI >if (beg_7(D) < i_32) > goto ; [80.00%] >else > goto ; [20.00%] > > [local count: 105119324]: > > [local count: 118111600]: >return 0; > > } > > 2) diff base/loop-cond-split-1.c.151t.lsplit > patched/loop-cond-split-1.c.151t.lsplit: > ... > [local count: 118111600]: >if (n_7(D) > 0) > goto ; [89.00%] >else > goto ; [11.00%] > > [local count: 118111600]: >return; > > [local
Re: [PATCH 2/3] Fix incorrect loop exit edge probability [PR103270]
On 2021/12/16 19:18, Jan Hubicka wrote: >>> >>> >>> ./contrib/analyze_brprob.py ~/workspace/tests/spec2017/dump_file_all >>> HEURISTICS BRANCHES (REL) BR. HITRATE >>> HITRATE COVERAGE COVERAGE (REL) predict.def (REL) HOT >>> branches (>10%) >>> noreturn call 1 0.0% 100.00% >>> 50.00% / 50.00% 2 2.00 0.0% 100%:1 >>> Fortran zero-sized array3 0.0% 66.67% >>> 41.71% / 60.50%362 362.00 0.0% 100%:3 >>> loop iv compare16 0.0% 93.75% >>> 98.26% / 98.76% 279847 279.85k 0.0% 93%:4 >>> __builtin_expect 35 0.0% 97.14% >>> 78.09% / 78.35% 17079558 17.08M 0.0% >>> loop guard with recursion 45 0.1% 86.67% >>> 85.13% / 85.14% 67224244126.72G 1.3% 74%:4 >>> extra loop exit80 0.1% 58.75% >>> 81.49% / 89.21% 438470261 438.47M 0.1% 86%:3 >>> guess loop iv compare 235 0.3% 80.85% >>> 52.83% / 73.97% 148558247 148.56M 0.0% 47%:3 >>> negative return 241 0.3% 71.37% >>> 25.33% / 92.61% 250402383 250.40M 0.0% 69%:2 >>> loop exit with recursion 315 0.4% 74.60% >>> 85.07% / 85.71% 94031368589.40G 1.8% 59%:4 >>> const return 320 0.4% 51.88% >>> 90.45% / 95.63% 925341727 925.34M 0.2% 76%:5 >>> indirect call 377 0.5% 51.46% >>> 84.72% / 91.14% 21337728482.13G 0.4% 69%:1 >>> polymorphic call 410 0.5% 44.15% >>> 31.26% / 79.37% 32726882443.27G 0.6% 53%:2 >>> recursive call506 0.7% 39.53% >>> 44.97% / 83.92% 12110368061.21G 0.2% 10%:1 >>> goto 618 0.8% 64.24% >>> 65.37% / 83.57% 702446178 702.45M 0.1% 20%:1 >>> null return 800 1.1% 64.62% >>> 56.59% / 77.70% 603952067 603.95M 0.1% 28%:2 >>> continue 956 1.3% 63.70% >>> 65.65% / 79.97% 37803037993.78G 0.7% 52%:3 >>> loop guard 1177 1.6% 56.33% >>> 42.54% / 80.32% 73736014577.37G 1.4% 50%:2 >>> opcode values positive (on trees)2020 2.7% 62.38% >>> 64.16% / 84.44%31695571761 31.70G 6.0% 21%:2 >>> loop exit3293 4.4% 76.19% >>> 87.18% / 88.35%50377138963 50.38G 9.6% 18%:1 >>> loop iterations 4761 6.3% 99.98% >>> 84.27% / 84.27%73463634555 73.46G 13.9% >>> pointer (on trees) 8076 10.7% 56.23% >>> 69.36% / 83.15%1232201 12.32G 2.3% >>> call11396 15.1% 64.14% >>> 74.13% / 89.82%25197949198 25.20G 4.8% 34%:1 >>> opcode values nonequal (on trees) 12237 16.3% 70.70% >>> 70.86% / 83.54%36638772333 36.64G 6.9% >>> guessed loop iterations 16760 22.3% 99.78% >>> 91.49% / 91.49% 162952747918 162.95G 30.9% >>> >>> HEURISTICS BRANCHES (REL) BR. HITRATE >>> HITRATE COVERAGE COVERAGE (REL) predict.def (REL) HOT >>> branches (>10%) >>> no prediction 12730 16.9% 39.29% >>> 33.32% / 79.93% 121106031835 121.11G 23.0% >>> first match 25261 33.6% 92.17% >>> 88.33% / 88.98% 296652487962 296.65G 56.3% >>> DS theory 28333 37.7% 63.03% >>> 72.05% / 85.00% 109563734005 109.56G 20.8% >>> combined75232 100.0% 73.17% >>> 72.32% / 86.08% 527351738575 527.35G 100.0% >>> >>> Loop count: 37870 >>> avg. # of iter: 8444.77 >>> median # of iter: 7.00 >>> avg. (1% cutoff) # of iter: 174.68 >>> avg. (5% cutoff) # of iter: 55.14 >>> avg. (10% cutoff) # of iter: 35.21 >>> avg. (20% cutoff) # of iter: 26.23 >>> avg. (30% cutoff) # of iter: 21.70 >> >> This is the output data collected without the patch, as can be seen, no >> difference on "extra loop exit". >> But
Re: [PATCH] rs6000: Replace UNSPECS with ss_plus/us_plus and ss_minus/us_minus
On 2021/12/21 09:32, David Edelsohn wrote: > Explicit clobbers like this help one side of the issue. For vscr, other > than the sat bit there is only the nj bit, and we just ignore that :-) > >> This patch is okay. Thanks for updating the machine description and >> for cleaning up the formatting. > x2. Thanks! Committed to r12-6084. -- Thanks, Xionghu
Re: [PATCH] elf: Add __libc_get_static_tls_bounds [BZ #16291]
On Mon, Nov 29, 2021 at 11:30 AM Florian Weimer wrote: > > * Fāng-ruì Sòng: > > > PING^3 > > I think the core issue with this patch is like this: > > * I do not want to commit glibc to a public API that disallows future > changes to the way we allocate static TLS. While static TLS objects > cannot move in memory, the extent of the static TLS area (minimum and > maximum address) is not fixed by ABI necessities. > > * Joseph does not want to add a GLIBC_PRIVATE ABI that is exclusively > used externally. > > I have tried repeatly to wrap my head around how the sanitizers use the > static TLS boundary information. Based on what I can tell, there are > several applications: > > 1. Thead Sanitizer needs to know application-accessible thread-specific >memory that is carried via the glibc thread (stack) cache from one >thread to another one, seemingly without synchronization. Since the >synchronization happens internally within glibc, without that extra >knowledge, Thread Sanitizer would report a false positive. This >covers only data to which the application has direct access, internal >access by glibc does not count because it is not going to be >instrumented. > > 2. Address Sanitizer needs TLS boundary information for bounds checking. >Again this only applies to accesses that can be instrumented, so only >objects whose addresses leak to application code count. (Maybe this >is a fringe use case, though: it seems to apply only to “extern >__thread int a[];“ and similar declarations, where the declared type >is not enough.) > > 3. Leak Sanitizer needs to find all per-thread pointers pointing into >the malloc heap, within memory not itself allocated by malloc. This >includes the DTV, pthread_getspecific metadata, and perhaps also user >data set by pthread_getspecific, and of course static TLS variables. >This goes someone beyond what people would usually consider static >TLS: the DTV and struct pthread are not really part of static TLS as >such. And struct pthread has several members that contain malloc'ed >pointers. > > Is this a complete list of uses? Perhaps add HWAddressSanitizer along with Address Sanitizer and Memory Sanitizer along with Thread Sanitizer. Then I think the list is complete. > I *think* you can get what you need via existing GLIBC_PRIVATE > interfaces. But in order to describe how to caox the data out of glibc, > I need to know what you need. Unfortunate no, not reliably. Currently _dl_get_tls_static_info (https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp#L207) is used. It has the size information but not the start address, so the sanitizer runtime is doing very ugly/error-prone probing of the start address by reading the thread pointer and hard coding the `struct pthread` size for various glibc versions. Every time `struct pthreads` increases in glibc, sanitizer runtime has to adapt. __libc_get_static_tls_bounds if supported by glibc, can replace _dl_get_tls_static_info and the existing glibc specific GetTls hacks, and provide a bit more compatibility when glibc struct pthreads increases. > (Cc:ing a few people from a recent GCC thread.) > > Thanks, > Florian > -- 宋方睿
Re: [PATCH] rs6000: Replace UNSPECS with ss_plus/us_plus and ss_minus/us_minus
On 2021/12/21 10:19, Xionghu Luo via Gcc-patches wrote: > > > On 2021/12/21 09:32, David Edelsohn wrote: >> On Mon, Dec 20, 2021 at 6:55 PM Segher Boessenkool >> wrote: >>> >>> On Mon, Dec 20, 2021 at 11:45:45AM -0500, David Edelsohn wrote: On Mon, Dec 20, 2021 at 3:24 AM Xionghu Luo wrote: > These four UNSPECS seems could be replaced with native RTL, and why > "(set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))" > in the RTL pattern, per ISA of VSCR bit 127(VECTOR Saturation, SAT): > > This bit is sticky; that is, once set to 1 it > remains set to 1 until it is set to 0 by an > mtvscr instruction. > > The RTL pattern set it to 0 but final ASM doesn't present it? And why > not use Clobber VSCR_REGNO instead? The design came from the early implementation of Altivec: https://gcc.gnu.org/pipermail/gcc-patches/2002-May/077409.html > > Thanks, I constructed a testcase for this, > > cat vadds.c > #include > #include > vector signed int foo1 (vector signed int a, vector signed int c) { > vector signed int ret = vec_vaddsws (a, c); > union {vector unsigned short v; unsigned short s[8];} u; > u.v = vec_mfvscr(); > printf("foo1: vscr == { %d,%d,%d,%d,%d,%d,%d,%d }\n", u.s[0], u.s[1], > u.s[2], > u.s[3], u.s[4], u.s[5], u.s[6], u.s[7]); > return a; > } > > vector signed int foo2 (vector signed int a, vector signed int c) { > vector signed int ret = vec_vaddsws (a, c); > union {vector unsigned short v; unsigned short s[8];} u; > u.v = vec_mfvscr(); > printf("foo2: vscr == { %d,%d,%d,%d,%d,%d,%d,%d }\n", u.s[0], u.s[1], > u.s[2], > u.s[3], u.s[4], u.s[5], u.s[6], u.s[7]); > return ret; > } > > int main () > { > vector signed int a = {0x12345678, 0x22345678, 0x32345678, 0x42345678}; > vector signed int c = {0x12345678, 0x22345678, 0x32345678, 0x42345678}; > vector signed int b1 = foo1 (a, c); > vector signed int b2 = foo2 (a, c); > return b1[0]+b2[0]; > } > > The output is: > > ./a.out > foo1: vscr == { 0,0,0,0,0,0,0,0 } > foo2: vscr == { 1,0,0,0,0,0,0,0 } > > > So is this expected behavior? Seems doesn't meet with Aldy's > description... (foo1's temp is optimized away so no vaddsws produced) Just realized that foo1's "ret" is optimized way in gimple already, so no such RTL and vaddsws generated in foo1, only foo2's vaddsws will set VSCR explicitly. > > " foo() > { > vector int result; > > result = vec_adds(blah, blah); > __check_for_saturation__ > } > > gcc, will optimize away vec_adds() because the result is a local > variable unused later. then when we check the saturation bit in VSCR, > we get wrong results. > > this patch explains to gcc all about VSCR, and adds it as a global > register as well." > > If one later checks for saturation (reads VSCR), one needs a corresponding SET of the value. It's set in an architecture-specific manner that isn't described to GCC, but it's set, not just clobbered and in an undefined state. >>> >>> Well. RTL clobber and set do exactly the same thing, except with >>> clobber it is not specified *what* value is set. All bits are set, all >>> bits are defined. There is no (direct) way in RTL to say >>> "undetermined". >>> >>> An RTL clobber would work just fine afaics? >> >> I don't know about the original intention from Aldy, but if one were >> looking at an RTL dump and the code used the saturation bit from VSCR, >> it might be confusing to see a CLOBBER instead of a SET. The SET >> documents that VSCR_REGNO is assigned a specific value; GCC doesn't >> know about the semantics, but it's not some undefined bit pattern. >> CLOBBER implies a trash value or a value that one will not query >> later, i.e., one would want to SET the register to a specific value >> before using it. Agree that SET is better than CLOBBER. Thank you! >> >>> The RTL does not describe that VSCR is set to the value 0. The (const_int 0) is not the value set. You can think of the (const_int 0) as a dummy RTL argument to the VSCR UNSPEC. UNSPEC requires at least one argument and the pattern doesn't try to express the argument, so it uses a dummy RTL constant. >>> >>> Yup. Traditionally (pc) was used for this. Nowadays (const_int 0) is >>> not really more expensive anymore, and many people find it clearer (but >>> not in this case it seems :-) ). >>> It's part of a PARALLEL and the plus or minus already expresses the data dependency of the pattern on the input operands. >>> >>> But they do not describe any dependency on vscr, or output to it. This >>> is the same problem we have with fpscr (most FP insns use some of its >>> fields, most set some, but there is no way to cleanly express that). >> >> It describes that VSCR_REGNO is set, an output. It doesn't describe >> how it is set nor inform the compiler that the value depends on the >> input operands in some
Re: [PATCH] rs6000: Replace UNSPECS with ss_plus/us_plus and ss_minus/us_minus
On 2021/12/21 09:32, David Edelsohn wrote: > On Mon, Dec 20, 2021 at 6:55 PM Segher Boessenkool > wrote: >> >> On Mon, Dec 20, 2021 at 11:45:45AM -0500, David Edelsohn wrote: >>> On Mon, Dec 20, 2021 at 3:24 AM Xionghu Luo wrote: These four UNSPECS seems could be replaced with native RTL, and why "(set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))" in the RTL pattern, per ISA of VSCR bit 127(VECTOR Saturation, SAT): This bit is sticky; that is, once set to 1 it remains set to 1 until it is set to 0 by an mtvscr instruction. The RTL pattern set it to 0 but final ASM doesn't present it? And why not use Clobber VSCR_REGNO instead? >>> >>> The design came from the early implementation of Altivec: >>> >>> https://gcc.gnu.org/pipermail/gcc-patches/2002-May/077409.html Thanks, I constructed a testcase for this, cat vadds.c #include #include vector signed int foo1 (vector signed int a, vector signed int c) { vector signed int ret = vec_vaddsws (a, c); union {vector unsigned short v; unsigned short s[8];} u; u.v = vec_mfvscr(); printf("foo1: vscr == { %d,%d,%d,%d,%d,%d,%d,%d }\n", u.s[0], u.s[1], u.s[2], u.s[3], u.s[4], u.s[5], u.s[6], u.s[7]); return a; } vector signed int foo2 (vector signed int a, vector signed int c) { vector signed int ret = vec_vaddsws (a, c); union {vector unsigned short v; unsigned short s[8];} u; u.v = vec_mfvscr(); printf("foo2: vscr == { %d,%d,%d,%d,%d,%d,%d,%d }\n", u.s[0], u.s[1], u.s[2], u.s[3], u.s[4], u.s[5], u.s[6], u.s[7]); return ret; } int main () { vector signed int a = {0x12345678, 0x22345678, 0x32345678, 0x42345678}; vector signed int c = {0x12345678, 0x22345678, 0x32345678, 0x42345678}; vector signed int b1 = foo1 (a, c); vector signed int b2 = foo2 (a, c); return b1[0]+b2[0]; } The output is: ./a.out foo1: vscr == { 0,0,0,0,0,0,0,0 } foo2: vscr == { 1,0,0,0,0,0,0,0 } So is this expected behavior? Seems doesn't meet with Aldy's description... (foo1's temp is optimized away so no vaddsws produced) " foo() { vector int result; result = vec_adds(blah, blah); __check_for_saturation__ } gcc, will optimize away vec_adds() because the result is a local variable unused later. then when we check the saturation bit in VSCR, we get wrong results. this patch explains to gcc all about VSCR, and adds it as a global register as well." >>> >>> If one later checks for saturation (reads VSCR), one needs a >>> corresponding SET of the value. It's set in an architecture-specific >>> manner that isn't described to GCC, but it's set, not just clobbered >>> and in an undefined state. >> >> Well. RTL clobber and set do exactly the same thing, except with >> clobber it is not specified *what* value is set. All bits are set, all >> bits are defined. There is no (direct) way in RTL to say >> "undetermined". >> >> An RTL clobber would work just fine afaics? > > I don't know about the original intention from Aldy, but if one were > looking at an RTL dump and the code used the saturation bit from VSCR, > it might be confusing to see a CLOBBER instead of a SET. The SET > documents that VSCR_REGNO is assigned a specific value; GCC doesn't > know about the semantics, but it's not some undefined bit pattern. > CLOBBER implies a trash value or a value that one will not query > later, i.e., one would want to SET the register to a specific value > before using it. > >> >>> The RTL does not describe that VSCR is set to the value 0. The >>> (const_int 0) is not the value set. You can think of the (const_int >>> 0) as a dummy RTL argument to the VSCR UNSPEC. UNSPEC requires at >>> least one argument and the pattern doesn't try to express the >>> argument, so it uses a dummy RTL constant. >> >> Yup. Traditionally (pc) was used for this. Nowadays (const_int 0) is >> not really more expensive anymore, and many people find it clearer (but >> not in this case it seems :-) ). >> >>> It's part of a PARALLEL >>> and the plus or minus already expresses the data dependency of the >>> pattern on the input operands. >> >> But they do not describe any dependency on vscr, or output to it. This >> is the same problem we have with fpscr (most FP insns use some of its >> fields, most set some, but there is no way to cleanly express that). > > It describes that VSCR_REGNO is set, an output. It doesn't describe > how it is set nor inform the compiler that the value depends on the > input operands in some complicated way unknown to the compiler, but > the compiler cannot do anything useful with the additional > information. > >> >> Explicit clobbers like this help one side of the issue. For vscr, other >> than the sat bit there is only the nj bit, and we just ignore that :-) >> >>> This patch is okay. Thanks for updating the machine description and >>> for cleaning up the formatting. >> >> x2. Thanks! >> >> >> Segher -- Thanks, Xionghu
Re: [PATCH] rs6000: Replace UNSPECS with ss_plus/us_plus and ss_minus/us_minus
On Mon, Dec 20, 2021 at 6:55 PM Segher Boessenkool wrote: > > On Mon, Dec 20, 2021 at 11:45:45AM -0500, David Edelsohn wrote: > > On Mon, Dec 20, 2021 at 3:24 AM Xionghu Luo wrote: > > > These four UNSPECS seems could be replaced with native RTL, and why > > > "(set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))" > > > in the RTL pattern, per ISA of VSCR bit 127(VECTOR Saturation, SAT): > > > > > > This bit is sticky; that is, once set to 1 it > > > remains set to 1 until it is set to 0 by an > > > mtvscr instruction. > > > > > > The RTL pattern set it to 0 but final ASM doesn't present it? And why > > > not use Clobber VSCR_REGNO instead? > > > > The design came from the early implementation of Altivec: > > > > https://gcc.gnu.org/pipermail/gcc-patches/2002-May/077409.html > > > > If one later checks for saturation (reads VSCR), one needs a > > corresponding SET of the value. It's set in an architecture-specific > > manner that isn't described to GCC, but it's set, not just clobbered > > and in an undefined state. > > Well. RTL clobber and set do exactly the same thing, except with > clobber it is not specified *what* value is set. All bits are set, all > bits are defined. There is no (direct) way in RTL to say > "undetermined". > > An RTL clobber would work just fine afaics? I don't know about the original intention from Aldy, but if one were looking at an RTL dump and the code used the saturation bit from VSCR, it might be confusing to see a CLOBBER instead of a SET. The SET documents that VSCR_REGNO is assigned a specific value; GCC doesn't know about the semantics, but it's not some undefined bit pattern. CLOBBER implies a trash value or a value that one will not query later, i.e., one would want to SET the register to a specific value before using it. > > > The RTL does not describe that VSCR is set to the value 0. The > > (const_int 0) is not the value set. You can think of the (const_int > > 0) as a dummy RTL argument to the VSCR UNSPEC. UNSPEC requires at > > least one argument and the pattern doesn't try to express the > > argument, so it uses a dummy RTL constant. > > Yup. Traditionally (pc) was used for this. Nowadays (const_int 0) is > not really more expensive anymore, and many people find it clearer (but > not in this case it seems :-) ). > > > It's part of a PARALLEL > > and the plus or minus already expresses the data dependency of the > > pattern on the input operands. > > But they do not describe any dependency on vscr, or output to it. This > is the same problem we have with fpscr (most FP insns use some of its > fields, most set some, but there is no way to cleanly express that). It describes that VSCR_REGNO is set, an output. It doesn't describe how it is set nor inform the compiler that the value depends on the input operands in some complicated way unknown to the compiler, but the compiler cannot do anything useful with the additional information. > > Explicit clobbers like this help one side of the issue. For vscr, other > than the sat bit there is only the nj bit, and we just ignore that :-) > > > This patch is okay. Thanks for updating the machine description and > > for cleaning up the formatting. > > x2. Thanks! > > > Segher
Re: [PATCH] rs6000: Replace UNSPECS with ss_plus/us_plus and ss_minus/us_minus
On Mon, Dec 20, 2021 at 11:45:45AM -0500, David Edelsohn wrote: > On Mon, Dec 20, 2021 at 3:24 AM Xionghu Luo wrote: > > These four UNSPECS seems could be replaced with native RTL, and why > > "(set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))" > > in the RTL pattern, per ISA of VSCR bit 127(VECTOR Saturation, SAT): > > > > This bit is sticky; that is, once set to 1 it > > remains set to 1 until it is set to 0 by an > > mtvscr instruction. > > > > The RTL pattern set it to 0 but final ASM doesn't present it? And why > > not use Clobber VSCR_REGNO instead? > > The design came from the early implementation of Altivec: > > https://gcc.gnu.org/pipermail/gcc-patches/2002-May/077409.html > > If one later checks for saturation (reads VSCR), one needs a > corresponding SET of the value. It's set in an architecture-specific > manner that isn't described to GCC, but it's set, not just clobbered > and in an undefined state. Well. RTL clobber and set do exactly the same thing, except with clobber it is not specified *what* value is set. All bits are set, all bits are defined. There is no (direct) way in RTL to say "undetermined". An RTL clobber would work just fine afaics? > The RTL does not describe that VSCR is set to the value 0. The > (const_int 0) is not the value set. You can think of the (const_int > 0) as a dummy RTL argument to the VSCR UNSPEC. UNSPEC requires at > least one argument and the pattern doesn't try to express the > argument, so it uses a dummy RTL constant. Yup. Traditionally (pc) was used for this. Nowadays (const_int 0) is not really more expensive anymore, and many people find it clearer (but not in this case it seems :-) ). > It's part of a PARALLEL > and the plus or minus already expresses the data dependency of the > pattern on the input operands. But they do not describe any dependency on vscr, or output to it. This is the same problem we have with fpscr (most FP insns use some of its fields, most set some, but there is no way to cleanly express that). Explicit clobbers like this help one side of the issue. For vscr, other than the sat bit there is only the nj bit, and we just ignore that :-) > This patch is okay. Thanks for updating the machine description and > for cleaning up the formatting. x2. Thanks! Segher
Re: [PATCH, rs6000] Implement mffscrni pattern
Hi! On Mon, Dec 20, 2021 at 01:55:51PM +0800, HAO CHEN GUI wrote: > * config/rs6000/rs6000-call.c > (rs6000_expand_set_fpscr_rn_builtin): Not copy argument to a reg if > it's a constant. The pattern for constant can be recognized now. (Two spaces after full stop). > +;; Return 1 if op is an unsigned 2-bit constant integer. > +(define_predicate "u2bit_cint_operand" > + (and (match_code "const_int") > + (match_test "INTVAL (op) >= 0 && INTVAL (op) <= 3"))) Simple project for anyone: use IN_RANGE more :-) > --- a/gcc/config/rs6000/rs6000.md > +++ b/gcc/config/rs6000/rs6000.md > @@ -177,6 +177,7 @@ (define_c_enum "unspecv" > UNSPECV_MFFS ; Move from FPSCR > UNSPECV_MFFSL ; Move from FPSCR light instruction version > UNSPECV_MFFSCRN ; Move from FPSCR float rounding mode > + UNSPECV_MFFSCRNI ; Move from FPSCR float rounding mode with imm Why can this not just use the existing UNSPECV_MFFSCRN? That way, an mffsrcn can automatically morph into an mffscrni if the argument is a constant integer, etc. > @@ -6333,9 +6342,15 @@ (define_expand "rs6000_set_fpscr_rn" > new rounding mode bits from operands[0][62:63] into FPSCR[62:63]. */ >if (TARGET_P9_MISC) > { > - rtx src_df = force_reg (DImode, operands[0]); > - src_df = simplify_gen_subreg (DFmode, src_df, DImode, 0); > - emit_insn (gen_rs6000_mffscrn (tmp_df, src_df)); > + if (CONST_INT_P (operands[0]) > + && const_0_to_3_operand (operands[0], VOIDmode)) const_0_to_3_operand already checks that CONST_INT_P is true (so you do not need to test the latter separately). > @@ -6357,7 +6372,8 @@ (define_expand "rs6000_set_fpscr_rn" >rtx tmp_di = gen_reg_rtx (DImode); > >/* Extract new RN mode from operand. */ > - emit_insn (gen_anddi3 (tmp_rn, operands[0], GEN_INT (0x3))); > + rtx op0 = convert_to_mode (DImode, operands[0], false); > + emit_insn (gen_anddi3 (tmp_rn, op0, GEN_INT (0x3))); Please write 3 as 3. Not as 0x0003, not as 0x03, not as 0x3. I realise you just copied this, but :-) > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/mffscrni_p9.c > @@ -0,0 +1,9 @@ > +/* { dg-do compile { target { has_arch_pwr9 } } } */ Don't do that? Instead, use -mdejagnu-cpu=power9. This will give more coverage, and also will make us need less future test maintenance (if on Power28 we will generate different code for this, for example). > +void __attribute__ ((noinline)) wrap_set_fpscr_rn (int val) > +{ > + __builtin_set_fpscr_rn (val); > +} Should be noipa, not just noinline? Segher
Re: [PATCH v4] c-format: Add -Wformat-int-precision option [PR80060]
On Sat, 27 Nov 2021 22:18:23 + Daniil Stas wrote: > This option is enabled by default when -Wformat option is enabled. A > user can specify -Wno-format-int-precision to disable emitting > warnings when passing an argument of an incompatible integer type to > a 'd', 'i', 'b', 'B', 'o', 'u', 'x', or 'X' conversion specifier when > it has the same precision as the expected type. > > Signed-off-by: Daniil Stas > > gcc/c-family/ChangeLog: > > * c-format.c (check_format_types): Don't emit warnings when > passing an argument of an incompatible integer type to > a 'd', 'i', 'b', 'B', 'o', 'u', 'x', or 'X' conversion > specifier when it has the same precision as the expected type > if -Wno-format-int-precision option is specified. > * c.opt: Add -Wformat-int-precision option. > > gcc/ChangeLog: > > * doc/invoke.texi: Add -Wformat-int-precision option > description. > > gcc/testsuite/ChangeLog: > > * c-c++-common/Wformat-int-precision-1.c: New test. > * c-c++-common/Wformat-int-precision-2.c: New test. > --- > Changes for v4: > - Added 'b' and 'B' format specifiers to the option descriptions. > > Changes for v3: > - Added additional @code{} derictives to the documentation where > needed. > - Changed tests to run on "! long_neq_int" target instead of "lp64". > - Added a test case to check that gcc still emits warnings for > arguments with different precision even with > -Wno-format-int-precision option enabled. > > Changes for v2: > - Changed the option name to -Wformat-int-precision. > - Changed the option description as was suggested by Martin. > - Changed Wformat-int-precision-2.c to use dg-bogus instead of > previous invalid syntax. > > gcc/c-family/c-format.c | 2 +- > gcc/c-family/c.opt | 6 ++ > gcc/doc/invoke.texi | 17 > - .../c-c++-common/Wformat-int-precision-1.c | > 7 +++ .../c-c++-common/Wformat-int-precision-2.c | 8 > 5 files changed, 38 insertions(+), 2 deletions(-) > create mode 100644 > gcc/testsuite/c-c++-common/Wformat-int-precision-1.c create mode > 100644 gcc/testsuite/c-c++-common/Wformat-int-precision-2.c > > diff --git a/gcc/c-family/c-format.c b/gcc/c-family/c-format.c > index e735e092043..c66787f931f 100644 > --- a/gcc/c-family/c-format.c > +++ b/gcc/c-family/c-format.c > @@ -4248,7 +4248,7 @@ check_format_types (const substring_loc > _loc, && (!pedantic || i < 2) > && char_type_flag) > continue; > - if (types->scalar_identity_flag > + if ((types->scalar_identity_flag || !warn_format_int_precision) > && (TREE_CODE (cur_type) == TREE_CODE (wanted_type) > || (INTEGRAL_TYPE_P (cur_type) > && INTEGRAL_TYPE_P (wanted_type))) > diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt > index 4b8a094b206..d7d952765c6 100644 > --- a/gcc/c-family/c.opt > +++ b/gcc/c-family/c.opt > @@ -684,6 +684,12 @@ C ObjC C++ LTO ObjC++ Warning > Alias(Wformat-overflow=, 1, 0) IntegerRange(0, 2) Warn about function > calls with format strings that write past the end of the destination > region. Same as -Wformat-overflow=1. > +Wformat-int-precision > +C ObjC C++ ObjC++ Var(warn_format_int_precision) Warning > LangEnabledBy(C ObjC C++ ObjC++,Wformat=,warn_format >= 1, 0) +Warn > when passing an argument of an incompatible integer type to a 'd', > 'i', +'b', 'B', 'o', 'u', 'x', or 'X' conversion specifier even when > it has the same +precision as the expected type. + > Wformat-security > C ObjC C++ ObjC++ Var(warn_format_security) Warning LangEnabledBy(C > ObjC C++ ObjC++,Wformat=, warn_format >= 2, 0) Warn about possible > security problems with format functions. diff --git > a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index > 3bddfbaae6a..94a7ad96c50 100644 --- a/gcc/doc/invoke.texi > +++ b/gcc/doc/invoke.texi > @@ -351,7 +351,7 @@ Objective-C and Objective-C++ Dialects}. > -Werror -Werror=* -Wexpansion-to-defined -Wfatal-errors @gol > -Wfloat-conversion -Wfloat-equal -Wformat -Wformat=2 @gol > -Wno-format-contains-nul -Wno-format-extra-args @gol > --Wformat-nonliteral -Wformat-overflow=@var{n} @gol > +-Wformat-nonliteral -Wformat-overflow=@var{n} > -Wformat-int-precision @gol -Wformat-security -Wformat-signedness > -Wformat-truncation=@var{n} @gol -Wformat-y2k -Wframe-address @gol > -Wframe-larger-than=@var{byte-size} -Wno-free-nonheap-object @gol > @@ -6122,6 +6122,21 @@ If @option{-Wformat} is specified, also warn > if the format string is not a string literal and so cannot be > checked, unless the format function takes its format arguments as a > @code{va_list}. > +@item -Wformat-int-precision > +@opindex Wformat-int-precision > +@opindex Wno-format-int-precision > +Warn when passing an argument of an incompatible integer type to > +a @samp{d}, @samp{i}, @samp{b}, @samp{B}, @samp{o}, @samp{u}, > @samp{x}, +or @samp{X}
[PATCH v2] ix86: Don't use the 'm' constraint for x86_64_general_operand
On Mon, Dec 20, 2021 at 12:38 PM Jakub Jelinek wrote: > > On Mon, Dec 20, 2021 at 11:44:08AM -0800, H.J. Lu wrote: > > The problem is in > > > > (define_memory_constraint "TARGET_MEM_CONSTRAINT" > > "Matches any valid memory." > > (and (match_code "mem") > >(match_test "memory_address_addr_space_p (GET_MODE (op), XEXP (op, > > 0), > > MEM_ADDR_SPACE (op))"))) > > > > define_register_constraint allows LRA to convert the operand to the form > > '(mem (reg X))', where X is a base register. I am testing the v2 patch with > > If you mean replacing an immediate with a MEM containing that immediate, > isn't that often the right thing though? > I mean, if the register pressure is high and options are either spill some > register, set it to immediate, use it in one instruction and then fill the > spilled register (i.e. 2 memory loads), compared to a MEM use on the > arithmetic instruction one MEM seems cheaper to me. With -fPIC and the > cst needing runtime relocation slightly less so of course. > We will check the performance impact on SPEC CPU 2017. Here is the v2 patch. Liwei, can you help collect SPEC CPU 2017 impact of the enclosed patch? Thanks. > The code due to ivopts is trying to have something like > size_t a = (size_t) _list; > size_t b = 0xffa8 - a; > size_t c = x + b; > and for that cst - one needs actually 2 registers, one to hold the > constant and one to hold the (%rip) based address. > (insn 790 789 791 111 (set (reg:DI 292) > (const_int -88 [0xffa8])) "dl-tunables.c":304:15 76 > {*movdi_internal} > (nil)) > (insn 791 790 792 111 (set (reg:DI 293) > (symbol_ref:DI ("tunable_list") [flags 0x2] tunable_list>)) "dl-tunables.c":304:15 76 {*movdi_internal} > (nil)) > (insn 792 791 793 111 (parallel [ > (set (reg:DI 291) > (minus:DI (reg:DI 292) > (reg:DI 293))) > (clobber (reg:CC 17 flags)) > ]) "dl-tunables.c":304:15 299 {*subdi_1} > (nil)) > (insn 793 792 794 111 (parallel [ > (set (reg:DI 294) > (plus:DI (reg:DI 291) > (reg:DI 198 [ ivtmp.176 ]))) > (clobber (reg:CC 17 flags)) > ]) "dl-tunables.c":304:15 226 {*adddi_1} > (nil)) > It would be smarter to rewrite the above into a lea 88+tunable_list(%rip), > %temp1 > and use a subtraction instead of addition in the last insn above, or of > course in the particular case even consider the following 2 instructions > that do: > (insn 794 793 795 111 (set (reg:DI 296) > (symbol_ref:DI ("tunable_list") [flags 0x2] tunable_list>)) "dl-tunables.c":304:15 76 {*movdi_internal} > (nil)) > (insn 795 794 796 111 (parallel [ > (set (reg:DI 295 [ cur ]) > (plus:DI (reg:DI 294) > (reg:DI 296))) > (clobber (reg:CC 17 flags)) > ]) "dl-tunables.c":304:15 226 {*adddi_1} > (nil)) > and find out that _list - _list is 0 and we don't need it at > all. Guess we don't figure that out due to the cast of one of those > addresses to size_t and the other one used in POINTER_PLUS_EXPR as normal > pointer. > > Jakub > -- H.J. From 45676709be2a39920e0e9326049676e6dfdd7885 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Sun, 19 Dec 2021 08:47:03 -0800 Subject: [PATCH v2] ix86: Don't use the 'm' constraint for x86_64_general_operand The 'm' constraint is defined with define_memory_constraint which allows LRA to convert the operand to the form '(mem (reg X))', where X is a base register. To prevent LRA from generating '(mem (reg X))' from a register: 1. Add a 'BM' constraint which is similar to the 'm' constraint, but is defined with define_constraint. 2. Add a 'm' mode attribute which is mapped to the 'm' constraint for general_operand and the 'BM' constraint for x86_64_general_operand. 3. Replace the 'm' constraint on with the '' constraint. 4. Replace the 'm' constraint on x86_64_general_operand with the 'BM' constraint. gcc/ PR target/103762 * config/i386/constraints.md (BM): New constraint. * config/i386/i386.md (m): New mode attribute. Replace the 'm' constraint on with the '' constraint. Replace the 'm' constraint on x86_64_general_operand with the 'BM' constraint. gcc/testsuite/ * gcc.target/i386/pr103762-1a.c: New test. * gcc.target/i386/pr103762-1b.c: Likewise. * gcc.target/i386/pr103762-1c.c: Likewise. --- gcc/config/i386/constraints.md | 10 + gcc/config/i386/i386.md | 67 +- gcc/testsuite/gcc.target/i386/pr103762-1a.c | 647 gcc/testsuite/gcc.target/i386/pr103762-1b.c | 7 + gcc/testsuite/gcc.target/i386/pr103762-1c.c | 7 + 5 files changed, 706 insertions(+), 32 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr103762-1a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr103762-1b.c create mode 100644
[PATCH] PR fortran/103777 - ICE in gfc_simplify_maskl, at fortran/simplify.c:4918
Dear all, we need to check the arguments of the elemental MASKL and MASKR intrinsics also before simplifying. Testcase by Gerhard. The fix is almost obvious, but I'm happy to get feedback in case there is something I overlooked. (There is already a check on scalar arguments to MASKL/MASKR, which however misses the case of array arguments.) Regtested on x86_64-pc-linux-gnu. OK for mainline? Thanks, Harald From b58a44bc861ee3d1e67e3b7c949a301b6290c05c Mon Sep 17 00:00:00 2001 From: Harald Anlauf Date: Mon, 20 Dec 2021 22:59:53 +0100 Subject: [PATCH] Fortran: check arguments of MASKL/MASKR intrinsics before simplification gcc/fortran/ChangeLog: PR fortran/103777 * simplify.c (gfc_simplify_maskr): Check validity of argument 'I' before simplifying. (gfc_simplify_maskl): Likewise. gcc/testsuite/ChangeLog: PR fortran/103777 * gfortran.dg/masklr_3.f90: New test. --- gcc/fortran/simplify.c | 6 ++ gcc/testsuite/gfortran.dg/masklr_3.f90 | 14 ++ 2 files changed, 20 insertions(+) create mode 100644 gcc/testsuite/gfortran.dg/masklr_3.f90 diff --git a/gcc/fortran/simplify.c b/gcc/fortran/simplify.c index 90067b6bbe6..b6f636d4ff1 100644 --- a/gcc/fortran/simplify.c +++ b/gcc/fortran/simplify.c @@ -4878,6 +4878,9 @@ gfc_simplify_maskr (gfc_expr *i, gfc_expr *kind_arg) bool fail = gfc_extract_int (i, ); gcc_assert (!fail); + if (!gfc_check_mask (i, kind_arg)) +return _bad_expr; + result = gfc_get_constant_expr (BT_INTEGER, kind, >where); /* MASKR(n) = 2^n - 1 */ @@ -4909,6 +4912,9 @@ gfc_simplify_maskl (gfc_expr *i, gfc_expr *kind_arg) bool fail = gfc_extract_int (i, ); gcc_assert (!fail); + if (!gfc_check_mask (i, kind_arg)) +return _bad_expr; + result = gfc_get_constant_expr (BT_INTEGER, kind, >where); /* MASKL(n) = 2^bit_size - 2^(bit_size - n) */ diff --git a/gcc/testsuite/gfortran.dg/masklr_3.f90 b/gcc/testsuite/gfortran.dg/masklr_3.f90 new file mode 100644 index 000..eb689f0f408 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/masklr_3.f90 @@ -0,0 +1,14 @@ +! { dg-do compile } +! PR fortran/103777 - ICE in gfc_simplify_maskl +! Contributed by G.Steinmetz + +program p + print *, maskl([999]) ! { dg-error "must be less than or equal" } + print *, maskr([999]) ! { dg-error "must be less than or equal" } + print *, maskl([-999]) ! { dg-error "must be nonnegative" } + print *, maskr([-999]) ! { dg-error "must be nonnegative" } + print *, maskl([32],kind=4) + print *, maskl([33],kind=4) ! { dg-error "must be less than or equal" } + print *, maskl([64],kind=8) + print *, maskl([65],kind=8) ! { dg-error "must be less than or equal" } +end -- 2.26.2
[PATCH] Register --sysroot in the driver switches table
Hello, This change adjusts the processing of --sysroot to save the option in the internal "switches" array, which lets self-specs test for it and provide a default value possibly dependent on environment variables, as in --with-specs=%{!-sysroot*:--sysroot=%:getenv("WIND_BASE" /target)} This helps the use we have of self specs for VxWorks, and was bootstrapped and regression tested on native 64bit linux. Ok to commit ? Thanks in advance, With Kind Regards, Olivier >From 964829ee06ffe1bedcbab6a3b4c92c5d161aaaed Mon Sep 17 00:00:00 2001 From: Olivier Hainque Date: Mon, 20 Dec 2021 17:47:24 + Subject: [PATCH] Register --sysroot in the driver switches table This change adjusts the processing of --sysroot to save the option in the internal "switches" array, which lets self-specs test for it and provide a default value possibly dependent on environment variables, as in --with-specs=%{!-sysroot*:--sysroot=%:getenv("WIND_BASE" /target)} 2021-12-20 Olivier Hainque gcc/ * gcc.c (driver_handle_option): do_save --sysroot. --- gcc/gcc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gcc/gcc.c b/gcc/gcc.c index b75b50b87b2..beb37fe980e 100644 --- a/gcc/gcc.c +++ b/gcc/gcc.c @@ -4488,7 +4488,9 @@ driver_handle_option (struct gcc_options *opts, case OPT__sysroot_: target_system_root = arg; target_system_root_changed = 1; - do_save = false; + /* Saving this option is useful to let self-specs decide to + provide a default one. */ + do_save = true; break; case OPT_time_: -- 2.25.1
[PATCH] PR fortran/103778 - [10/11/12 Regression] ICE: Invalid expression in gfc_element_size
Dear all, again found by Gerhard: using a BOZ literal constant in situations where an interoperable object is expected can lead to an ICE. But obviously a BOZ in not interoperable. Obvious patch, regtested on x86_64-pc-linux-gnu. Will commit within 48h unless there are objections or better suggestions. Thanks, Harald From 2e6c83fbddda3215faf111263ebfc754bc07096c Mon Sep 17 00:00:00 2001 From: Harald Anlauf Date: Mon, 20 Dec 2021 22:12:33 +0100 Subject: [PATCH] Fortran: BOZ literal constants are not interoperable gcc/fortran/ChangeLog: PR fortran/103778 * check.c (is_c_interoperable): A BOZ literal constant is not interoperable. gcc/testsuite/ChangeLog: PR fortran/103778 * gfortran.dg/illegal_boz_arg_3.f90: New test. --- gcc/fortran/check.c | 6 ++ gcc/testsuite/gfortran.dg/illegal_boz_arg_3.f90 | 7 +++ 2 files changed, 13 insertions(+) create mode 100644 gcc/testsuite/gfortran.dg/illegal_boz_arg_3.f90 diff --git a/gcc/fortran/check.c b/gcc/fortran/check.c index 625473c90d1..b4db9337e9f 100644 --- a/gcc/fortran/check.c +++ b/gcc/fortran/check.c @@ -5185,6 +5185,12 @@ is_c_interoperable (gfc_expr *expr, const char **msg, bool c_loc, bool c_f_ptr) return false; } + if (expr->ts.type == BT_BOZ) +{ + *msg = "BOZ literal constant"; + return false; +} + if (expr->ts.type == BT_CLASS) { *msg = "Expression is polymorphic"; diff --git a/gcc/testsuite/gfortran.dg/illegal_boz_arg_3.f90 b/gcc/testsuite/gfortran.dg/illegal_boz_arg_3.f90 new file mode 100644 index 000..59fefa90971 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/illegal_boz_arg_3.f90 @@ -0,0 +1,7 @@ +! { dg-do compile } +! PR fortran/103778 + +program p + use iso_c_binding, only : c_sizeof + integer, parameter :: a = c_sizeof(z'1') ! { dg-error "cannot appear" } +end -- 2.26.2
[PATCH] PR fortran/103776 - ICE in gfc_compare_string, at fortran/arith.c:1118
Dear all, we need to check that expressions in CASE selectors are scalar, and reject them early if they aren't. Testcase by Gerhard. The fix is obvious and sort of a follow-up to the fix for PR103591. Regtested on x86_64-pc-linux-gnu. Will commit as obvious within 48h unless there are objections or other comments. Thanks, Harald From 70385de214fe4de289c56e7f4e06a4b252414379 Mon Sep 17 00:00:00 2001 From: Harald Anlauf Date: Mon, 20 Dec 2021 22:01:05 +0100 Subject: [PATCH] Fortran: CASE selector expressions must be scalar gcc/fortran/ChangeLog: PR fortran/103776 * match.c (match_case_selector): Reject expressions in CASE selector which are not scalar. gcc/testsuite/ChangeLog: PR fortran/103776 * gfortran.dg/select_10.f90: New test. --- gcc/fortran/match.c | 13 + gcc/testsuite/gfortran.dg/select_10.f90 | 25 + 2 files changed, 38 insertions(+) create mode 100644 gcc/testsuite/gfortran.dg/select_10.f90 diff --git a/gcc/fortran/match.c b/gcc/fortran/match.c index 52bc5af7542..617fb35c9cd 100644 --- a/gcc/fortran/match.c +++ b/gcc/fortran/match.c @@ -6088,6 +6088,19 @@ match_case_selector (gfc_case **cp) } } + if (c->low && c->low->rank != 0) +{ + gfc_error ("Expression in CASE selector at %L must be scalar", + >low->where); + goto cleanup; +} + if (c->high && c->high->rank != 0) +{ + gfc_error ("Expression in CASE selector at %L must be scalar", + >high->where); + goto cleanup; +} + *cp = c; return MATCH_YES; diff --git a/gcc/testsuite/gfortran.dg/select_10.f90 b/gcc/testsuite/gfortran.dg/select_10.f90 new file mode 100644 index 000..2d9b0170ce9 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/select_10.f90 @@ -0,0 +1,25 @@ +! { dg-do compile } +! PR fortran/103776 - ICE in gfc_compare_string +! Contributed by G.Steinmetz + +program p + integer :: n + select case (n) + case ([1])! { dg-error "must be scalar" } + end select + select case (n) + case (:[2]) ! { dg-error "must be scalar" } + end select + select case (n) + case (['1']) ! { dg-error "must be scalar" } + end select + select case (n) + case (['1']:2)! { dg-error "must be scalar" } + end select + select case (n) + case(['1']:['2']) ! { dg-error "must be scalar" } + end select + select case (n) + case(1:['2']) ! { dg-error "must be scalar" } + end select +end -- 2.26.2
Re: [PATCH 1/2] libphobos: fix CET for non-glibc targets
> On 20/12/2021 16:41 Alex Xu (Hello71) wrote: > > > Excerpts from ibuc...@gdcproject.org's message of December 20, 2021 8:56 am: > >> On 20/12/2021 01:08 Alex Xu (Hello71) via Gcc-patches > >> wrote: > >> > >> > >> On musl, linking against libphobos fails because it requires ucontext > >> but is not explicitly linked against it. This is caused by configure > >> assuming that it is implemented in assembly, but it is actually not > >> implemented. This silently works on other libcs because context API does > >> not require an external library. > > > > Thanks. > > > > Looks reasonable to me, also for backporting to gcc-11, which also has the > > same CET support. > > > > Iain. > > > > Yes, we noticed this first on gcc 11. I tested that these patches fix > the issue on gcc 11, and since nothing seems to have changed, I think > the same problem exists and will be fixed by these patches on master. > Do you need me to commit this? Iain.
Re: [PATCH] c-family: Have -Wformat-diag accept "decl-specifier" [PR103758]
On Mon, Dec 20, 2021 at 03:21:04PM -0500, Jason Merrill via Gcc-patches wrote: > > @@ -3215,6 +3215,14 @@ check_tokens (const token_t *tokens, unsigned ntoks, > > plural = "s"; > > } > > + /* As an exception, don't warn about "decl-specifier*" since > > +it's a C++ grammar production. */ > > + { > > + const size_t l = strlen ("decl-specifier"); > > + if (!strncmp (format_chars, "decl-specifier", l)) > > + return format_chars + l - 1; > > + } > > I'd prefer to avoid repeating the string literal, but OK. We have the startswith inline function that allows to avoid the repetition. It wouldn't work in the above case due to the return format_chars + l - 1, but that in itself looks quite weird to me, I'd think better would be continue; instead of the return ...;, i.e. whenever we see decl-specifier in the text pretend we haven't seen decl. So /* As an exception, don't warn about "decl-specifier*" since it's a C++ grammar production. */ if (startswith (format_chars, "decl-specifier")) continue; ? Or perhaps even optimize and don't compare uselessly everything with "decl-specifier" and do it only for the "decl" badword if it matches, so if (badwords[i].name[0] == 'd' && startswith (format_chars, "decl-specifier")) continue; ? Jakub
Re: [PATCH] ix86: Don't match the 'm' constraint on x86_64_general_operand
On Mon, Dec 20, 2021 at 11:44:08AM -0800, H.J. Lu wrote: > The problem is in > > (define_memory_constraint "TARGET_MEM_CONSTRAINT" > "Matches any valid memory." > (and (match_code "mem") >(match_test "memory_address_addr_space_p (GET_MODE (op), XEXP (op, 0), > MEM_ADDR_SPACE (op))"))) > > define_register_constraint allows LRA to convert the operand to the form > '(mem (reg X))', where X is a base register. I am testing the v2 patch with If you mean replacing an immediate with a MEM containing that immediate, isn't that often the right thing though? I mean, if the register pressure is high and options are either spill some register, set it to immediate, use it in one instruction and then fill the spilled register (i.e. 2 memory loads), compared to a MEM use on the arithmetic instruction one MEM seems cheaper to me. With -fPIC and the cst needing runtime relocation slightly less so of course. The code due to ivopts is trying to have something like size_t a = (size_t) _list; size_t b = 0xffa8 - a; size_t c = x + b; and for that cst - one needs actually 2 registers, one to hold the constant and one to hold the (%rip) based address. (insn 790 789 791 111 (set (reg:DI 292) (const_int -88 [0xffa8])) "dl-tunables.c":304:15 76 {*movdi_internal} (nil)) (insn 791 790 792 111 (set (reg:DI 293) (symbol_ref:DI ("tunable_list") [flags 0x2] )) "dl-tunables.c":304:15 76 {*movdi_internal} (nil)) (insn 792 791 793 111 (parallel [ (set (reg:DI 291) (minus:DI (reg:DI 292) (reg:DI 293))) (clobber (reg:CC 17 flags)) ]) "dl-tunables.c":304:15 299 {*subdi_1} (nil)) (insn 793 792 794 111 (parallel [ (set (reg:DI 294) (plus:DI (reg:DI 291) (reg:DI 198 [ ivtmp.176 ]))) (clobber (reg:CC 17 flags)) ]) "dl-tunables.c":304:15 226 {*adddi_1} (nil)) It would be smarter to rewrite the above into a lea 88+tunable_list(%rip), %temp1 and use a subtraction instead of addition in the last insn above, or of course in the particular case even consider the following 2 instructions that do: (insn 794 793 795 111 (set (reg:DI 296) (symbol_ref:DI ("tunable_list") [flags 0x2] )) "dl-tunables.c":304:15 76 {*movdi_internal} (nil)) (insn 795 794 796 111 (parallel [ (set (reg:DI 295 [ cur ]) (plus:DI (reg:DI 294) (reg:DI 296))) (clobber (reg:CC 17 flags)) ]) "dl-tunables.c":304:15 226 {*adddi_1} (nil)) and find out that _list - _list is 0 and we don't need it at all. Guess we don't figure that out due to the cast of one of those addresses to size_t and the other one used in POINTER_PLUS_EXPR as normal pointer. Jakub
Re: [PATCH] c-family: Have -Wformat-diag accept "decl-specifier" [PR103758]
On 12/17/21 17:59, Marek Polacek wrote: I'm tired of seeing cp/parser.c:15923:55: warning: misspelled term 'decl' in format; use 'declaration' instead [-Wformat-diag] cp/parser.c:15925:57: warning: misspelled term 'decl' in format; use 'declaration' instead [-Wformat-diag] every time I compile cp/parser.c, which happens...a lot. I'd like my compilation to be free of warnings, otherwise I'm going to miss some important ones. "decl-specifiers" is a C++ grammar term; it is not actual code, so should not be wrapped with %< %>. I hope we can accept it as an exception in check_tokens. It was surrounded by %< %> in cp_parser_decl_specifier_seq, so fix that. In passing, fix a misspelling in missspellings. Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk? In fact, I think I'd like to backport to 11 too, so that eventually even my system compiler stops warning about this. PR c++/103758 gcc/c-family/ChangeLog: * c-format.c (check_tokens): Accept "decl-specifier*". gcc/cp/ChangeLog: * parser.c (cp_parser_decl_specifier_seq): Replace % with %qD. gcc/testsuite/ChangeLog: * g++.dg/cpp0x/constexpr-condition.C: Adjust dg-error. --- gcc/c-family/c-format.c | 10 +- gcc/cp/parser.c | 2 +- gcc/testsuite/g++.dg/cpp0x/constexpr-condition.C | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/gcc/c-family/c-format.c b/gcc/c-family/c-format.c index 617fb5ea626..e1b1d6ba31e 100644 --- a/gcc/c-family/c-format.c +++ b/gcc/c-family/c-format.c @@ -3194,7 +3194,7 @@ check_tokens (const token_t *tokens, unsigned ntoks, wlen, format_chars); else { - /* Diagnose some common missspellings. */ + /* Diagnose some common misspellings. */ for (unsigned i = 0; i != sizeof badwords / sizeof *badwords; ++i) { unsigned badwlen = strspn (badwords[i].name, " -"); @@ -3215,6 +3215,14 @@ check_tokens (const token_t *tokens, unsigned ntoks, plural = "s"; } + /* As an exception, don't warn about "decl-specifier*" since +it's a C++ grammar production. */ + { + const size_t l = strlen ("decl-specifier"); + if (!strncmp (format_chars, "decl-specifier", l)) + return format_chars + l - 1; + } I'd prefer to avoid repeating the string literal, but OK. format_warning_substr (format_string_loc, format_string_cst, fmtchrpos, fmtchrpos + badwords[i].len, opt, diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 44eed7ea638..3b33ae0cc21 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -15820,7 +15820,7 @@ cp_parser_decl_specifier_seq (cp_parser* parser, if (found_decl_spec && (flags & CP_PARSER_FLAGS_ONLY_TYPE_OR_CONSTEXPR) && token->keyword != RID_CONSTEXPR) - error ("% invalid in condition"); + error ("%qD invalid in condition", ridpointers[token->keyword]); if (found_decl_spec && (flags & CP_PARSER_FLAGS_ONLY_MUTABLE_OR_CONSTEXPR) diff --git a/gcc/testsuite/g++.dg/cpp0x/constexpr-condition.C b/gcc/testsuite/g++.dg/cpp0x/constexpr-condition.C index 733d494c4d7..e81acba68ae 100644 --- a/gcc/testsuite/g++.dg/cpp0x/constexpr-condition.C +++ b/gcc/testsuite/g++.dg/cpp0x/constexpr-condition.C @@ -5,5 +5,5 @@ constexpr int something() { return 3; } int main() { if (constexpr long v = something()) {} - if (static long v = something()) { } // { dg-error "'decl-specifier' invalid" } + if (static long v = something()) { } // { dg-error "'static' invalid" } } base-commit: d7ca2a79b82c6500ead6ab983d14c609e2124eee
[PATCH] i386: Fix _pinsr and its splitters [PR103772]
The clever trick to duplicate the value of the input operand into itself proved not so clever after all. The splitter should not clobber the input operand in any case, since the register can hold the value outside the HImode lowpart when accessed as subreg. Use the standard earlyclobber approach instead. The testcase fails with avx2 ISA, but I was not able to create the testcase that wouldn't require -mavx512fp16 compile flag. 2021-12-20 Uroš Bizjak gcc/ChangeLog: PR target/103772 * config/i386/sse.md (_pinsr): Add earlyclobber to (x,x,x,i) alternative. (_pinsr peephole2): Remove. (_pinsr splitter): Use output operand as a temporary register. Split after reload_completed. Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}. Pushed to master. Uros. diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5196149ee32..cb1c0b1edec 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -17430,7 +17430,7 @@ ;; sse4_1_pinsrd must come before sse2_loadld since it is preferred. (define_insn "_pinsr" - [(set (match_operand:PINSR_MODE 0 "register_operand" "=x,x,x,x,v,v,x") + [(set (match_operand:PINSR_MODE 0 "register_operand" "=x,x,x,x,v,v,") (vec_merge:PINSR_MODE (vec_duplicate:PINSR_MODE (match_operand: 2 "nonimmediate_operand" "r,m,r,m,r,m,x")) @@ -17499,25 +17499,6 @@ (const_string "*")))]) ;; For TARGET_AVX2, implement insert from XMM reg with PBROADCASTW + PBLENDW. -;; First try to get a scratch register and go through it. In case this fails, -;; overwrite source reg with broadcasted value and blend from there. -(define_peephole2 - [(match_scratch:V8_128 4 "x") - (set (match_operand:V8_128 0 "sse_reg_operand") - (vec_merge:V8_128 - (vec_duplicate:V8_128 - (match_operand: 2 "sse_reg_operand")) - (match_operand:V8_128 1 "sse_reg_operand") - (match_operand:SI 3 "const_int_operand")))] - "TARGET_AVX2 - && INTVAL (operands[3]) > 1 - && ((unsigned) exact_log2 (INTVAL (operands[3])) - < GET_MODE_NUNITS (mode))" - [(set (match_dup 4) - (vec_duplicate:V8_128 (match_dup 2))) - (set (match_dup 0) - (vec_merge:V8_128 (match_dup 4) (match_dup 1) (match_dup 3)))]) - (define_split [(set (match_operand:V8_128 0 "sse_reg_operand") (vec_merge:V8_128 @@ -17525,18 +17506,14 @@ (match_operand: 2 "sse_reg_operand")) (match_operand:V8_128 1 "sse_reg_operand") (match_operand:SI 3 "const_int_operand")))] - "TARGET_AVX2 && epilogue_completed + "TARGET_AVX2 && reload_completed && INTVAL (operands[3]) > 1 && ((unsigned) exact_log2 (INTVAL (operands[3])) < GET_MODE_NUNITS (mode))" - [(set (match_dup 4) + [(set (match_dup 0) (vec_duplicate:V8_128 (match_dup 2))) (set (match_dup 0) - (vec_merge:V8_128 (match_dup 4) (match_dup 1) (match_dup 3)))] -{ - operands[4] = lowpart_subreg (mode, operands[2], - mode); -}) + (vec_merge:V8_128 (match_dup 0) (match_dup 1) (match_dup 3)))]) (define_expand "_vinsert_mask" [(match_operand:AVX512_VEC 0 "register_operand")
Re: [PATCH] [gfortran] Add support for allocate clause (OpenMP 5.0).
On Thu, Nov 18, 2021 at 07:30:36PM +, Hafiz Abid Qadeer wrote: > + if (gfc_match (" : ") != MATCH_YES) > + { > + /* If no ":" then there is no allocator, we backtrack > + and read the variable list. */ > + gfc_free_expr (allocator); > + allocator = NULL; > + gfc_current_locus = old_loc; > + } Ok, no leak above. > + > + gfc_omp_namelist **head = NULL; > + m = gfc_match_omp_variable_list ("", >lists[OMP_LIST_ALLOCATE], > +false, NULL, ); > + > + if (m == MATCH_ERROR) > + break; But here it leaks. Just call gfc_free_expr (allocator); before break. > + > + gfc_omp_namelist *n; > + for (n = *head; n; n = n->next) > + if (allocator) > + n->expr = gfc_copy_expr (allocator); > + else > + n->expr = NULL; > + gfc_free_expr (allocator); > + continue; > + } > if ((mask & OMP_CLAUSE_AT) > && (m = gfc_match_dupl_check (c->at == OMP_AT_UNSET, "at", true)) >!= MATCH_NO) > + if (omp_clauses->lists[OMP_LIST_ALLOCATE]) > +{ > + for (n = omp_clauses->lists[OMP_LIST_ALLOCATE]; n; n = n->next) > + if (n->expr && (n->expr->ts.type != BT_INTEGER > + || n->expr->ts.kind != gfc_c_intptr_kind)) > + { > + gfc_error ("Expected integer expression of the " > + "'omp_allocator_handle_kind' kind at %L", >expr->where); Formatting, "' should be indented below "Expected > + break; > + } > + > + /* Check for 2 things here. > + 1. There is no duplication of variable in allocate clause. > + 2. Variable in allocate clause are also present in some > + privatization clase (non-composite case). */ > + for (n = omp_clauses->lists[OMP_LIST_ALLOCATE]; n; n = n->next) > + n->sym->omp_allocate_clause = 0; > + > + gfc_omp_namelist *prev = NULL; > + for (n = omp_clauses->lists[OMP_LIST_ALLOCATE]; n;) > + { > + if (n->sym->omp_allocate_clause == 1) > + { > + gfc_warning (0, "%qs appears more than once in % " > +"clauses at %L" , n->sym->name, >where); > + /* We have already seen this variable so it is a duplicate. > + Remove it. */ > + if (prev != NULL && prev->next == n) > + { > + prev->next = n->next; > + n->next = NULL; > + gfc_free_omp_namelist (n, 0); > + n = prev->next; > + } > + continue; > + } > + n->sym->omp_allocate_clause = 1; > + prev = n; > + n = n->next; > + } > + > + /* non-composite constructs. */ > + if (code && code->op < EXEC_OMP_DO_SIMD) > + { > + for (list = 0; list < OMP_LIST_NUM; list++) > + switch (list) > + { > + case OMP_LIST_PRIVATE: > + case OMP_LIST_FIRSTPRIVATE: > + case OMP_LIST_LASTPRIVATE: > + case OMP_LIST_REDUCTION: > + case OMP_LIST_REDUCTION_INSCAN: > + case OMP_LIST_REDUCTION_TASK: > + case OMP_LIST_IN_REDUCTION: > + case OMP_LIST_TASK_REDUCTION: > + case OMP_LIST_LINEAR: > + for (n = omp_clauses->lists[list]; n; n = n->next) > + n->sym->omp_allocate_clause = 0; > + break; > + default: > + break; > + } > + > + for (n = omp_clauses->lists[OMP_LIST_ALLOCATE]; n; n = n->next) > + if (n->sym->omp_allocate_clause == 1) > + gfc_error ("%qs specified in 'allocate' clause at %L but not " > + "in an explicit privatization clause", > + n->sym->name, >where); > + } > +} Do you really need a new omp_allocate_clause bit? From what I can see, other code uses n->sym->mark for such purposes (temporarily marking some symbols). Also, I think allocate clause like the privatization clauses should allow common blocks and I see code that uses the marks uses something like: for (list = OMP_LIST_TO; list != OMP_LIST_NUM; list = (list == OMP_LIST_TO ? OMP_LIST_LINK : OMP_LIST_NUM)) for (n = c->lists[list]; n; n = n->next) if (n->sym) n->sym->mark = 0; else if (n->u.common->head) n->u.common->head->mark = 0; So, a question is if the above won't just crash if I specify firstprivate(/foobar/) allocate(/foobar/) etc. > + case OMP_LIST_ALLOCATE: > + for (; n != NULL; n = n->next) > + if (n->sym->attr.referenced || declare_simd) !$omp declare simd doesn't allow allocate clause, so why the above " || declare_simd"? > + { > + tree t = gfc_trans_omp_variable (n->sym, declare_simd); And not , false); above? Jakub
Re: [PATCH] ix86: Don't match the 'm' constraint on x86_64_general_operand
On Mon, Dec 20, 2021 at 6:53 AM Jakub Jelinek wrote: > > On Sun, Dec 19, 2021 at 12:06:30PM -0800, H.J. Lu via Gcc-patches wrote: > > --- a/gcc/config/i386/predicates.md > > +++ b/gcc/config/i386/predicates.md > > @@ -1199,6 +1199,12 @@ > >(and (match_operand 0 "memory_operand") > > (not (match_test "x86_extended_reg_mentioned_p (op)" > > > > +;; Return true if OP is a memory operand representable on ix86. > > +(define_predicate "ix86_memory_operand" > > + (and (match_operand 0 "memory_operand") > > + (ior (match_test "mode == QImode || mode == HImode") > > + (match_operand 0 "x86_64_general_operand" > > I must be missing something, but how can this work? > x86_64_general_operand is: > (define_predicate "x86_64_general_operand" > (if_then_else (match_test "TARGET_64BIT") > (ior (match_operand 0 "nonimmediate_operand") > (match_operand 0 "x86_64_immediate_operand")) > (match_operand 0 "general_operand"))) > and so for non-immediates like MEMs it is just a normal memory_operand. > > Jakub > The problem is in (define_memory_constraint "TARGET_MEM_CONSTRAINT" "Matches any valid memory." (and (match_code "mem") (match_test "memory_address_addr_space_p (GET_MODE (op), XEXP (op, 0), MEM_ADDR_SPACE (op))"))) define_register_constraint allows LRA to convert the operand to the form '(mem (reg X))', where X is a base register. I am testing the v2 patch with ;; NB: Similar to 'm', but don't use define_memory_constraint on x86-64 ;; to prevent LRA from converting the operand to the form '(mem (reg X))' ;; where X is a base register. (define_constraint "BM" "@internal x86-64 memory operand." (and (match_code "mem") (match_test "memory_address_addr_space_p (GET_MODE (op), XEXP (op, 0), MEM_ADDR_SPACE (op))"))) ;; Memory operand constraint for word modes. (define_mode_attr m [(QI "m") (HI "m") (SI "BM") (DI "BM")]) -- H.J.
Re: [PATCH] Use enclosing object size if it's smaller than member [PR 101475]
On 12/16/2021 12:56 PM, Martin Sebor via Gcc-patches wrote: Enabling vectorization at -O2 caused quite a few tests for warnings to start failing in GCC 12. These tests were xfailed and bugs were opened to track the problems until they can be fully analyzed and ultimately fixed before GCC 12 is released. I've now started going through these and the first such bug I tackled is PR 102944. As it turns out, the xfails there are all due to a known limitation tracked in PR 101475: when determining the size of a destination for A COMPONENT_REF, unless asked for the size of the complete object, compute_objsize() only considers the size of the referenced member, even when the member is larger than the object would allow. This prevents warnings from diagnosing unvectorized past-the-end accesses to objects in backing buffers (such as in character arrays or allocated chunks of memory). Many (though not all) accesses that are vectorized are diagnosed because there the COMPONENT_REF is replaced by a MEM_REF. But because vectorization depends on target-specific things like alignment requirements, what is and isn't diagnosed also tends to be target-specific, making these tests quite brittle.. The attached patch corrects this oversight by using the complete object's size instead of the member when the former is smaller. Besides improving the out-of-bounds access detection it also makes the tests behave more consistently across targets. Tested on x86_64-linux and by building Glibc and verifying that the change triggers no new warnings. I must be missing something here. How can the enclosing object be smaller than a member? Jeff
Re: [PATCH] c++: Avoid narrowing in make_char_string_pack
On 12/17/21 17:58, Marek Polacek wrote: This fixes gcc/cp/parser.c:4618:41: warning: narrowing conversion of '(char)(*(str + ((sizetype)i)))' from 'char' to 'unsigned char' [-Wnarrowing] 4618 | unsigned char s[3] = { '\'', str[i], '\'' }; |~^ Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk? Hmm, odd that STRING_CST and cpp_string differ in the use of unsigned char. The patch is OK> gcc/cp/ChangeLog: * parser.c (make_char_string_pack): Add a cast to const unsigned char *. --- gcc/cp/parser.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index 44eed7ea638..56232ab029f 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -4607,7 +4607,8 @@ make_char_string_pack (tree value) { tree charvec; tree argpack = make_node (NONTYPE_ARGUMENT_PACK); - const char *str = TREE_STRING_POINTER (value); + const unsigned char *str += (const unsigned char *) TREE_STRING_POINTER (value); int i, len = TREE_STRING_LENGTH (value) - 1; tree argvec = make_tree_vec (1); base-commit: d7ca2a79b82c6500ead6ab983d14c609e2124eee
Re: [PATCH] c++: memfn lookup consistency in incomplete class context
On 12/20/21 12:46, Patrick Palka wrote: When instantiating a call to a member function of a class template, we repeat the member function lookup in order to obtain the corresponding partially instantiated functions. Within an incomplete-class context however, we need to be more careful when repeating the lookup because we don't want to introduce later-declared member functions that weren't visible at template definition time. We're currently not careful enough in this respect, which causes us to reject memfn1.C below. This patch fixes this issue by making tsubst_baselink filter out from the instantiation-time lookup those member functions that were invisible at template definition time. This is really only necessary within an incomplete-class context, so this patch adds a heuristic flag to BASELINK to help us avoid needlessly performing this filtering step (which would be a no-op) in complete-class contexts. This is also necessary in order for the ahead-of-time pruning implemented in r12-6075 to be effective for member functions within class templates. Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for trunk? gcc/cp/ChangeLog: * call.c (build_new_method_call): Set BASELINK_FUNCTIONS_MAYBE_INCOMPLETE_P on the pruned baselink. * cp-tree.h (BASELINK_FUNCTIONS_MAYBE_INCOMPLETE_P): Define. * pt.c (filter_memfn_lookup): New subroutine of tsubst_baselink. (tsubst_baselink): Use filter_memfn_lookup on the new lookup result when BASELINK_FUNCTIONS_MAYBE_INCOMPLETE_P is set on the old baselink. Remove redundant BASELINK_P check. * search.c (build_baselink): Set BASELINK_FUNCTIONS_MAYBE_INCOMPLETE_P appropriately. gcc/testsuite/ChangeLog: * g++.dg/lookup/memfn1.C: New test. * g++.dg/template/non-dependent16b.C: New test. --- gcc/cp/call.c | 1 + gcc/cp/cp-tree.h | 5 + gcc/cp/pt.c | 91 ++- gcc/cp/search.c | 4 + gcc/testsuite/g++.dg/lookup/memfn1.C | 16 .../g++.dg/template/non-dependent16b.C| 37 6 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/g++.dg/lookup/memfn1.C create mode 100644 gcc/testsuite/g++.dg/template/non-dependent16b.C diff --git a/gcc/cp/call.c b/gcc/cp/call.c index 1fbfc580a1e..bee367f57d7 100644 --- a/gcc/cp/call.c +++ b/gcc/cp/call.c @@ -11187,6 +11187,7 @@ build_new_method_call (tree instance, tree fns, vec **args, } orig_fns = copy_node (orig_fns); BASELINK_FUNCTIONS (orig_fns) = fn; + BASELINK_FUNCTIONS_MAYBE_INCOMPLETE_P (orig_fns) = true; } skip_prune: diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h index 8b5cfa230ba..4f175f491cb 100644 --- a/gcc/cp/cp-tree.h +++ b/gcc/cp/cp-tree.h @@ -464,6 +464,7 @@ extern GTY(()) tree cp_global_trees[CPTI_MAX]; PACK_EXPANSION_SIZEOF_P (in *_PACK_EXPANSION) OVL_USING_P (in OVERLOAD) IMPLICIT_CONV_EXPR_NONTYPE_ARG (in IMPLICIT_CONV_EXPR) + BASELINK_FUNCTIONS_MAYBE_INCOMPLETE_P (in BASELINK) 2: IDENTIFIER_KIND_BIT_2 (in IDENTIFIER_NODE) ICS_THIS_FLAG (in _CONV) DECL_INITIALIZED_BY_CONSTANT_EXPRESSION_P (in VAR_DECL) @@ -1060,6 +1061,10 @@ struct GTY(()) tree_template_decl { /* Nonzero if this baselink was from a qualified lookup. */ #define BASELINK_QUALIFIED_P(NODE) \ TREE_LANG_FLAG_0 (BASELINK_CHECK (NODE)) +/* Nonzero if the overload set for this baselink might be incomplete due + to the lookup being performed from an incomplete class context. */ +#define BASELINK_FUNCTIONS_MAYBE_INCOMPLETE_P(NODE) \ + TREE_LANG_FLAG_1 (BASELINK_CHECK (NODE)) struct GTY(()) tree_baselink { struct tree_common common; diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c index 4f0ae6d5851..7293f4e7b12 100644 --- a/gcc/cp/pt.c +++ b/gcc/cp/pt.c @@ -16221,6 +16221,81 @@ tsubst (tree t, tree args, tsubst_flags_t complain, tree in_decl) } } +/* OLDFNS is a lookup set of member functions from some class template, and + NEWFNS is a lookup set of member functions from a specialization of that + class template. Return the subset of NEWFNS which are specializations of + some function from OLDFNS. */ + +static tree +filter_memfn_lookup (tree oldfns, tree newfns) +{ + /* Record all member functions from the old lookup set OLDFNS into + VISIBLE_SET. */ + hash_set visible_set; + for (tree fn : lkp_range (oldfns)) +{ + if (TREE_CODE (fn) == USING_DECL) + { + /* FIXME: Punt on (dependent) USING_DECL for now; mapping +a dependent USING_DECL to its instantiation seems +tricky. */ I wouldn't expect it to be that bad; the instantiation would be any functions from bases of the class where the USING_DECL (and the lookup) occurs. But that can be a follow-up; the
[committed] d: Merge upstream dmd ad8412530, druntime fd9a4544, phobos 495e835c2.
Hi, This patch merges the D front-end with upstream dmd ad8412530, and the run-time libraries with upstream druntime fd9a4544 and phobos 495e835c2. D front-end changes: - Import dmd v2.098.1 - Remove calling of _d_delstruct from code generator. Druntime changes: - Import druntime v2.098.1 Phobos changes: - Import phobos v2.098.1 Bootstrapped and regression tested on x86_64-linux-gnu/-m32/-mx32, and committed to mainline. Regards, Iain. --- gcc/d/ChangeLog: * dmd/MERGE: Merge upstream dmd ad8412530. * expr.cc (ExprVisitor::visit (DeleteExp *)): Remove code generation of _d_delstruct. * runtime.def (DELSTRUCT): Remove. libphobos/ChangeLog: * libdruntime/MERGE: Merge upstream druntime fd9a4544. * src/MERGE: Merge upstream phobos 495e835c2. --- gcc/d/dmd/MERGE | 2 +- gcc/d/dmd/canthrow.d | 16 ++ gcc/d/dmd/dcast.d | 73 +++ gcc/d/dmd/dinterpret.d| 41 gcc/d/dmd/dsymbol.d | 16 +- gcc/d/dmd/dsymbolsem.d| 24 ++- gcc/d/dmd/expressionsem.d | 26 ++- gcc/d/dmd/id.d| 3 + gcc/d/dmd/initsem.d | 106 ++- gcc/d/dmd/nogc.d | 14 ++ gcc/d/dmd/semantic3.d | 3 +- gcc/d/dmd/tokens.d| 26 +-- gcc/d/dmd/tokens.h| 11 +- gcc/d/dmd/typesem.d | 178 +- gcc/d/expr.cc | 12 +- gcc/d/runtime.def | 2 - gcc/testsuite/gdc.test/compilable/test22593.d | 13 ++ .../gdc.test/fail_compilation/ice17074.d | 12 +- .../gdc.test/fail_compilation/test22593.d | 23 +++ libphobos/libdruntime/MERGE | 2 +- libphobos/libdruntime/core/builtins.d | 48 - libphobos/libdruntime/core/lifetime.d | 20 +- libphobos/libdruntime/core/sys/linux/sched.d | 3 + libphobos/libdruntime/object.d| 30 ++- libphobos/src/MERGE | 2 +- libphobos/src/std/format/write.d | 23 +++ libphobos/src/std/range/interfaces.d | 9 + libphobos/src/std/typecons.d | 2 +- 28 files changed, 468 insertions(+), 272 deletions(-) create mode 100644 gcc/testsuite/gdc.test/compilable/test22593.d create mode 100644 gcc/testsuite/gdc.test/fail_compilation/test22593.d diff --git a/gcc/d/dmd/MERGE b/gcc/d/dmd/MERGE index d7eff4ffd2f..b42576c2ce6 100644 --- a/gcc/d/dmd/MERGE +++ b/gcc/d/dmd/MERGE @@ -1,4 +1,4 @@ -93108bb9ea6216d67fa97bb4842fb59f26f6bfc7 +ad8412530e607ffebec36f2dbdff1a6f2798faf7 The first line of this file holds the git revision number of the last merge done from the dlang/dmd repository. diff --git a/gcc/d/dmd/canthrow.d b/gcc/d/dmd/canthrow.d index b67a9d14dd4..b1877151c88 100644 --- a/gcc/d/dmd/canthrow.d +++ b/gcc/d/dmd/canthrow.d @@ -82,6 +82,22 @@ extern (C++) bool canThrow(Expression e, FuncDeclaration func, bool mustNotThrow if (global.errors && !ce.e1.type) return; // error recovery + +import dmd.id : Id; + +if (ce.f && ce.f.ident == Id._d_delstruct) +{ +// Only check if the dtor throws. +Type tb = (*ce.arguments)[0].type.toBasetype(); +auto ts = tb.nextOf().baseElemOf().isTypeStruct(); +if (ts) +{ +auto sd = ts.sym; +if (sd.dtor) +checkFuncThrows(ce, sd.dtor); +} +} + /* If calling a function or delegate that is typed as nothrow, * then this expression cannot throw. * Note that pure functions can throw. diff --git a/gcc/d/dmd/dcast.d b/gcc/d/dmd/dcast.d index 2e5a79d0feb..a572a1ff6e0 100644 --- a/gcc/d/dmd/dcast.d +++ b/gcc/d/dmd/dcast.d @@ -1565,9 +1565,9 @@ Expression castTo(Expression e, Scope* sc, Type t, Type att = null) result = e; return; } -if (e.op == EXP.variable) +if (auto ve = e.isVarExp()) { -VarDeclaration v = (cast(VarExp)e).var.isVarDeclaration(); +VarDeclaration v = ve.var.isVarDeclaration(); if (v && v.storage_class & STC.manifest) { result = e.ctfeInterpret(); @@ -1852,8 +1852,8 @@ Expression castTo(Expression e, Scope* sc, Type t, Type att = null) override void visit(StructLiteralExp e) { visit(cast(Expression)e); -if (result.op == EXP.structLiteral) -(cast(StructLiteralExp)result).stype = t; // commit type +if (auto sle =
Re: [gcc r12-6020] Fixed typo
Hi Martin, > On 12/16/21 19:43, Joseph Myers wrote: >> On Thu, 16 Dec 2021, Martin Liška wrote: >> >>> Hello. >>> >>> Oh, sorry, it was me and I forgot to send the patch to the mailing list. >>> I've basically taken it as: https://github.com/gcc-mirror/gcc/pull/57. >>> >>> Should I revert the change? >> Updating both config.sub and config.guess to the latest config.git >> versions (2021-10-27 and 2021-11-30) is probably the right thing to do >> now. > > All right, I've reverted the original patch and I'm going to do sync > from upstream. please make config.sub executable again. This has been lost with the update. Rainer -- - Rainer Orth, Center for Biotechnology, Bielefeld University
Re: [PATCH] rs6000: Replace UNSPECS with ss_plus/us_plus and ss_minus/us_minus
On Mon, Dec 20, 2021 at 3:24 AM Xionghu Luo wrote: > > These four UNSPECS seems could be replaced with native RTL, and why > "(set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))" > in the RTL pattern, per ISA of VSCR bit 127(VECTOR Saturation, SAT): > > This bit is sticky; that is, once set to 1 it > remains set to 1 until it is set to 0 by an > mtvscr instruction. > > The RTL pattern set it to 0 but final ASM doesn't present it? And why > not use Clobber VSCR_REGNO instead? The design came from the early implementation of Altivec: https://gcc.gnu.org/pipermail/gcc-patches/2002-May/077409.html If one later checks for saturation (reads VSCR), one needs a corresponding SET of the value. It's set in an architecture-specific manner that isn't described to GCC, but it's set, not just clobbered and in an undefined state. The RTL does not describe that VSCR is set to the value 0. The (const_int 0) is not the value set. You can think of the (const_int 0) as a dummy RTL argument to the VSCR UNSPEC. UNSPEC requires at least one argument and the pattern doesn't try to express the argument, so it uses a dummy RTL constant. It's part of a PARALLEL and the plus or minus already expresses the data dependency of the pattern on the input operands. I'm unsure of the meaning of your question "final ASM doesn't present it". The operation on VSCR is implicit and not emitted as an instruction. It's in a PARALLEL, which means that the single Altivec instruction has both effects. Is that what you were asking? > Tested pass on P10, OK for master? This patch is okay. Thanks for updating the machine description and for cleaning up the formatting. > Thanks. > > gcc/ChangeLog: > > * config/rs6000/altivec.md (altivec_vaddus): Replace > UNSPEC_VADDU with us_plus. > (altivec_vaddss): Replace UNSPEC_VADDS with ss_plus. > (altivec_vsubus): Replace UNSPEC_VSUBU with us_minus. > (altivec_vsubss): Replace UNSPEC_VSUBS with ss_minus. > (altivec_abss_): Likewise. > --- > gcc/config/rs6000/altivec.md | 29 ++--- > 1 file changed, 10 insertions(+), 19 deletions(-) > > diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md > index a057218aa28..b2909857c34 100644 > --- a/gcc/config/rs6000/altivec.md > +++ b/gcc/config/rs6000/altivec.md > @@ -29,8 +29,6 @@ (define_c_enum "unspec" > UNSPEC_VMHADDSHS > UNSPEC_VMHRADDSHS > UNSPEC_VADDCUW > - UNSPEC_VADDU > - UNSPEC_VADDS > UNSPEC_VAVGU > UNSPEC_VAVGS > UNSPEC_VMULEUB > @@ -61,8 +59,6 @@ (define_c_enum "unspec" > UNSPEC_VSR > UNSPEC_VSRO > UNSPEC_VSUBCUW > - UNSPEC_VSUBU > - UNSPEC_VSUBS > UNSPEC_VSUM4UBS > UNSPEC_VSUM4S > UNSPEC_VSUM2SWS > @@ -517,9 +513,8 @@ (define_insn "altivec_vaddcuw" > > (define_insn "altivec_vaddus" >[(set (match_operand:VI 0 "register_operand" "=v") > -(unspec:VI [(match_operand:VI 1 "register_operand" "v") > - (match_operand:VI 2 "register_operand" "v")] > - UNSPEC_VADDU)) > +(us_plus:VI (match_operand:VI 1 "register_operand" "v") > + (match_operand:VI 2 "register_operand" "v"))) > (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))] >"" >"vaddus %0,%1,%2" > @@ -527,9 +522,8 @@ (define_insn "altivec_vaddus" > > (define_insn "altivec_vaddss" >[(set (match_operand:VI 0 "register_operand" "=v") > -(unspec:VI [(match_operand:VI 1 "register_operand" "v") > -(match_operand:VI 2 "register_operand" "v")] > - UNSPEC_VADDS)) > +(ss_plus:VI (match_operand:VI 1 "register_operand" "v") > + (match_operand:VI 2 "register_operand" "v"))) > (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))] >"VECTOR_UNIT_ALTIVEC_P (mode)" >"vaddss %0,%1,%2" > @@ -563,9 +557,8 @@ (define_insn "altivec_vsubcuw" > > (define_insn "altivec_vsubus" >[(set (match_operand:VI 0 "register_operand" "=v") > -(unspec:VI [(match_operand:VI 1 "register_operand" "v") > -(match_operand:VI 2 "register_operand" "v")] > - UNSPEC_VSUBU)) > +(us_minus:VI (match_operand:VI 1 "register_operand" "v") > +(match_operand:VI 2 "register_operand" "v"))) > (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))] >"VECTOR_UNIT_ALTIVEC_P (mode)" >"vsubus %0,%1,%2" > @@ -573,9 +566,8 @@ (define_insn "altivec_vsubus" > > (define_insn "altivec_vsubss" >[(set (match_operand:VI 0 "register_operand" "=v") > -(unspec:VI [(match_operand:VI 1 "register_operand" "v") > -(match_operand:VI 2 "register_operand" "v")] > - UNSPEC_VSUBS)) > +(ss_minus:VI (match_operand:VI 1 "register_operand" "v") > +(match_operand:VI 2 "register_operand" "v"))) > (set (reg:SI VSCR_REGNO)
Re: [PATCH] Adjust VxWorks fixincludes hack for mkdir to work for C++
> On 20 Dec 2021, at 17:31, Jeff Law wrote: > > Given how vxworks specific these fixinc bits are, I think they reasonably > fall under maintainership of vxworks bits. > > So I think you can/should self-approve :-) Thanks Jeff, Rasmus has provided useful feedback on some of the hunks and I'm still looking into some of the issues. I'll proceed with those on which we're aligned, and on the other ones when we have converged on a resolution. Cheers, Olivier
Re: [PATCH] Adjust VxWorks fixincludes hack for mkdir to work for C++
On 12/17/2021 4:55 AM, Olivier Hainque via Gcc-patches wrote: Hello, The attached patch adjusts a very old fixincludes hack for VxWorks, to expose a varargs function prototype for "mkdir" instead of a varargs macro (!). The function version is more amenable to calls from C++ like std::mkdir(arg1, arg2). This helps libstdc++ build failures for old versions of VxWorks, 6.9 in particular for which we were able to get a successful complete build after this and a couple of other fixincludes changes which I'll send separately. Also bootstrapped and regression tested ok for x86_64-linux, just in case. Ok to commit? Thanks in advance, With Kind Regards, Olivier 2021-12-16 Olivier Hainque fixincludes/ * inclhack.def (vxworks_posix_mkdir): Refine to expose a varargs interface. * tests/base/sys/stat.h: Update expected results. * fixinc.x: Regenerate. Given how vxworks specific these fixinc bits are, I think they reasonably fall under maintainership of vxworks bits. So I think you can/should self-approve :-) jeff
Re: [PATCH] Leverage sysroot for VxWorks
Hi Rasmus, > On 20 Dec 2021, at 12:40, Rasmus Villemoes wrote: > > On 10/12/2021 19.24, Olivier Hainque wrote: > >> For the toolchains we build, this is achieved with a few >> configure options like: >> >> --with-sysroot >> --with-build-sysroot=${WIND_BASE}/target > > So forward-porting our private patches up until > > 7bf710b5116 - libstdc++: Add support for '?' in linker script globs > > (i.e. the commit before this one) went without problems, with no changes > required in our build scripts. Then when rebasing to this one, now known as > > f3f923e5139 - Leverage sysroot for VxWorks > > the build broke as expected because I'd been using somewhat different > values of --with(-build)-sysroot. However, changing those configure > options as indicated above again produced a working toolchain, so this > is all good. Great ! >> --with-specs=%{!sysroot=*:--sysroot=%:getenv(WIND_BASE /target)} > > However, I'm a little confused about the purpose of this one. > First, > shouldn't this be '%{!-sysroot=*}', i.e. with a leading dash, since > the option is --sysroot ? But whether I use one or the other, it seems > that the resulting compiler ignores a --sysroot argument; if I > explicitly unexport WIND_BASE and manually add a --sysroot argument that > should have the same effect as above, gcc fails with WIND_BASE not defined: > The specs syntax doesn't explicitly list the negative form of %{S*:X}, > and I can't find any in-tree examples (though it's rather hard to grep > for), so I don't even know if this is supposed to work. I think it is, and it is used for example in: config/i386/i386.h: {"cpu", "%{!mtune=*:%{!mcpu=*:%{!march=*:-mtune=%(VALUE)}}}" }, \ > It's not a big deal, we can just continue to define and export > WIND_BASE, but it's probably best for long-term maintenance if our build > scripts and configure options are aligned with what you do on your end, > so I would just like to understand this fully. Sure, appreciated. The purpose was indeed to also prevent producing the default option if one is there already, and you are right that the form above doesn't work as expected on this account. I had made a couple of basic tests and thought it did, but must have made a mistake. Might have been skewed by an implicit conviction. The reason for which it doesn't work is the do_save = false in driver_handle_option for --sysroot: case OPT__sysroot_: target_system_root = arg; target_system_root_changed = 1; do_save = false; break; Changing that has the intended effect on our use case, with an additional dash indeed. I'll test further and propose the change if it doesn't raise unexpected problems. We could also even consider using DRIVER_SELF_SPECS then. Thanks for your feedback, Olivier
RE: [3/3 PATCH][AArch32] use canonical ordering for complex mul, fma and fms
Updated version of patch following AArch64 review. Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues. Ok for master? and backport along with the first patch? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/102819 PR tree-optimization/103169 * config/arm/vec-common.md (cml4): Use canonical order. --- inline copy of patch --- diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index e71d9b3811fde62159f5c21944fef9fe3f97b4bd..eab77ac8decce76d70f5b2594f4439e6ed363e6e 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -265,18 +265,18 @@ (define_expand "arm_vcmla" ;; remainder. Because of this, expand early. (define_expand "cml4" [(set (match_operand:VF 0 "register_operand") - (plus:VF (match_operand:VF 1 "register_operand") -(unspec:VF [(match_operand:VF 2 "register_operand") -(match_operand:VF 3 "register_operand")] - VCMLA_OP)))] + (plus:VF (unspec:VF [(match_operand:VF 1 "register_operand") +(match_operand:VF 2 "register_operand")] + VCMLA_OP) +(match_operand:VF 3 "register_operand")))] "(TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT && ARM_HAVE__ARITH)) && !BYTES_BIG_ENDIAN" { rtx tmp = gen_reg_rtx (mode); - emit_insn (gen_arm_vcmla (tmp, operands[1], -operands[3], operands[2])); + emit_insn (gen_arm_vcmla (tmp, operands[3], +operands[2], operands[1])); emit_insn (gen_arm_vcmla (operands[0], tmp, -operands[3], operands[2])); +operands[2], operands[1])); DONE; }) rb15165.patch Description: rb15165.patch
RE: [2/3 PATCH]AArch64 use canonical ordering for complex mul, fma and fms
> -Original Message- > From: Richard Sandiford > Sent: Friday, December 17, 2021 4:49 PM > To: Tamar Christina > Cc: gcc-patches@gcc.gnu.org; nd ; Richard Earnshaw > ; Marcus Shawcroft > ; Kyrylo Tkachov > Subject: Re: [2/3 PATCH]AArch64 use canonical ordering for complex mul, > fma and fms > > Richard Sandiford writes: > > Tamar Christina writes: > >> Hi All, > >> > >> After the first patch in the series this updates the optabs to expect > >> the canonical sequence. > >> > >> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > >> > >> Ok for master? and backport along with the first patch? > >> > >> Thanks, > >> Tamar > >> > >> gcc/ChangeLog: > >> > >>PR tree-optimization/102819 > >>PR tree-optimization/103169 > >>* config/aarch64/aarch64-simd.md > (cml4, > >>cmul3): Use canonical order. > >>* config/aarch64/aarch64-sve.md (cml4, > >>cmul3): Likewise. > >> > >> --- inline copy of patch -- > >> diff --git a/gcc/config/aarch64/aarch64-simd.md > >> b/gcc/config/aarch64/aarch64-simd.md > >> index > >> > f95a7e1d91c97c9e981d75e71f0b49c02ef748ba..875896ee71324712c8034eeff9 > c > >> fb5649f9b0e73 100644 > >> --- a/gcc/config/aarch64/aarch64-simd.md > >> +++ b/gcc/config/aarch64/aarch64-simd.md > >> @@ -556,17 +556,17 @@ (define_insn > "aarch64_fcmlaq_lane" > >> ;; remainder. Because of this, expand early. > >> (define_expand "cml4" > >>[(set (match_operand:VHSDF 0 "register_operand") > >> - (plus:VHSDF (match_operand:VHSDF 1 "register_operand") > >> - (unspec:VHSDF [(match_operand:VHSDF 2 > "register_operand") > >> - (match_operand:VHSDF 3 > "register_operand")] > >> - FCMLA_OP)))] > >> + (plus:VHSDF (unspec:VHSDF [(match_operand:VHSDF 1 > "register_operand") > >> + (match_operand:VHSDF 2 > "register_operand")] > >> + FCMLA_OP) > >> + (match_operand:VHSDF 3 "register_operand")))] > >>"TARGET_COMPLEX && !BYTES_BIG_ENDIAN" > >> { > >>rtx tmp = gen_reg_rtx (mode); > >> - emit_insn (gen_aarch64_fcmla (tmp, operands[1], > >> - operands[3], operands[2])); > >> + emit_insn (gen_aarch64_fcmla (tmp, operands[3], > >> + operands[1], operands[2])); > >>emit_insn (gen_aarch64_fcmla (operands[0], tmp, > >> - operands[3], operands[2])); > >> + operands[1], operands[2])); > >>DONE; > >> }) > >> > >> @@ -583,9 +583,9 @@ (define_expand "cmul3" > >>rtx tmp = force_reg (mode, CONST0_RTX (mode)); > >>rtx res1 = gen_reg_rtx (mode); > >>emit_insn (gen_aarch64_fcmla (res1, tmp, > >> - operands[2], operands[1])); > >> + operands[1], operands[2])); > >>emit_insn (gen_aarch64_fcmla (operands[0], res1, > >> - operands[2], operands[1])); > >> + operands[1], operands[2])); > > > > This doesn't look right. Going from the documentation, patch 1 isn't > > changing the operand order for CMUL: the conjugated operand (if there > > is one) is still operand 2. The FCMLA sequences use the opposite > > order, where the conjugated operand (if there is one) is operand 1. > > So I think > > I meant “the first multiplication operand” rather than “operand 1” here. > > > the reversal here is still needed. > > > > Same for the multiplication operands in CML* above. I did actually change the order in patch 1, but didn't update the docs.. That was done because I followed the SLP order again, but now I've updated them to do what the docs say. Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Ok for master? and backport along with the first patch? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/102819 PR tree-optimization/103169 * config/aarch64/aarch64-simd.md (cml4): Use canonical order. * config/aarch64/aarch64-sve.md (cml4): Likewise. --- inline copy of patch --- diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index f95a7e1d91c97c9e981d75e71f0b49c02ef748ba..9e41610fba85862ef7675bea1e5731b14cab59ce 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -556,17 +556,17 @@ (define_insn "aarch64_fcmlaq_lane" ;; remainder. Because of this, expand early. (define_expand "cml4" [(set (match_operand:VHSDF 0 "register_operand") - (plus:VHSDF (match_operand:VHSDF 1 "register_operand") - (unspec:VHSDF [(match_operand:VHSDF 2 "register_operand") - (match_operand:VHSDF 3 "register_operand")] - FCMLA_OP)))] + (plus:VHSDF (unspec:VHSDF [(match_operand:VHSDF 1
RE: [1/3 PATCH]middle-end vect: Simplify and extend the complex numbers validation routines.
> -Original Message- > From: Richard Sandiford > Sent: Friday, December 17, 2021 4:19 PM > To: Tamar Christina via Gcc-patches > Cc: Tamar Christina ; nd ; > rguent...@suse.de > Subject: Re: [1/3 PATCH]middle-end vect: Simplify and extend the complex > numbers validation routines. > > Just a comment on the documentation: > > Tamar Christina via Gcc-patches writes: > > diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index > > > 9ec051e94e10cca9eec2773e1b8c01b74b6ea4db..60dc5b3ea6087c2824ad1467 > bc66 > > e9cfebe9dcfc 100644 > > --- a/gcc/doc/md.texi > > +++ b/gcc/doc/md.texi > > @@ -6325,12 +6325,12 @@ Perform a vector multiply and accumulate that > > is semantically the same as a multiply and accumulate of complex numbers. > > > > @smallexample > > - complex TYPE c[N]; > > - complex TYPE a[N]; > > - complex TYPE b[N]; > > + complex TYPE op0[N]; > > + complex TYPE op1[N]; > > + complex TYPE op2[N]; > >for (int i = 0; i < N; i += 1) > > @{ > > - c[i] += a[i] * b[i]; > > + op2[i] += op1[i] * op2[i]; > > @} > > I think this should be: > > op0[i] = op1[i] * op2[i] + op3[i]; > > since operand 0 is the output and operand 3 is the accumulator input. > > Same idea for the others. For: > > > @@ -6415,12 +6415,12 @@ Perform a vector multiply that is semantically > > the same as multiply of complex numbers. > > > > @smallexample > > - complex TYPE c[N]; > > - complex TYPE a[N]; > > - complex TYPE b[N]; > > + complex TYPE op0[N]; > > + complex TYPE op1[N]; > > + complex TYPE op2[N]; > >for (int i = 0; i < N; i += 1) > > @{ > > - c[i] = a[i] * b[i]; > > + op2[i] = op0[i] * op1[i]; > > …this I think it should be: > > op0[i] = op1[i] * op2[i]; Updated patch attached. Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu and no regressions. Ok for master? and backport to GCC 11 after some stew? Thanks, Tamar gcc/ChangeLog: PR tree-optimization/102819 PR tree-optimization/103169 * doc/md.texi: Update docs for cfms, cfma. * tree-data-ref.h (same_data_refs): Accept optional offset. * tree-vect-slp-patterns.c (is_linear_load_p): Fix issue with repeating patterns. (vect_normalize_conj_loc): Remove. (is_eq_or_top): Change to take two nodes. (enum _conj_status, compatible_complex_nodes_p, vect_validate_multiplication): New. (class complex_add_pattern, complex_add_pattern::matches, complex_add_pattern::recognize, class complex_mul_pattern, complex_mul_pattern::recognize, class complex_fms_pattern, complex_fms_pattern::recognize, class complex_operations_pattern, complex_operations_pattern::recognize, addsub_pattern::recognize): Pass new cache. (complex_fms_pattern::matches, complex_mul_pattern::matches): Pass new cache and use new validation code. * tree-vect-slp.c (vect_match_slp_patterns_2, vect_match_slp_patterns, vect_analyze_slp): Pass along cache. (compatible_calls_p): Expose. * tree-vectorizer.h (compatible_calls_p, slp_node_hash, slp_compat_nodes_map_t): New. (class vect_pattern): Update signatures include new cache. gcc/testsuite/ChangeLog: PR tree-optimization/102819 PR tree-optimization/103169 * g++.dg/vect/pr99149.cc: xfail for now. * gcc.dg/vect/complex/pr102819-1.c: New test. * gcc.dg/vect/complex/pr102819-2.c: New test. * gcc.dg/vect/complex/pr102819-3.c: New test. * gcc.dg/vect/complex/pr102819-4.c: New test. * gcc.dg/vect/complex/pr102819-5.c: New test. * gcc.dg/vect/complex/pr102819-6.c: New test. * gcc.dg/vect/complex/pr102819-7.c: New test. * gcc.dg/vect/complex/pr102819-8.c: New test. * gcc.dg/vect/complex/pr102819-9.c: New test. * gcc.dg/vect/complex/pr103169.c: New test. --- inline copy of patch --- diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index 9ec051e94e10cca9eec2773e1b8c01b74b6ea4db..ad06b02d36876082afe4c3f3fb51887f7a522b23 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -6325,12 +6325,13 @@ Perform a vector multiply and accumulate that is semantically the same as a multiply and accumulate of complex numbers. @smallexample - complex TYPE c[N]; - complex TYPE a[N]; - complex TYPE b[N]; + complex TYPE op0[N]; + complex TYPE op1[N]; + complex TYPE op2[N]; + complex TYPE op3[N]; for (int i = 0; i < N; i += 1) @{ - c[i] += a[i] * b[i]; + op0[i] = op1[i] * op2[i] + op3[i]; @} @end smallexample @@ -6348,12 +6349,13 @@ the same as a multiply and accumulate of complex numbers where the second multiply arguments is conjugated. @smallexample - complex TYPE c[N]; - complex TYPE a[N]; - complex TYPE b[N]; + complex TYPE op0[N]; + complex TYPE op1[N]; + complex TYPE op2[N]; + complex TYPE op3[N]; for (int i = 0; i < N; i += 1) @{ - c[i] +=
Re: [PING] Fix size of static array in gcc.dg/vect/vect-simd-20.c
> On 20 Dec 2021, at 16:32, Jeff Law wrote: > > > > On 12/17/2021 4:21 AM, Olivier Hainque via Gcc-patches wrote: >> Hello, >> >> Ping for https://gcc.gnu.org/pipermail/gcc-patches/2021-November/583222.html >> >> please ? >> >> Thanks in advance! > OK for the trunk. > Sorry it got lost. Thanks Jeff, no pb at all. All busy :)
[PATCH] libgomp, OpenMP, nvptx: Low-latency memory allocator
This patch is submitted now for review and so I can commit a backport it to the OG11 branch, but isn't suitable for mainline until stage 1. The patch implements support for omp_low_lat_mem_space and omp_low_lat_mem_alloc on NVPTX offload devices. The omp_pteam_mem_alloc, omp_cgroup_mem_alloc and omp_thread_mem_alloc allocators are also configured to use this space (this to match the current or intended behaviour in other toolchains). The memory is drawn from the ".shared" space that is accessible only from within the team in which it is allocated, and which effectively ceases to exist when the kernel exits. By default, 8 KiB of space is reserved for each team at launch time. This can be adjusted, at runtime, via a new environment variable "GOMP_NVPTX_LOWLAT_POOL". Reserving a larger amount may limit the number of teams that can be run in parallel (due to hardware limitations). Conversely, reducing the allocation may increase the number of teams that can be run in parallel. (I have not yet attempted to tune the default too precisely.) The actual maximum size will vary according to the available hardware and the number of variables that the compiler has placed in .shared space. The allocator implementation is designed to add no extra space-overhead than omp_alloc already does (aside from rounding allocations up to a multiple of 8 bytes), thus the internal free and realloc must be told how big the original allocation was. The free algorithm maintains an in-order linked-list of free memory chunks. Memory is allocated on a first-fit basis. If the allocation fails the NVPTX allocator returns NULL and omp_alloc handles the fall-back. Now that this is a thing that is likely to happen (low-latency memory is small) this patch also implements appropriate fall-back modes for the predefined allocators (fall-back for custom allocators already worked). In order to support the %dynamic_smem_size PTX feature is is necessary to bump the minimum supported PTX version from 3.1 (~2013) to 4.1 (~2014). OK for stage 1? Andrewlibgomp, nvptx: low-latency memory allocator This patch adds support for allocating low-latency ".shared" memory on NVPTX GPU device, via the omp_low_lat_mem_space and omp_alloc. The memory can be allocated, reallocated, and freed using a basic but fast algorithm, is thread safe and the size of the low-latency heap can be configured using the GOMP_NVPTX_LOWLAT_POOL environment variable. The use of the PTX dynamic_smem_size feature means that the minimum version requirement is now bumped to 4.1 (still old at this point). gcc/ChangeLog: * config/nvptx/nvptx.c (nvptx_file_start): Bump minimum PTX version to 4.1. libgomp/ChangeLog: * allocator.c (MEMSPACE_ALLOC): New macro. (MEMSPACE_CALLOC): New macro. (MEMSPACE_REALLOC): New macro. (MEMSPACE_FREE): New macro. (dynamic_smem_size): New constants. (omp_alloc): Use MEMSPACE_ALLOC. Implement fall-backs for predefined allocators. (omp_free): Use MEMSPACE_FREE. (omp_calloc): Use MEMSPACE_CALLOC. Implement fall-backs for predefined allocators. (omp_realloc): Use MEMSPACE_REALLOC. Implement fall-backs for predefined allocators. * config/nvptx/team.c (__nvptx_lowlat_heap_root): New variable. (__nvptx_lowlat_pool): New asm varaible. (gomp_nvptx_main): Initialize the low-latency heap. * plugin/plugin-nvptx.c (lowlat_pool_size): New variable. (GOMP_OFFLOAD_init_device): Read the GOMP_NVPTX_LOWLAT_POOL envvar. (GOMP_OFFLOAD_run): Apply lowlat_pool_size. * config/nvptx/allocator.c: New file. * testsuite/libgomp.c/allocators-1.c: New test. * testsuite/libgomp.c/allocators-2.c: New test. * testsuite/libgomp.c/allocators-3.c: New test. * testsuite/libgomp.c/allocators-4.c: New test. * testsuite/libgomp.c/allocators-5.c: New test. * testsuite/libgomp.c/allocators-6.c: New test. diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index ff44d9fdbef..9bc26d7de0c 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -5409,7 +5409,7 @@ nvptx_file_start (void) else if (TARGET_PTX_6_3) fputs ("\t.version\t6.3\n", asm_out_file); else -fputs ("\t.version\t3.1\n", asm_out_file); +fputs ("\t.version\t4.1\n", asm_out_file); if (TARGET_SM80) fputs ("\t.target\tsm_80\n", asm_out_file); else if (TARGET_SM75) diff --git a/libgomp/allocator.c b/libgomp/allocator.c index deebb6a79fa..b14f991c148 100644 --- a/libgomp/allocator.c +++ b/libgomp/allocator.c @@ -34,6 +34,38 @@ #define omp_max_predefined_alloc omp_thread_mem_alloc +/* These macros may be overridden in config//allocator.c. */ +#ifndef MEMSPACE_ALLOC +#define MEMSPACE_ALLOC(MEMSPACE, SIZE) \ + ((void)MEMSPACE, malloc (SIZE)) +#endif +#ifndef MEMSPACE_CALLOC +#define MEMSPACE_CALLOC(MEMSPACE, SIZE) \ +
Re: [PATCH, rs6000] Implement mffscrni pattern
On Mon, Dec 20, 2021 at 12:56 AM HAO CHEN GUI wrote: > > Hi, > I modified the patch according to David and Segher's advice. > > This patch defines a pattern for mffscrni. If the RN is a constant, it can > call > gen_rs6000_mffscrni directly. The "rs6000-builtin-new.def" defines prototype > for builtin arguments. > The pattern "rs6000_set_fpscr_rn" is then broken as the mode of its argument > is DI while its > corresponding builtin has a const int argument. The patch also fixed it. > > Bootstrapped and tested on powerpc64-linux BE and LE with no regressions. > Is this okay for trunk? > Any recommendations? Thanks a lot. > > ChangeLog > 2021-12-17 Haochen Gui > > gcc/ > * config/rs6000/predicates.md (u2bit_cint_operand): Defined. > * config/rs6000/rs6000-call.c > (rs6000_expand_set_fpscr_rn_builtin): Not copy argument to a reg if > it's a constant. The pattern for constant can be recognized now. > * config/rs6000/rs6000.md (UNSPECV_MFFSCRNI): Defined. > (rs6000_mffscrni): Defined. > (rs6000_set_fpscr_rn): Change the type of operand[0] form DI to SI. > Call gen_rs6000_mffscrni when operand[0] is a const int[0,3]. > > gcc/testsuite/ > * gcc.target/powerpc/mffscrni_p9.c: New testcase for mffscrni. > * gcc.target/powerpc/test_fpscr_rn_builtin.c: Modify the test cases to > test mffscrn and mffscrni separately. This revised patch is okay. Thanks, David > > > patch.diff > diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md > index f216ffdf410..b10b4ce6065 100644 > --- a/gcc/config/rs6000/predicates.md > +++ b/gcc/config/rs6000/predicates.md > @@ -219,6 +219,11 @@ (define_predicate "u1bit_cint_operand" >(and (match_code "const_int") > (match_test "INTVAL (op) >= 0 && INTVAL (op) <= 1"))) > > +;; Return 1 if op is an unsigned 2-bit constant integer. > +(define_predicate "u2bit_cint_operand" > + (and (match_code "const_int") > + (match_test "INTVAL (op) >= 0 && INTVAL (op) <= 3"))) > + > ;; Return 1 if op is a unsigned 3-bit constant integer. > (define_predicate "u3bit_cint_operand" >(and (match_code "const_int") > diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c > index d9736eaf21c..81261a0f24d 100644 > --- a/gcc/config/rs6000/rs6000-call.c > +++ b/gcc/config/rs6000/rs6000-call.c > @@ -9610,13 +9610,15 @@ rs6000_expand_set_fpscr_rn_builtin (enum insn_code > icode, tree exp) > compile time if the argument is a variable. The least significant two > bits of the argument, regardless of type, are used to set the rounding > mode. All other bits are ignored. */ > - if (CONST_INT_P (op0) && !const_0_to_3_operand(op0, VOIDmode)) > + if (CONST_INT_P (op0)) > { > - error ("Argument must be a value between 0 and 3."); > - return const0_rtx; > + if (!const_0_to_3_operand (op0, VOIDmode)) > + { > + error ("Argument must be a value between 0 and 3."); > + return const0_rtx; > + } > } > - > - if (! (*insn_data[icode].operand[0].predicate) (op0, mode0)) > + else if (! (*insn_data[icode].operand[0].predicate) (op0, mode0)) > op0 = copy_to_mode_reg (mode0, op0); > >pat = GEN_FCN (icode) (op0); > diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md > index 6bec2bddbde..b18746af7ea 100644 > --- a/gcc/config/rs6000/rs6000.md > +++ b/gcc/config/rs6000/rs6000.md > @@ -177,6 +177,7 @@ (define_c_enum "unspecv" > UNSPECV_MFFS; Move from FPSCR > UNSPECV_MFFSL ; Move from FPSCR light instruction version > UNSPECV_MFFSCRN ; Move from FPSCR float rounding mode > + UNSPECV_MFFSCRNI; Move from FPSCR float rounding mode with imm > UNSPECV_MFFSCDRN; Move from FPSCR decimal float rounding mode > UNSPECV_MTFSF ; Move to FPSCR Fields 8 to 15 > UNSPECV_MTFSF_HI; Move to FPSCR Fields 0 to 7 > @@ -6315,6 +6316,14 @@ (define_insn "rs6000_mffscrn" > "mffscrn %0,%1" >[(set_attr "type" "fp")]) > > +(define_insn "rs6000_mffscrni" > + [(set (match_operand:DF 0 "gpc_reg_operand" "=d") > + (unspec_volatile:DF [(match_operand:SI 1 "u2bit_cint_operand" "n")] > + UNSPECV_MFFSCRNI))] > + "TARGET_P9_MISC" > + "mffscrni %0,%1" > + [(set_attr "type" "fp")]) > + > (define_insn "rs6000_mffscdrn" >[(set (match_operand:DF 0 "gpc_reg_operand" "=d") > (unspec_volatile:DF [(const_int 0)] UNSPECV_MFFSCDRN)) > @@ -6324,7 +6333,7 @@ (define_insn "rs6000_mffscdrn" >[(set_attr "type" "fp")]) > > (define_expand "rs6000_set_fpscr_rn" > - [(match_operand:DI 0 "reg_or_cint_operand")] > + [(match_operand:SI 0 "reg_or_cint_operand")] >"TARGET_HARD_FLOAT" > { >rtx tmp_df = gen_reg_rtx (DFmode); > @@ -6333,9 +6342,15 @@ (define_expand "rs6000_set_fpscr_rn" > new rounding mode bits from operands[0][62:63]
Re: [PATCH 1/2] libphobos: fix CET for non-glibc targets
Excerpts from ibuc...@gdcproject.org's message of December 20, 2021 8:56 am: >> On 20/12/2021 01:08 Alex Xu (Hello71) via Gcc-patches >> wrote: >> >> >> On musl, linking against libphobos fails because it requires ucontext >> but is not explicitly linked against it. This is caused by configure >> assuming that it is implemented in assembly, but it is actually not >> implemented. This silently works on other libcs because context API does >> not require an external library. > > Thanks. > > Looks reasonable to me, also for backporting to gcc-11, which also has the > same CET support. > > Iain. > Yes, we noticed this first on gcc 11. I tested that these patches fix the issue on gcc 11, and since nothing seems to have changed, I think the same problem exists and will be fixed by these patches on master. Cheers, Alex.
Re: [PING] Fix size of static array in gcc.dg/vect/vect-simd-20.c
On 12/17/2021 4:21 AM, Olivier Hainque via Gcc-patches wrote: Hello, Ping for https://gcc.gnu.org/pipermail/gcc-patches/2021-November/583222.html please ? Thanks in advance! OK for the trunk. Sorry it got lost. jeff
Re: [PATCH] c-family: Have -Wformat-diag accept "decl-specifier" [PR103758]
On 12/17/2021 3:59 PM, Marek Polacek via Gcc-patches wrote: I'm tired of seeing cp/parser.c:15923:55: warning: misspelled term 'decl' in format; use 'declaration' instead [-Wformat-diag] cp/parser.c:15925:57: warning: misspelled term 'decl' in format; use 'declaration' instead [-Wformat-diag] every time I compile cp/parser.c, which happens...a lot. I'd like my compilation to be free of warnings, otherwise I'm going to miss some important ones. "decl-specifiers" is a C++ grammar term; it is not actual code, so should not be wrapped with %< %>. I hope we can accept it as an exception in check_tokens. It was surrounded by %< %> in cp_parser_decl_specifier_seq, so fix that. In passing, fix a misspelling in missspellings. Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk? In fact, I think I'd like to backport to 11 too, so that eventually even my system compiler stops warning about this. PR c++/103758 gcc/c-family/ChangeLog: * c-format.c (check_tokens): Accept "decl-specifier*". gcc/cp/ChangeLog: * parser.c (cp_parser_decl_specifier_seq): Replace % with %qD. gcc/testsuite/ChangeLog: * g++.dg/cpp0x/constexpr-condition.C: Adjust dg-error. OK. I know, cp/, but it seems trivial enough that I don't mind ACKing this one. jeff
Re: [PATCH] config.gcc: Obsolete m32c-rtems target
On 12/18/2021 11:01 AM, Joel Sherrill wrote: On Sat, Dec 18, 2021 at 10:13 AM Joel Sherrill wrote: On Fri, Dec 17, 2021, 9:57 PM Jeff Law wrote: On 12/17/2021 9:10 AM, Joel Sherrill wrote: --- gcc/config.gcc | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/config.gcc b/gcc/config.gcc index c8824367b13..fe93a72a16c 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -252,6 +252,7 @@ case ${target} in | cr16-*-* \ | hppa[12]*-*-hpux10* \ | hppa[12]*-*-hpux11* \ + | m32c-*-rtems* \ ) if test "x$enable_obsolete" != xyes; then echo "*** Configuration ${target} is obsolete." >&2 OK. Given that last time I tried, I couldn't even get m32c to build newlib, I'm not terribly surprised you're deprecating it from rtems. We removed it over a while back. The last release branch with it was using gcc 7 or 8. We had no indication there were any users and it led to removing some alternative implementations optimised for 16 bit CPUs. I would support deprecation of m32c-*. No complaint here. Can I commit the gcc and wwwdocs patch for m32c-rtems and let Jeff or someone follow up completely eliminating m32c? Sure. jeff
[PATCH] OpenMP front-end: allow requires dynamic_allocators
Hi all, This patch removes the "sorry" message for the OpenMP "requires dynamic_allocators" feature in C, C++ and Fortran. The clause is supposed to state that the user code will not work without the omp_alloc/omp_free and omp_init_allocator/omp_destroy_allocator and these things *are* present, so there should be no problem allowing it. I can't see any reason for our implementation to *do* anything with this statement -- it's fine for the allocators to work the same with or without it. I think this patch ought to be fine for GCC 12, but if not it can wait until stage 1 opens. I shall backport this to OG11 shortly. OK? AndrewOpenMP: allow requires dynamic_allocators There's no need to reject the dynamic_allocators requires directive because we actually do support the feature, and it doesn't have to actually "do" anything. gcc/c/ChangeLog: * c-parser.c (c_parser_omp_requires): Don't "sorry" dynamic_allocators. gcc/cp/ChangeLog: * parser.c (cp_parser_omp_requires): Don't "sorry" dynamic_allocators. gcc/fortran/ChangeLog: * openmp.c (gfc_match_omp_requires): Don't "sorry" dynamic_allocators. gcc/testsuite/ChangeLog: * gfortran.dg/gomp/requires-8.f90: Reinstate dynamic allocators requirement. diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index d7e5f051ac0..808a73f1038 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -22581,7 +22581,7 @@ c_parser_omp_requires (c_parser *parser) c_parser_skip_to_pragma_eol (parser, false); return; } - if (p) + if (p && this_req != OMP_REQUIRES_DYNAMIC_ALLOCATORS) sorry_at (cloc, "%qs clause on % directive not " "supported yet", p); if (p) diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index c2564e51e41..d55c9675c4e 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -46421,7 +46421,7 @@ cp_parser_omp_requires (cp_parser *parser, cp_token *pragma_tok) cp_parser_skip_to_pragma_eol (parser, pragma_tok); return false; } - if (p) + if (p && this_req != OMP_REQUIRES_DYNAMIC_ALLOCATORS) sorry_at (cloc, "%qs clause on % directive not " "supported yet", p); if (p) diff --git a/gcc/fortran/openmp.c b/gcc/fortran/openmp.c index 2036bc1349f..f47a44f7539 100644 --- a/gcc/fortran/openmp.c +++ b/gcc/fortran/openmp.c @@ -5373,7 +5373,8 @@ gfc_match_omp_requires (void) else goto error; - if (requires_clause & ~OMP_REQ_ATOMIC_MEM_ORDER_MASK) + if (requires_clause & ~(OMP_REQ_ATOMIC_MEM_ORDER_MASK + | OMP_REQ_DYNAMIC_ALLOCATORS)) gfc_error_now ("Sorry, %qs clause at %L on REQUIRES directive is not " "yet supported", clause, _loc); if (!gfc_omp_requires_add_clause (requires_clause, clause, _loc, NULL)) diff --git a/gcc/testsuite/gfortran.dg/gomp/requires-8.f90 b/gcc/testsuite/gfortran.dg/gomp/requires-8.f90 index 3c32ae9860e..eadfcaf8609 100644 --- a/gcc/testsuite/gfortran.dg/gomp/requires-8.f90 +++ b/gcc/testsuite/gfortran.dg/gomp/requires-8.f90 @@ -4,7 +4,7 @@ contains subroutine foo interface subroutine bar2 - !$!omp requires dynamic_allocators + !$omp requires dynamic_allocators end subroutine end interface !$omp target
Re: [PATCH] ix86: Don't match the 'm' constraint on x86_64_general_operand
On Sun, Dec 19, 2021 at 12:06:30PM -0800, H.J. Lu via Gcc-patches wrote: > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -1199,6 +1199,12 @@ >(and (match_operand 0 "memory_operand") > (not (match_test "x86_extended_reg_mentioned_p (op)" > > +;; Return true if OP is a memory operand representable on ix86. > +(define_predicate "ix86_memory_operand" > + (and (match_operand 0 "memory_operand") > + (ior (match_test "mode == QImode || mode == HImode") > + (match_operand 0 "x86_64_general_operand" I must be missing something, but how can this work? x86_64_general_operand is: (define_predicate "x86_64_general_operand" (if_then_else (match_test "TARGET_64BIT") (ior (match_operand 0 "nonimmediate_operand") (match_operand 0 "x86_64_immediate_operand")) (match_operand 0 "general_operand"))) and so for non-immediates like MEMs it is just a normal memory_operand. Jakub
Re: [PATCH v3 06/12] LoongArch Port: Builtin macros.
On Mon, 2021-12-20 at 15:35 +0800, Xi Ruoyao via Gcc-patches wrote: > Hi, > > I've bootstraped the patch with my GNU-stack fix > (https://github.com/loongson/gcc/pull/62) and --enable-werror-always. > Bootstrap succeeded, but with some warnings: > > ../../gcc/config/loongarch/loongarch.md:3205:1: warning: operand 0 missing > mode? > ../../gcc/config/loongarch/loongarch.md:3282:1: warning: operand 1 missing > mode? > ../../gcc/config/loongarch/loongarch.md:3329:1: warning: operand 1 missing > mode? > ../../gcc/config/loongarch/loongarch.md:3392:1: warning: operand 0 missing > mode? > ../../gcc/config/loongarch/loongarch.md:3470:1: warning: operand 1 missing > mode? > ../../gcc/config/loongarch/loongarch.md:3520:1: warning: operand 1 missing > mode? > > A similar issue exists for MIPS (PR101593) and I've not figured out how > to silence these warnings. > > I'm trying to run GCC regression test on LoongArch, but all my previous > tries couldn't finish because a kernel panic was triggered. The kernel > was built from source pulled from > https://github.com/loongson/linux/tree/loongarch-next. > > I'm not sure if this is a bug in Huacai's patches, or a bug in mainline > kernel, or even a hardware issue. If you have some idea about the panic > please inform me (this is off-topic though). Well, I replaced the memory module and the system seems running stable... Test summary is attached. -- Xi Ruoyao School of Aerospace Science and Technology, Xidian University Native configuration is loongarch64-unknown-linux-gnu === g++ tests === Running target unix FAIL: tmpdir-g++.dg-struct-layout-1/t033 cp_compat_x_tst.o-cp_compat_y_tst.o execute FAIL: c-c++-common/spec-barrier-1.c -std=gnu++98 (test for excess errors) FAIL: c-c++-common/spec-barrier-1.c -std=gnu++14 (test for excess errors) FAIL: c-c++-common/spec-barrier-1.c -std=gnu++17 (test for excess errors) FAIL: c-c++-common/spec-barrier-1.c -std=gnu++2a (test for excess errors) FAIL: g++.dg/tree-ssa/pr90883.C scan-tree-dump dse1 "Deleted redundant store: .*.a = {}" FAIL: g++.dg/warn/Waddress-5.C -std=gnu++98 (test for excess errors) FAIL: g++.dg/warn/Waddress-5.C -std=gnu++14 (test for excess errors) FAIL: g++.dg/warn/Waddress-5.C -std=gnu++17 (test for excess errors) FAIL: g++.dg/warn/Waddress-5.C -std=gnu++2a (test for excess errors) FAIL: g++.dg/lto/pr64076 cp_lto_pr64076_0.o-cp_lto_pr64076_1.o link, -O0 -flto -shared -fPIC FAIL: g++.dg/modules/bad-mapper-3.C -std=c++17 at line 3 (test for errors, line ) FAIL: g++.dg/modules/bad-mapper-3.C -std=c++17 (test for excess errors) FAIL: g++.dg/modules/bad-mapper-3.C -std=c++2a at line 3 (test for errors, line ) FAIL: g++.dg/modules/bad-mapper-3.C -std=c++2a (test for excess errors) FAIL: g++.dg/modules/bad-mapper-3.C -std=c++2b at line 3 (test for errors, line ) FAIL: g++.dg/modules/bad-mapper-3.C -std=c++2b (test for excess errors) FAIL: g++.dg/modules/xtreme-header-3_a.H -std=c++17 (internal compiler error: tree check: expected var_decl or function_decl or field_decl or type_decl or concept_decl or template_decl, have namespace_decl in get_merge_kind, at cp/module.cc:10074) FAIL: g++.dg/modules/xtreme-header-3_a.H -std=c++17 (test for excess errors) FAIL: g++.dg/modules/xtreme-header-3_a.H module-cmi (gcm.cache/\$srcdir/g++.dg/modules/xtreme-header-3_a.H.gcm) FAIL: g++.dg/modules/xtreme-header-3_a.H -std=c++2a (internal compiler error: tree check: expected var_decl or function_decl or field_decl or type_decl or concept_decl or template_decl, have namespace_decl in get_merge_kind, at cp/module.cc:10074) FAIL: g++.dg/modules/xtreme-header-3_a.H -std=c++2a (test for excess errors) FAIL: g++.dg/modules/xtreme-header-3_a.H module-cmi (gcm.cache/\$srcdir/g++.dg/modules/xtreme-header-3_a.H.gcm) FAIL: g++.dg/modules/xtreme-header-3_a.H -std=c++2b (internal compiler error: tree check: expected var_decl or function_decl or field_decl or type_decl or concept_decl or template_decl, have namespace_decl in get_merge_kind, at cp/module.cc:10074) FAIL: g++.dg/modules/xtreme-header-3_a.H -std=c++2b (test for excess errors) FAIL: g++.dg/modules/xtreme-header-3_a.H module-cmi (gcm.cache/\$srcdir/g++.dg/modules/xtreme-header-3_a.H.gcm) FAIL: g++.dg/modules/xtreme-header-3_b.C -std=c++17 (test for excess errors) FAIL: g++.dg/modules/xtreme-header-3_b.C -std=c++2a (test for excess errors) FAIL: g++.dg/modules/xtreme-header-3_b.C -std=c++2b (test for excess errors) FAIL: g++.dg/modules/xtreme-header-3_c.C -std=c++17 (test for excess errors) FAIL: g++.dg/modules/xtreme-header-3_c.C -std=c++2a (test for excess errors) FAIL: g++.dg/modules/xtreme-header-3_c.C -std=c++2b (test for excess errors) FAIL: g++.dg/modules/xtreme-header-5_a.H -std=c++2a (internal compiler error: tree check: expected var_decl or function_decl or field_decl or type_decl or concept_decl or template_decl, have namespace_decl in get_merge_kind, at cp/module.cc:10074) FAIL:
Re: [PATCH 1/2] libphobos: fix CET for non-glibc targets
> On 20/12/2021 01:08 Alex Xu (Hello71) via Gcc-patches > wrote: > > > On musl, linking against libphobos fails because it requires ucontext > but is not explicitly linked against it. This is caused by configure > assuming that it is implemented in assembly, but it is actually not > implemented. This silently works on other libcs because context API does > not require an external library. Thanks. Looks reasonable to me, also for backporting to gcc-11, which also has the same CET support. Iain.
Re: [GCC-WWWDocs v1] htdocs/gcc-12/changes.html: Obsolete m32c-*-rtems*
On Fri, 17 Dec 2021, Joel Sherrill wrote: > + > + > +The m32c*-*-rtems* configuration has been obsoleted and will > +be removed in a future release. Aye. Thank you! Gerald
Re: [PATCH] config.gcc: Obsolete m32c-rtems target
On Sat, 18 Dec 2021, Joel Sherrill wrote: > Can I commit the gcc and wwwdocs patch for m32c-rtems and let Jeff or > someone follow up completely eliminating m32c? Sure from my side (wwwdocs, and you probably could declare the removal as "obvious" and/or approved by Jeff). Gerald
Re: [PATCH] Fix alignment of stack slots for overaligned types [PR103500]
Alex Coplan via Gcc-patches writes: > Hi, > > This fixes PR103500 i.e. ensuring that stack slots for > passed-by-reference overaligned types are appropriately aligned. For the > testcase: > > typedef struct __attribute__((aligned(32))) { > long x,y; > } S; > S x; > void f(S); > void g(void) { f(x); } > > on AArch64, we currently generate (at -O2): > > g: > adrpx1, .LANCHOR0 > add x1, x1, :lo12:.LANCHOR0 > stp x29, x30, [sp, -48]! > mov x29, sp > ldp q0, q1, [x1] > add x0, sp, 16 > stp q0, q1, [sp, 16] > bl f > ldp x29, x30, [sp], 48 > ret > > so the stack slot for the passed-by-reference copy of the structure is > at sp + 16, and the sp is only guaranteed to be 16-byte aligned, so the > structure is only 16-byte aligned. The PCS requires the structure to be > 32-byte aligned. After this patch, we generate: > > g: > adrpx1, .LANCHOR0 > add x1, x1, :lo12:.LANCHOR0 > stp x29, x30, [sp, -64]! > mov x29, sp > add x0, sp, 47 > ldp q0, q1, [x1] > and x0, x0, -32 > stp q0, q1, [x0] > bl f > ldp x29, x30, [sp], 64 > ret > > i.e. we ensure 32-byte alignment for the struct. > > The approach taken here is similar to that in > function.c:assign_parm_setup_block where it handles the case for > DECL_ALIGN (parm) > MAX_SUPPORTED_STACK_ALIGNMENT. This in turn is > similar to the approach taken in cfgexpand.c:expand_stack_vars (where > the function calls get_dynamic_stack_size) which is the code that > handles the alignment for overaligned structures as addressable local > variables (see the related case discussed in the PR). A difference with the latter is that cfgexpand (AFAICT) always honours the DECL/TYPE_ALIGN, with LOCAL_DECL_ALIGNMENT supposedly only increasing the alignment for efficiency reasons (rather than decreasing it). So… > This patch also updates the aapcs64 test mentioned in the PR to avoid > the frontend folding away the alignment check. I've confirmed that the > execution test actually fails on aarch64-linux-gnu prior to the patch > being applied and passes afterwards. > > Bootstrapped and regtested on aarch64-linux-gnu, x86_64-linux-gnu, and > arm-linux-gnueabihf: no regressions. > > I'd appreciate any feedback. Is it OK for trunk? > > Thanks, > Alex > > gcc/ChangeLog: > > PR middle-end/103500 > * function.c (get_stack_local_alignment): Align BLKmode overaligned > types to the alignment required by the type. > (assign_stack_temp_for_type): Handle BLKmode overaligned stack > slots by allocating a larger-than-necessary buffer and aligning > the address within appropriately. > > gcc/testsuite/ChangeLog: > > PR middle-end/103500 > * gcc.target/aarch64/aapcs64/rec_align-8.c (test_pass_by_ref): > Prevent the frontend from folding our alignment check away by > using snprintf to store the pointer into a string and recovering > it with sscanf. > > diff --git a/gcc/function.c b/gcc/function.c > index 61b3bd036b8..5ed722ab959 100644 > --- a/gcc/function.c > +++ b/gcc/function.c > @@ -278,7 +278,9 @@ get_stack_local_alignment (tree type, machine_mode mode) >unsigned int alignment; > >if (mode == BLKmode) > -alignment = BIGGEST_ALIGNMENT; > +alignment = (type && TYPE_ALIGN (type) > MAX_SUPPORTED_STACK_ALIGNMENT) > + ? TYPE_ALIGN (type) > + : BIGGEST_ALIGNMENT; …I'm not sure about this calculation. Why do we only honour TYPE_ALIGN if it's greater than MAX_SUPPORTED_STACK_ALIGNMENT, and fall all the way back to BIGGEST_ALIGNMENT otherwise? It looks like on nvptx this would have the effect of honouring (say) 2048-byte alignment, but not 32-byte alignment (which falls between BIGGEST_ALIGNMENT and MAX_SUPPORTED_STACK_ALIGNMENT). Also, what about the non-BLKmode case? A type's mode cannot be more aligned than the type, but a type is allowed to be more aligned than its mode. E.g.: typedef unsigned int V __attribute__((vector_size(32))); typedef struct __attribute__((aligned(32))) { V x; } S; S x; void f(S); void g(void) { f(x); } doesn't seem to be handled by the patch. It's possible to create variations on this on aarch64 up to 2048 bits using SVE and -msve-vector-bits=N. Perhaps we should treat the old alignment calculation as a minimum and take the max of that and TYPE_ALIGN, where TYPE_ALIGN is given. >else > alignment = GET_MODE_ALIGNMENT (mode); > > @@ -872,21 +874,35 @@ assign_stack_temp_for_type (machine_mode mode, > poly_int64 size, tree type) > >p = ggc_alloc (); > > - /* We are passing an explicit alignment request to assign_stack_local. > - One side effect of that is assign_stack_local will not round SIZE > - to ensure the frame offset remains suitably aligned. > - > - So for requests which depended on the rounding
Re: [PATCH take #2] x86_64: Improve code expanded for highpart multiplications.
On Mon, Dec 20, 2021 at 12:26 PM Roger Sayle wrote: > > > Hi Uros, > Many thanks for the review. Here's a revised patch incorporating your > suggestion to use a single define_insn with a mode iterator instead of two > new near identical define_insns for SImode and DImode. I initially tried > SWI48, but a testsuite failure of pr82418.c revealed that it's more > efficient on TARGET_64BIT to use a full width multiplication than a > SImode highpart multiplication, i.e. we only want ?mulsi3_highpart on > !TARGET_64BIT, so this revised patch uses a DWIH mode iterator. > > This patch has been tested on x86_64-pc-linux-gnu by make bootstrap and > make -k check (with and without RUNTESTFLAGS="--target_board='unix{-m32}'") > with no new failures. Ok for mainline? > > > 2021-12-20 Roger Sayle > Uroš Bizjak > > gcc/ChangeLog > * config/i386/i386.md (any_mul_highpart): New code iterator. > (sgnprefix, s): Add attribute support for [su]mul_highpart. > (mul3_highpart): Delete expander. > (mul3_highpart, mulsi32_highpart_zext): > New define_insn patterns. > (define_peephole2): Tweak the register allocation for the above > instructions after reload. > > gcc/testsuite/ChangeLog > * gcc.target/i386/smuldi3_highpart.c: New test case. OK. Thanks, Uros. > > Thanks again, > Roger > -- > > > -Original Message- > > From: Uros Bizjak > > Sent: 13 December 2021 11:53 > > To: Roger Sayle > > Cc: GCC Patches > > Subject: Re: [PATCH] x86_64: Improve code expanded for highpart > > multiplications. > > > > On Fri, Dec 10, 2021 at 12:58 PM Roger Sayle > > wrote: > > > > > > > > > While working on a middle-end patch to more aggressively use highpart > > > multiplications on targets that support them, I noticed that the RTL > > > expanded by the x86 backend interacts poorly with register allocation > > > leading to suboptimal code. > > > > > > For the testcase, > > > typedef int __attribute ((mode(TI))) ti_t; long foo(long x) { > > > return ((ti_t)x * 19065) >> 64; > > > } > > > > > > we'd like to avoid: > > > foo:movq%rdi, %rax > > > movl$19065, %edx > > > imulq %rdx > > > movq%rdx, %rax > > > ret > > > > > > and would prefer: > > > foo:movl$19065, %eax > > > imulq %rdi > > > movq%rdx, %rax > > > ret > > > > > > This patch provides a pair of peephole2 transformations to tweak the > > > spills generated by reload, and at the same time replaces the current > > > define_expand with define_insn patterns using the new [su]mul_highpart > > > RTX codes. I've left the old-style patterns in the machine > > > description for the time being, but plan to remove these once my > > > planned middle-end improvements make them obsolete. > > > > > > This patch has been tested on x86_64-pc-linux-gnu (both with and > > > without the middle-end changes) by make bootstrap and make -k check > > > with no new failures. The new test case, which currently passes, > > > ensures that the code we generate isn't adversely affected by changes > > > outside > > the backend. > > > Ok for mainline? > > > > > > > > > 2021-12-10 Roger Sayle > > > > > > gcc/ChangeLog > > > * config/i386/i386.md (any_mul_highpart): New code iterator. > > > (sgnprefix, s): Add attribute support for [su]mul_highpart. > > > (mul3_highpart): Delete expander. > > > (muldi3_highpart, mulsi3_highpart, > > > mulsi32_highpart_zext): > > > New define_insn patterns. > > > (define_peephole2): Tweak the register allocation for the above > > > instructions after reload. > > > > > > gcc/testsuite/ChangeLog > > > * gcc.target/i386/smuldi3_highpart.c: New test case. > > > > Can you please merge muldi3_highpart and *mulsi3_highpart using > > SWI48 mode iterator? > > > > Uros. > > > > > > > > > > > Thanks in advance, > > > Roger > > > -- > > >
Re: [PATCH v3 03/12] LoongArch Port: Machine Decsription files.
On Fri, 2021-12-10 at 15:43 +0800, Chenghua Xu wrote: > + > +INT_MODE (OI, 32); > + > +/* Keep the OI modes from confusing the compiler into thinking > + that these modes could actually be used for computation. They are > + only holders for vectors during data movement. */ > +#define MAX_BITSIZE_MODE_ANY_INT (128) This can't prevent the compiler from using OImode for calculation. And to make things even worse, in wide-int.h we have: > #define WIDE_INT_MAX_ELTS \ > ((MAX_BITSIZE_MODE_ANY_INT + HOST_BITS_PER_WIDE_INT) / > HOST_BITS_PER_WIDE_INT) /* ... */ > class GTY(()) wide_int_storage > { > private: > HOST_WIDE_INT val[WIDE_INT_MAX_ELTS]; When the compiler uses OImode, this array will eventually be accessed out-of-bound, causing ICE on fp-uint64-convert-double-{1,2}.c. All other ports don't define it (and let genmode to determine the value automatically). I drafted https://github.com/loongson/gcc/pull/64 to remove the MAX_BITSIZE_MODE_ANY_INT definition. Not sure if it's the "proper" fix though. -- Xi Ruoyao School of Aerospace Science and Technology, Xidian University
Re: [patch, Fortran] Make REAL(KIND=16) detection more robust
Hi FX, I’m not opposed, but I’d really like to get this into the current branch. It is a much less invasive change than the power-ieee128 work. The only case I changed is the case where there is a kind 16, and there is a long double, but the long double is neither kind 10 neither kind 16. I don’t think there is such a platform currently (otherwise it wouldn’t have worked). I did a bootstrap and test on POWER, and it did not cause any regressions. So, OK for trunk. Thanks for the patch! Regards Thomas
Re: [PATCH] libiberty rust-demangle, ignore .suffix
Apologies for the delay, the email fell through the cracks somehow. The updated patch looks like it would work alright, only needs a couple tests, e.g.: https://github.com/rust-lang/rustc-demangle/blob/2811a1ad6f7c8bead2ef3671e4fdc10de1553e96/src/lib.rs#L413-L422 https://github.com/rust-lang/rustc-demangle/blob/2811a1ad6f7c8bead2ef3671e4fdc10de1553e96/src/v0.rs#L1442-L1444 Thanks, - Eddy B. On Tue, Dec 7, 2021, at 21:16, Mark Wielaard wrote: > Hi Eddy, > > On Fri, 2021-12-03 at 01:14 +0200, Eduard-Mihai Burtescu wrote: >> On Fri, Dec 3, 2021, at 00:07, Mark Wielaard wrote: >> > On Thu, Dec 02, 2021 at 07:35:17PM +0200, Eduard-Mihai Burtescu >> > wrote: >> > > That also means that for consistency, suffixes like these should >> > > be >> > > handled uniformly for both v0 and legacy (as rustc-demangle >> > > does), >> > > since LLVM doesn't distinguish. >> > >> > The problem with the legacy mangling is that dot '.' is a valid >> > character. That is why the patch only handles the v0 mangling case >> > (where dot '.' isn't valid). >> >> Thought so, that's an annoying complication - however, see later down >> why that's still not a blocker to the way rustc-demangle handles it. >> >> > > You may even be able to get Clang to generate C++ mangled symbols >> > > with ".llvm." suffixes, with enough application of LTO. This is >> > > not >> > > unlike GCC ".clone" suffixes, AFAIK. Sadly I don't think there's >> > > a >> > > way to handle both as "outside the symbol", without hardcoding >> > > ".llvm." in the implementation. >> > >> > We could use the scheme used by c++ where the .suffix is added as " >> > [clone .suffix]", it even handles multiple dots, where something >> > like >> > _Z3fooi.part.9.165493.constprop.775.31805 >> > demangles to >> > foo(int) [clone .part.9.165493] [clone .constprop.775.31805] >> > >> > I just don't think that is very useful and a little confusing. >> >> Calling it "clone" is a bit weird, but I just checked what rustc- >> demangle >> does for printing suffixes back out and it's not great either: >> - ".llvm." (and everything after it) is completely removed >> - any left-over suffixes (after demangling), if they start with ".", >> are >> not considered errors, but printed out verbatim after the >> demangling >> >> > > I don't recall the libiberty demangling API having any provisions >> > > for the demangler deciding that a mangled symbol "stops early", >> > > which would maybe allow for a more general solution. >> > >> > No, there indeed is no interface. We might introduce a new option >> > flag >> > for treating '.' as end of symbol. But do we really need that >> > flexibility? >> >> That's not what I meant - a v0 or legacy symbol is self-terminating >> in >> its parsing (or at the very least there are not dots allowed outside >> of a length-prefixed identifier), so that when you see the start of >> a valid mangled symbol, you can always find its end in the string, >> even when that end is half-way through (and is followed by suffixes >> or any other unrelated noise). >> >> What I was imagining is a way to return to the caller the number of >> chars from the start of the original string, that were demangled, >> letting the caller do something else with the rest of that string. >> (see below for how rustc-demangle already does something similar) >> >> > > Despite all that, if it helps in practice, I would still not mind >> > > this patch landing in its current form, I just wanted to share my >> > > perspective on the larger issue. >> > >> > Thanks for that. Do you happen to know what other rust demanglers >> > do? >> >> rustc-demangle's internal API returns a pair of the demangler and the >> "leftover" parts of the original string, after the end of the symbol. >> You can see here how that suffix is further checked, and kept: >> https://github.com/rust-lang/rustc-demangle/blob/2811a1ad6f7c8bead2ef3671e4fdc10de1553e96/src/lib.rs#L108-L138 > > Yes, returning a struct that returns the style detected, the demangled > string and the left over chars makes sense. But we don't have an > interface like that at the moment, and I am not sure we (currently) > have users who want this. > >> As mentioned above, ".llvm." is handled differently, just above the snippet >> linked - perhaps it was deemed too common to let it pollute the output. > > But that also makes it a slightly odd interface. I would imagine that > people would be interested in the .llvm. part. Now that just gets > dropped. > > Since we don't have an interface to return the suffix (and I find the > choice of dropping .llvm. but not other suffixes odd), I think we > should just simply always drop the .suffix. I understand now how to do > that for legacy symbols too, thanks for the hints. > > See the attached update to the patch. What do you think? > > Thanks, > > Mark > > Attachments: > * 0001-libiberty-rust-demangle-ignore-.suffix.patch
Re: Patch ping related to OpenMP
Thanks for a DWARF-related patch review (+ fix by yourself). Otherwise, still pending are the following OpenMP patches. The first one is a revised patch following the review comment and affects Fortran only. The second one is also a rather small & post-review revised patch. On 06.12.21 15:56, Tobias Burnus wrote: First, thanks for the four reviews. Secondly, I missed one patch – hence, reposted with all three pending patches: * Re: [PATCH] [gfortran] Add support for allocate clause (OpenMP 5.0). https://gcc.gnu.org/pipermail/gcc-patches/2021-November/584894.html and: On 01.12.21 17:34, Tobias Burnus wrote: * [PATCH] C, C++, Fortran, OpenMP: Add 'has_device_addr' clause to 'target' construct https://gcc.gnu.org/pipermail/gcc-patches/2021-November/585361.html * [PATCH 00/16] OpenMP: lvalues in "map" clauses and struct handling rework https://gcc.gnu.org/pipermail/gcc-patches/2021-November/585439.html Tobias - Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
Re: [PATCH] Leverage sysroot for VxWorks
On 10/12/2021 19.24, Olivier Hainque wrote: > For the toolchains we build, this is achieved with a few > configure options like: > > --with-sysroot > --with-build-sysroot=${WIND_BASE}/target So forward-porting our private patches up until 7bf710b5116 - libstdc++: Add support for '?' in linker script globs (i.e. the commit before this one) went without problems, with no changes required in our build scripts. Then when rebasing to this one, now known as f3f923e5139 - Leverage sysroot for VxWorks the build broke as expected because I'd been using somewhat different values of --with(-build)-sysroot. However, changing those configure options as indicated above again produced a working toolchain, so this is all good. > --with-specs=%{!sysroot=*:--sysroot=%:getenv(WIND_BASE /target)} However, I'm a little confused about the purpose of this one. First, shouldn't this be '%{!-sysroot=*}', i.e. with a leading dash, since the option is --sysroot ? But whether I use one or the other, it seems that the resulting compiler ignores a --sysroot argument; if I explicitly unexport WIND_BASE and manually add a --sysroot argument that should have the same effect as above, gcc fails with WIND_BASE not defined: $ echo $WIND_BASE /usr/powerpc-wrs-vxworks/wind_base $ export -n WIND_BASE $ powerpc-wrs-vxworks-gcc --sysroot=${WIND_BASE}/target -v -E - < /dev/null Using built-in specs. powerpc-wrs-vxworks-gcc: fatal error: environment variable 'WIND_BASE' not defined compilation terminated. The specs syntax doesn't explicitly list the negative form of %{S*:X}, and I can't find any in-tree examples (though it's rather hard to grep for), so I don't even know if this is supposed to work. It's not a big deal, we can just continue to define and export WIND_BASE, but it's probably best for long-term maintenance if our build scripts and configure options are aligned with what you do on your end, so I would just like to understand this fully. Rasmus
[PATCH][pushed] jit: Fix -Wodr warning
Pushed as obvious. Martin gcc/jit/libgccjit.c:3957:8: warning: type 'struct version_info' violates the C++ One Definition Rule [-Wodr] ../../gcc/jit/libgccjit.c:3957:8: warning: type 'struct version_info' violates the C++ One Definition Rule [-Wodr] 3957 | struct version_info ../../gcc/tree-ssa-loop-ivopts.c:181: note: a different type is defined in another translation unit 181 | struct version_info gcc/jit/ChangeLog: * libgccjit.c (struct version_info): Rename to jit_version_info. (struct jit_version_info): Likewise. (gcc_jit_version_major): Likewise. (gcc_jit_version_minor): Likewise. (gcc_jit_version_patchlevel): Likewise. --- gcc/jit/libgccjit.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gcc/jit/libgccjit.c b/gcc/jit/libgccjit.c index 5cb27a20d41..3d2d838dc4d 100644 --- a/gcc/jit/libgccjit.c +++ b/gcc/jit/libgccjit.c @@ -3954,11 +3954,11 @@ gcc_jit_context_new_rvalue_from_vector (gcc_jit_context *ctxt, static pthread_mutex_t version_mutex = PTHREAD_MUTEX_INITIALIZER; -struct version_info +struct jit_version_info { /* Default constructor. Populate via parse_basever, guarded by version_mutex. */ - version_info () + jit_version_info () { pthread_mutex_lock (_mutex); parse_basever (, , ); @@ -3974,21 +3974,21 @@ struct version_info extern int gcc_jit_version_major (void) { - version_info vi; + jit_version_info vi; return vi.major; } extern int gcc_jit_version_minor (void) { - version_info vi; + jit_version_info vi; return vi.minor; } extern int gcc_jit_version_patchlevel (void) { - version_info vi; + jit_version_info vi; return vi.patchlevel; } -- 2.34.1
[PATCH][pushed] opts: Support -Oz in -Ox option hints.
Pushed as obvious after testing. Cheers, Martin gcc/ChangeLog: * opts.c (default_options_optimization): Support -Oz in -Ox option hints. --- gcc/opts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/opts.c b/gcc/opts.c index f45ecc56726..cdd6463e49b 100644 --- a/gcc/opts.c +++ b/gcc/opts.c @@ -723,7 +723,7 @@ default_options_optimization (struct gcc_options *opts, const int optimize_val = integral_argument (opt->arg); if (optimize_val == -1) error_at (loc, "argument to %<-O%> should be a non-negative " - "integer, %, % or %"); + "integer, %, %, % or %"); else { opts->x_optimize = optimize_val; -- 2.34.1
[PATCH take #2] x86_64: Improve code expanded for highpart multiplications.
Hi Uros, Many thanks for the review. Here's a revised patch incorporating your suggestion to use a single define_insn with a mode iterator instead of two new near identical define_insns for SImode and DImode. I initially tried SWI48, but a testsuite failure of pr82418.c revealed that it's more efficient on TARGET_64BIT to use a full width multiplication than a SImode highpart multiplication, i.e. we only want ?mulsi3_highpart on !TARGET_64BIT, so this revised patch uses a DWIH mode iterator. This patch has been tested on x86_64-pc-linux-gnu by make bootstrap and make -k check (with and without RUNTESTFLAGS="--target_board='unix{-m32}'") with no new failures. Ok for mainline? 2021-12-20 Roger Sayle Uroš Bizjak gcc/ChangeLog * config/i386/i386.md (any_mul_highpart): New code iterator. (sgnprefix, s): Add attribute support for [su]mul_highpart. (mul3_highpart): Delete expander. (mul3_highpart, mulsi32_highpart_zext): New define_insn patterns. (define_peephole2): Tweak the register allocation for the above instructions after reload. gcc/testsuite/ChangeLog * gcc.target/i386/smuldi3_highpart.c: New test case. Thanks again, Roger -- > -Original Message- > From: Uros Bizjak > Sent: 13 December 2021 11:53 > To: Roger Sayle > Cc: GCC Patches > Subject: Re: [PATCH] x86_64: Improve code expanded for highpart > multiplications. > > On Fri, Dec 10, 2021 at 12:58 PM Roger Sayle > wrote: > > > > > > While working on a middle-end patch to more aggressively use highpart > > multiplications on targets that support them, I noticed that the RTL > > expanded by the x86 backend interacts poorly with register allocation > > leading to suboptimal code. > > > > For the testcase, > > typedef int __attribute ((mode(TI))) ti_t; long foo(long x) { > > return ((ti_t)x * 19065) >> 64; > > } > > > > we'd like to avoid: > > foo:movq%rdi, %rax > > movl$19065, %edx > > imulq %rdx > > movq%rdx, %rax > > ret > > > > and would prefer: > > foo:movl$19065, %eax > > imulq %rdi > > movq%rdx, %rax > > ret > > > > This patch provides a pair of peephole2 transformations to tweak the > > spills generated by reload, and at the same time replaces the current > > define_expand with define_insn patterns using the new [su]mul_highpart > > RTX codes. I've left the old-style patterns in the machine > > description for the time being, but plan to remove these once my > > planned middle-end improvements make them obsolete. > > > > This patch has been tested on x86_64-pc-linux-gnu (both with and > > without the middle-end changes) by make bootstrap and make -k check > > with no new failures. The new test case, which currently passes, > > ensures that the code we generate isn't adversely affected by changes > > outside > the backend. > > Ok for mainline? > > > > > > 2021-12-10 Roger Sayle > > > > gcc/ChangeLog > > * config/i386/i386.md (any_mul_highpart): New code iterator. > > (sgnprefix, s): Add attribute support for [su]mul_highpart. > > (mul3_highpart): Delete expander. > > (muldi3_highpart, mulsi3_highpart, mulsi32_highpart_zext): > > New define_insn patterns. > > (define_peephole2): Tweak the register allocation for the above > > instructions after reload. > > > > gcc/testsuite/ChangeLog > > * gcc.target/i386/smuldi3_highpart.c: New test case. > > Can you please merge muldi3_highpart and *mulsi3_highpart using > SWI48 mode iterator? > > Uros. > > > > > > > Thanks in advance, > > Roger > > -- > > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 4e9fae8..00361f4 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -992,11 +992,16 @@ ;; Mapping of extend operators (define_code_iterator any_extend [sign_extend zero_extend]) +;; Mapping of highpart multiply operators +(define_code_iterator any_mul_highpart [smul_highpart umul_highpart]) + ;; Prefix for insn menmonic. (define_code_attr sgnprefix [(sign_extend "i") (zero_extend "") +(smul_highpart "i") (umul_highpart "") (div "i") (udiv "")]) ;; Prefix for define_insn -(define_code_attr s [(sign_extend "s") (zero_extend "u")]) +(define_code_attr s [(sign_extend "s") (zero_extend "u") +(smul_highpart "s") (umul_highpart "u")]) (define_code_attr u [(sign_extend "") (zero_extend "u") (div "") (udiv "u")]) (define_code_attr u_bool [(sign_extend "false") (zero_extend "true") @@ -8426,20 +8431,45 @@ (set_attr "bdver1_decode" "direct") (set_attr "mode" "QI")]) -(define_expand "mul3_highpart" - [(parallel [(set (match_operand:DWIH 0 "register_operand") - (truncate:DWIH -(lshiftrt: - (mult: -(any_extend: -
[PING][PATCH, v2, 1/1, AARCH64][PR102768] aarch64: Add compiler support for Shadow Call Stack
Gentile ping for this :), thanks. Link: https://gcc.gnu.org/pipermail/gcc-patches/2021-December/586204.html Shadow Call Stack can be used to protect the return address of a function at runtime, and clang already supports this feature[1]. To enable SCS in user mode, in addition to compiler, other support is also required (as discussed in [2]). This patch only adds basic support for SCS from the compiler side, and provides convenience for users to enable SCS. For linux kernel, only the support of the compiler is required. [1] https://clang.llvm.org/docs/ShadowCallStack.html [2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102768 Signed-off-by: Dan Li gcc/c-family/ChangeLog: * c-attribs.c (handle_no_sanitize_shadow_call_stack_attribute): New. gcc/ChangeLog: * config/aarch64/aarch64-protos.h (aarch64_shadow_call_stack_enabled): New decl. * config/aarch64/aarch64.c (aarch64_shadow_call_stack_enabled): New. (aarch64_expand_prologue): Push x30 onto SCS before it's pushed onto stack. (aarch64_expand_epilogue): Pop x30 frome SCS. * config/aarch64/aarch64.h (TARGET_SUPPORT_SHADOW_CALL_STACK): New macro. (TARGET_CHECK_SCS_RESERVED_REGISTER): Likewise. * config/aarch64/aarch64.md (scs_push): New template. (scs_pop): Likewise. * defaults.h (TARGET_SUPPORT_SHADOW_CALL_STACK):New macro. * doc/extend.texi: Document -fsanitize=shadow-call-stack. * doc/invoke.texi: Document attribute. * flag-types.h (enum sanitize_code):Add SANITIZE_SHADOW_CALL_STACK. * opts-global.c (handle_common_deferred_options): Add SCS compile option check. * opts.c (finish_options): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/shadow_call_stack_1.c: New test. * gcc.target/aarch64/shadow_call_stack_2.c: New test. * gcc.target/aarch64/shadow_call_stack_3.c: New test. * gcc.target/aarch64/shadow_call_stack_4.c: New test. --- gcc/c-family/c-attribs.c | 21 + gcc/config/aarch64/aarch64-protos.h | 1 + gcc/config/aarch64/aarch64.c | 27 +++ gcc/config/aarch64/aarch64.h | 11 + gcc/config/aarch64/aarch64.md | 18 gcc/defaults.h| 4 ++ gcc/doc/extend.texi | 7 +++ gcc/doc/invoke.texi | 29 gcc/flag-types.h | 2 + gcc/opts-global.c | 6 +++ gcc/opts.c| 12 + .../gcc.target/aarch64/shadow_call_stack_1.c | 6 +++ .../gcc.target/aarch64/shadow_call_stack_2.c | 6 +++ .../gcc.target/aarch64/shadow_call_stack_3.c | 45 +++ .../gcc.target/aarch64/shadow_call_stack_4.c | 18 15 files changed, 213 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/shadow_call_stack_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/shadow_call_stack_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/shadow_call_stack_3.c create mode 100644 gcc/testsuite/gcc.target/aarch64/shadow_call_stack_4.c diff --git a/gcc/c-family/c-attribs.c b/gcc/c-family/c-attribs.c index 007b928c54b..9b3a35c06bf 100644 --- a/gcc/c-family/c-attribs.c +++ b/gcc/c-family/c-attribs.c @@ -56,6 +56,8 @@ static tree handle_cold_attribute (tree *, tree, tree, int, bool *); static tree handle_no_sanitize_attribute (tree *, tree, tree, int, bool *); static tree handle_no_sanitize_address_attribute (tree *, tree, tree, int, bool *); +static tree handle_no_sanitize_shadow_call_stack_attribute (tree *, tree, + tree, int, bool *); static tree handle_no_sanitize_thread_attribute (tree *, tree, tree, int, bool *); static tree handle_no_address_safety_analysis_attribute (tree *, tree, tree, @@ -454,6 +456,10 @@ const struct attribute_spec c_common_attribute_table[] = handle_no_sanitize_attribute, NULL }, { "no_sanitize_address",0, 0, true, false, false, false, handle_no_sanitize_address_attribute, NULL }, + { "no_sanitize_shadow_call_stack", + 0, 0, true, false, false, false, + handle_no_sanitize_shadow_call_stack_attribute, + NULL }, { "no_sanitize_thread", 0, 0, true, false, false, false, handle_no_sanitize_thread_attribute, NULL }, { "no_sanitize_undefined", 0, 0, true, false, false, false, @@ -1175,6 +1181,21 @@ handle_no_sanitize_address_attribute (tree *node, tree name, tree, int, return NULL_TREE; } +/* Handle a
[PATCH] rs6000: Replace UNSPECS with ss_plus/us_plus and ss_minus/us_minus
These four UNSPECS seems could be replaced with native RTL, and why "(set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))" in the RTL pattern, per ISA of VSCR bit 127(VECTOR Saturation, SAT): This bit is sticky; that is, once set to 1 it remains set to 1 until it is set to 0 by an mtvscr instruction. The RTL pattern set it to 0 but final ASM doesn't present it? And why not use Clobber VSCR_REGNO instead? Tested pass on P10, OK for master? Thanks. gcc/ChangeLog: * config/rs6000/altivec.md (altivec_vaddus): Replace UNSPEC_VADDU with us_plus. (altivec_vaddss): Replace UNSPEC_VADDS with ss_plus. (altivec_vsubus): Replace UNSPEC_VSUBU with us_minus. (altivec_vsubss): Replace UNSPEC_VSUBS with ss_minus. (altivec_abss_): Likewise. --- gcc/config/rs6000/altivec.md | 29 ++--- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index a057218aa28..b2909857c34 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -29,8 +29,6 @@ (define_c_enum "unspec" UNSPEC_VMHADDSHS UNSPEC_VMHRADDSHS UNSPEC_VADDCUW - UNSPEC_VADDU - UNSPEC_VADDS UNSPEC_VAVGU UNSPEC_VAVGS UNSPEC_VMULEUB @@ -61,8 +59,6 @@ (define_c_enum "unspec" UNSPEC_VSR UNSPEC_VSRO UNSPEC_VSUBCUW - UNSPEC_VSUBU - UNSPEC_VSUBS UNSPEC_VSUM4UBS UNSPEC_VSUM4S UNSPEC_VSUM2SWS @@ -517,9 +513,8 @@ (define_insn "altivec_vaddcuw" (define_insn "altivec_vaddus" [(set (match_operand:VI 0 "register_operand" "=v") -(unspec:VI [(match_operand:VI 1 "register_operand" "v") - (match_operand:VI 2 "register_operand" "v")] - UNSPEC_VADDU)) +(us_plus:VI (match_operand:VI 1 "register_operand" "v") + (match_operand:VI 2 "register_operand" "v"))) (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))] "" "vaddus %0,%1,%2" @@ -527,9 +522,8 @@ (define_insn "altivec_vaddus" (define_insn "altivec_vaddss" [(set (match_operand:VI 0 "register_operand" "=v") -(unspec:VI [(match_operand:VI 1 "register_operand" "v") -(match_operand:VI 2 "register_operand" "v")] - UNSPEC_VADDS)) +(ss_plus:VI (match_operand:VI 1 "register_operand" "v") + (match_operand:VI 2 "register_operand" "v"))) (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))] "VECTOR_UNIT_ALTIVEC_P (mode)" "vaddss %0,%1,%2" @@ -563,9 +557,8 @@ (define_insn "altivec_vsubcuw" (define_insn "altivec_vsubus" [(set (match_operand:VI 0 "register_operand" "=v") -(unspec:VI [(match_operand:VI 1 "register_operand" "v") -(match_operand:VI 2 "register_operand" "v")] - UNSPEC_VSUBU)) +(us_minus:VI (match_operand:VI 1 "register_operand" "v") +(match_operand:VI 2 "register_operand" "v"))) (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))] "VECTOR_UNIT_ALTIVEC_P (mode)" "vsubus %0,%1,%2" @@ -573,9 +566,8 @@ (define_insn "altivec_vsubus" (define_insn "altivec_vsubss" [(set (match_operand:VI 0 "register_operand" "=v") -(unspec:VI [(match_operand:VI 1 "register_operand" "v") -(match_operand:VI 2 "register_operand" "v")] - UNSPEC_VSUBS)) +(ss_minus:VI (match_operand:VI 1 "register_operand" "v") +(match_operand:VI 2 "register_operand" "v"))) (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))] "VECTOR_UNIT_ALTIVEC_P (mode)" "vsubss %0,%1,%2" @@ -3480,9 +3472,8 @@ (define_expand "altivec_absv4sf2" (define_expand "altivec_abss_" [(set (match_dup 2) (vec_duplicate:VI (const_int 0))) (parallel [(set (match_dup 3) - (unspec:VI [(match_dup 2) - (match_operand:VI 1 "register_operand" "v")] - UNSPEC_VSUBS)) + (ss_minus:VI (match_dup 2) + (match_operand:VI 1 "register_operand" "v"))) (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))]) (set (match_operand:VI 0 "register_operand" "=v") -- 2.27.0
Re: [PATCH] x86_64: Ignore zero width bitfields in ABI and issue -Wpsabi warning about C zero width bitfield ABI changes [PR102024]
On Wed, Dec 15, 2021 at 3:50 PM Jakub Jelinek wrote: > > On Mon, Nov 29, 2021 at 05:25:30AM -0700, H.J. Lu wrote: > > > I'd like to ping this patch, but perhaps first it would be nice to discuss > > > it in the x86-64 psABI group. > > > The current psABI doesn't seem to mention zero sized bitfields at all > > > explicitly, so perhaps theoretically they should be treated as INTEGER > > > class, > > > but if they are at positions multiple of 64 bits, then it is unclear into > > > which eightbyte they should be considered, whether the previous one if any > > > or the next one if any. I guess similar problem is for zero sized > > > structures, but those should according to algorithm have NO_CLASS and so > > > it > > > doesn't really make a difference. And, no compiler I'm aware of treats > > > the zero sized bitfields at 64 bit boundaries as INTEGER class, LLVM/ICC > > > are > > > ignoring such bitfields everywhere, GCC ignores them at those boundaries > > > (and used to ignore them in C++ everywhere). I guess my preferred > > > solution > > > would be to say explicitly that zero sized bitfields are NO_CLASS. > > > I'm not a member of the google x86-64 psABI group, can somebody please > > > raise > > > it there? > > > > https://groups.google.com/g/x86-64-abi/c/OYeWs14WHQ4 > > Thanks. > I see nobody commented on Micha's post there. > > Here is a patch that implements it in GCC, i.e. C++ doesn't change ABI (at > least > not from the past few releases) and C does for GCC: > > 2021-12-15 Jakub Jelinek > > PR target/102024 > * config/i386/i386.c (classify_argument): Add zero_width_bitfields > argument, when seeing DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD bitfields, > always ignore them, when seeing other zero sized bitfields, either > set zero_width_bitfields to 1 and ignore it or if equal to 2 process > it. Pass it to recursive calls. Add wrapper > with old arguments and diagnose ABI differences for C structures > with zero width bitfields. Formatting fixes. > > * gcc.target/i386/pr102024.c: New test. > * g++.target/i386/pr102024.C: New test. Please get a signoff on the ABI change (perhaps HJ can help here), I'll approve the implementation after that. Uros. > > --- gcc/config/i386/i386.c.jj 2021-12-10 17:00:06.024369219 +0100 > +++ gcc/config/i386/i386.c 2021-12-15 15:04:49.245148023 +0100 > @@ -2065,7 +2065,8 @@ merge_classes (enum x86_64_reg_class cla > > static int > classify_argument (machine_mode mode, const_tree type, > - enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset) > + enum x86_64_reg_class classes[MAX_CLASSES], int bit_offset, > + int _width_bitfields) > { >HOST_WIDE_INT bytes > = mode == BLKmode ? int_size_in_bytes (type) : (int) GET_MODE_SIZE > (mode); > @@ -2123,6 +2124,16 @@ classify_argument (machine_mode mode, co > misaligned integers. */ > if (DECL_BIT_FIELD (field)) > { > + if (integer_zerop (DECL_SIZE (field))) > + { > + if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field)) > + continue; > + if (zero_width_bitfields != 2) > + { > + zero_width_bitfields = 1; > + continue; > + } > + } > for (i = (int_bit_position (field) > + (bit_offset % 64)) / 8 / 8; >i < ((int_bit_position (field) + (bit_offset % 64)) > @@ -2160,7 +2171,8 @@ classify_argument (machine_mode mode, co > num = classify_argument (TYPE_MODE (type), type, >subclasses, >(int_bit_position (field) > - + bit_offset) % 512); > + + bit_offset) % 512, > + zero_width_bitfields); > if (!num) > return 0; > pos = (int_bit_position (field) > @@ -2178,7 +2190,8 @@ classify_argument (machine_mode mode, co > { > int num; > num = classify_argument (TYPE_MODE (TREE_TYPE (type)), > -TREE_TYPE (type), subclasses, > bit_offset); > +TREE_TYPE (type), subclasses, bit_offset, > +zero_width_bitfields); > if (!num) > return 0; > > @@ -2211,7 +2224,7 @@ classify_argument (machine_mode mode, co > > num = classify_argument (TYPE_MODE (TREE_TYPE (field)), >TREE_TYPE
Re: [PATCH] ix86: Don't match the 'm' constraint on x86_64_general_operand
On Sun, Dec 19, 2021 at 9:06 PM H.J. Lu wrote: > > x86_64_general_operand is different from general_operand for 64-bit > target. To avoid LRA selecting a memory operand which doesn't satisfy > x86_64_general_operand for 64-bit target: > > 1. Add a 'BM' constraint which is similar to the 'm' constraint, but > checks x86_64_general_operand for integers > 16 bits. > 2. Replace the 'm' constraint on x86_64_general_operand with the 'BM' > constraint. > > gcc/ > > PR target/103762 > * config/i386/constraints.md (BM): New constraint. > * config/i386/i386.md: Replace the 'm' constraint on > x86_64_general_operand with the 'BM' constraint. > * config/i386/predicates.md (ix86_memory_operand): New predicate. > > gcc/testsuite/ > > * gcc.target/i386/pr103762-1a.c: New test. > * gcc.target/i386/pr103762-1b.c: Likewise. > * gcc.target/i386/pr103762-1c.c: Likewise. > --- > gcc/config/i386/constraints.md | 5 + > gcc/config/i386/i386.md | 64 +- > gcc/config/i386/predicates.md | 6 + > gcc/testsuite/gcc.target/i386/pr103762-1a.c | 647 > gcc/testsuite/gcc.target/i386/pr103762-1b.c | 7 + > gcc/testsuite/gcc.target/i386/pr103762-1c.c | 7 + > 6 files changed, 704 insertions(+), 32 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr103762-1a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103762-1b.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr103762-1c.c > > diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md > index 6b291a02d88..6677285e518 100644 > --- a/gcc/config/i386/constraints.md > +++ b/gcc/config/i386/constraints.md > @@ -168,6 +168,7 @@ > ;; z Constant call address operand. > ;; C Integer SSE constant with all bits set operand. > ;; F Floating-point SSE constant with all bits set operand. > +;; M ix86 memory operand. > > (define_constraint "Bf" >"@internal Flags register operand." > @@ -232,6 +233,10 @@ >(and (match_test "TARGET_SSE") > (match_operand 0 "float_vector_all_ones_operand"))) > > +(define_constraint "BM" > + "@internal ix86 memory operand." > + (match_operand 0 "ix86_memory_operand")) > + > ;; Integer constant constraints. > (define_constraint "Wb" >"Integer constant in the range 0 @dots{} 7, for 8-bit shifts." > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index d25453fe574..4f30819a1d9 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -1385,7 +1385,7 @@ > (define_insn "*cmp_1" >[(set (reg FLAGS_REG) > (compare (match_operand:SWI 0 "nonimmediate_operand" "m,") > -(match_operand:SWI 1 "" ",m")))] > +(match_operand:SWI 1 "" ",BM")))] >"ix86_match_ccmode (insn, CCmode)" >"cmp{}\t{%1, %0|%0, %1}" >[(set_attr "type" "icmp") > @@ -1395,7 +1395,7 @@ >[(set (reg FLAGS_REG) > (compare > (minus:SWI (match_operand:SWI 0 "nonimmediate_operand" "m,") > -(match_operand:SWI 1 "" ",m")) > +(match_operand:SWI 1 "" ",BM")) > (const_int 0)))] >"ix86_match_ccmode (insn, CCGOCmode)" >"cmp{}\t{%1, %0|%0, %1}" > @@ -5653,7 +5653,7 @@ >[(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,r,r") > (plus:SWI48 > (match_operand:SWI48 1 "nonimmediate_operand" "%0,0,r,r") > - (match_operand:SWI48 2 "x86_64_general_operand" "re,m,0,le"))) > + (match_operand:SWI48 2 "x86_64_general_operand" "re,BM,0,le"))) > (clobber (reg:CC FLAGS_REG))] >"ix86_binary_operator_ok (PLUS, mode, operands)" > { > @@ -5709,7 +5709,7 @@ >[(set (match_operand:DI 0 "register_operand" "=r,r,r") > (zero_extend:DI > (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,r,r") > - (match_operand:SI 2 "x86_64_general_operand" "rme,0,le" > + (match_operand:SI 2 "x86_64_general_operand" > "rBMe,0,le" > (clobber (reg:CC FLAGS_REG))] >"TARGET_64BIT && ix86_binary_operator_ok (PLUS, SImode, operands)" > { > @@ -5967,7 +5967,7 @@ > (compare > (plus:SWI > (match_operand:SWI 1 "nonimmediate_operand" "%0,0,") > - (match_operand:SWI 2 "" ",m,0")) > + (match_operand:SWI 2 "" ",BM,0")) > (const_int 0))) > (set (match_operand:SWI 0 "nonimmediate_operand" "=m,,") > (plus:SWI (match_dup 1) (match_dup 2)))] > @@ -6012,7 +6012,7 @@ >[(set (reg FLAGS_REG) > (compare > (plus:SI (match_operand:SI 1 "nonimmediate_operand" "%0,r") > - (match_operand:SI 2 "x86_64_general_operand" "rme,0")) > + (match_operand:SI 2 "x86_64_general_operand" "rBMe,0")) > (const_int 0))) > (set (match_operand:DI 0 "register_operand" "=r,r") > (zero_extend:DI (plus:SI (match_dup 1) (match_dup 2] > @@ -6097,7