Pushed to trunk now.
On Thu, 20 Jun 2024 at 16:28, Jonathan Wakely <jwak...@redhat.com> wrote:
>
> I think the new conditions are correct. They're certainly an improvment
> on just checking __is_scalar without considering what we're assigning it
> to.
>
> Tested x86_64-linux.
>
> -- >8 --
>
> As noted in the PR, the optimization used for scalar types in std::fill
> and std::fill_n is non-conforming, because it doesn't consider that
> assigning a scalar type might have non-trivial side effects which are
> affected by the optimization.
>
> By changing the condition under which the optimization is done we ensure
> it's only performed when safe to do so, and we also enable it for
> additional types, which was the original subject of the PR.
>
> Instead of two overloads using __enable_if<__is_scalar<T>::__value, R>
> we can combine them into one and create a local variable which is either
> a local copy of __value or another reference to it, depending on whether
> the optimization is allowed.
>
> This removes a use of std::__is_scalar, which is a step towards fixing
> PR 115497 by removing std::__is_pointer from <bits/cpp_type_traits.h>
>
> libstdc++-v3/ChangeLog:
>
> PR libstdc++/109150
> * include/bits/stl_algobase.h (__fill_a1): Combine the
> !__is_scalar and __is_scalar overloads into one and rewrite the
> condition used to decide whether to perform the load outside the
> loop.
> * testsuite/25_algorithms/fill/109150.cc: New test.
> * testsuite/25_algorithms/fill_n/109150.cc: New test.
> ---
> libstdc++-v3/include/bits/stl_algobase.h | 75 ++++++++++++-------
> .../testsuite/25_algorithms/fill/109150.cc | 62 +++++++++++++++
> .../testsuite/25_algorithms/fill_n/109150.cc | 62 +++++++++++++++
> 3 files changed, 171 insertions(+), 28 deletions(-)
> create mode 100644 libstdc++-v3/testsuite/25_algorithms/fill/109150.cc
> create mode 100644 libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc
>
> diff --git a/libstdc++-v3/include/bits/stl_algobase.h
> b/libstdc++-v3/include/bits/stl_algobase.h
> index d831e0e9883..1a0f8c14073 100644
> --- a/libstdc++-v3/include/bits/stl_algobase.h
> +++ b/libstdc++-v3/include/bits/stl_algobase.h
> @@ -929,28 +929,39 @@ _GLIBCXX_END_NAMESPACE_CONTAINER
> #define _GLIBCXX_MOVE_BACKWARD3(_Tp, _Up, _Vp) std::copy_backward(_Tp, _Up,
> _Vp)
> #endif
>
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wc++17-extensions"
> template<typename _ForwardIterator, typename _Tp>
> _GLIBCXX20_CONSTEXPR
> - inline typename
> - __gnu_cxx::__enable_if<!__is_scalar<_Tp>::__value, void>::__type
> + inline void
> __fill_a1(_ForwardIterator __first, _ForwardIterator __last,
> const _Tp& __value)
> {
> - for (; __first != __last; ++__first)
> - *__first = __value;
> - }
> + // We can optimize this loop by moving the load from __value outside
> + // the loop, but only if we know that making that copy is trivial,
> + // and the assignment in the loop is also trivial (so that the identity
> + // of the operand doesn't matter).
> + const bool __load_outside_loop =
> +#if __has_builtin(__is_trivially_constructible) \
> + && __has_builtin(__is_trivially_assignable)
> + __is_trivially_constructible(_Tp, const _Tp&)
> + && __is_trivially_assignable(__decltype(*__first), const _Tp&)
> +#else
> + __is_trivially_copyable(_Tp)
> + && __is_same(_Tp, __typeof__(*__first))
> +#endif
> + && sizeof(_Tp) <= sizeof(long long);
>
> - template<typename _ForwardIterator, typename _Tp>
> - _GLIBCXX20_CONSTEXPR
> - inline typename
> - __gnu_cxx::__enable_if<__is_scalar<_Tp>::__value, void>::__type
> - __fill_a1(_ForwardIterator __first, _ForwardIterator __last,
> - const _Tp& __value)
> - {
> - const _Tp __tmp = __value;
> + // When the condition is true, we use a copy of __value,
> + // otherwise we just use another reference.
> + typedef typename __gnu_cxx::__conditional_type<__load_outside_loop,
> + const _Tp,
> + const _Tp&>::__type _Up;
> + _Up __val(__value);
> for (; __first != __last; ++__first)
> - *__first = __tmp;
> + *__first = __val;
> }
> +#pragma GCC diagnostic pop
>
> // Specialization: for char types we can use memset.
> template<typename _Tp>
> @@ -1079,28 +1090,36 @@ _GLIBCXX_END_NAMESPACE_CONTAINER
> __size_to_integer(__float128 __n) { return (long long)__n; }
> #endif
>
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wc++17-extensions"
> template<typename _OutputIterator, typename _Size, typename _Tp>
> _GLIBCXX20_CONSTEXPR
> - inline typename
> - __gnu_cxx::__enable_if<!__is_scalar<_Tp>::__value,
> _OutputIterator>::__type
> + inline _OutputIterator
> __fill_n_a1(_OutputIterator __first, _Size __n, const _Tp& __value)
> {
> - for (; __n > 0; --__n, (void) ++__first)
> - *__first = __value;
> - return __first;
> - }
> + // See std::__fill_a1 for explanation of this condition.
> + const bool __load_outside_loop =
> +#if __has_builtin(__is_trivially_constructible) \
> + && __has_builtin(__is_trivially_assignable)
> + __is_trivially_constructible(_Tp, const _Tp&)
> + && __is_trivially_assignable(__decltype(*__first), const _Tp&)
> +#else
> + __is_trivially_copyable(_Tp)
> + && __is_same(_Tp, __typeof__(*__first))
> +#endif
> + && sizeof(_Tp) <= sizeof(long long);
>
> - template<typename _OutputIterator, typename _Size, typename _Tp>
> - _GLIBCXX20_CONSTEXPR
> - inline typename
> - __gnu_cxx::__enable_if<__is_scalar<_Tp>::__value,
> _OutputIterator>::__type
> - __fill_n_a1(_OutputIterator __first, _Size __n, const _Tp& __value)
> - {
> - const _Tp __tmp = __value;
> + // When the condition is true, we use a copy of __value,
> + // otherwise we just use another reference.
> + typedef typename __gnu_cxx::__conditional_type<__load_outside_loop,
> + const _Tp,
> + const _Tp&>::__type _Up;
> + _Up __val(__value);
> for (; __n > 0; --__n, (void) ++__first)
> - *__first = __tmp;
> + *__first = __val;
> return __first;
> }
> +#pragma GCC diagnostic pop
>
> template<typename _Ite, typename _Seq, typename _Cat, typename _Size,
> typename _Tp>
> diff --git a/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc
> b/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc
> new file mode 100644
> index 00000000000..9491a998ff0
> --- /dev/null
> +++ b/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc
> @@ -0,0 +1,62 @@
> +// { dg-do run }
> +
> +// Test the problematic cases identified in PR libstdc++/109150
> +// where the previous std::fill was non-conforming.
> +
> +#include <algorithm>
> +#include <testsuite_hooks.h>
> +
> +const int global = 0;
> +
> +struct X {
> + void operator=(const int& i) { VERIFY(&i == &global); }
> +};
> +
> +void
> +test_identity_matters()
> +{
> + X x;
> + // Assigning int to X has non-trivial side effects, so we cannot
> + // hoist the load outside the loop, we have to do exactly what the
> + // standard says to do.
> + std::fill(&x, &x+1, global);
> +}
> +
> +struct Y {
> + int i;
> + void operator=(int ii) { i = ii + 1; }
> +};
> +
> +void
> +test_self_aliasing()
> +{
> + Y y[2] = { };
> + // Assigning int to X has non-trivial side effects, altering the value
> + // used to fill the later elements. Must not load it outside the loop.
> + std::fill(y, y+2, y[0].i);
> + VERIFY(y[1].i == 2);
> +}
> +
> +struct Z
> +{
> + Z() { }
> +#if __cplusplus >= 201103L
> + explicit Z(const Z&) = default;
> +#endif
> +};
> +
> +void
> +test_explicit_copy_ctor()
> +{
> + Z z;
> + // The optimization that copies the fill value must use
> direct-initialization
> + // otherwise this case would be ill-formed due to the explicit constructor.
> + std::fill(&z, &z, z);
> +}
> +
> +int main()
> +{
> + test_identity_matters();
> + test_self_aliasing();
> + test_explicit_copy_ctor();
> +}
> diff --git a/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc
> b/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc
> new file mode 100644
> index 00000000000..13bb31c9344
> --- /dev/null
> +++ b/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc
> @@ -0,0 +1,62 @@
> +// { dg-do run }
> +
> +// Test the problematic cases identified in PR libstdc++/109150
> +// where the previous std::fill_n was non-conforming.
> +
> +#include <algorithm>
> +#include <testsuite_hooks.h>
> +
> +const int global = 0;
> +
> +struct X {
> + void operator=(const int& i) { VERIFY(&i == &global); }
> +};
> +
> +void
> +test_identity_matters()
> +{
> + X x;
> + // Assigning int to X has non-trivial side effects, so we cannot
> + // hoist the load outside the loop, we have to do exactly what the
> + // standard says to do.
> + std::fill_n(&x, 1, global);
> +}
> +
> +struct Y {
> + int i;
> + void operator=(int ii) { i = ii + 1; }
> +};
> +
> +void
> +test_self_aliasing()
> +{
> + Y y[2] = { };
> + // Assigning int to X has non-trivial side effects, altering the value
> + // used to fill the later elements. Must not load it outside the loop.
> + std::fill_n(y, 2, y[0].i);
> + VERIFY(y[1].i == 2);
> +}
> +
> +struct Z
> +{
> + Z() { }
> +#if __cplusplus >= 201103L
> + explicit Z(const Z&) = default;
> +#endif
> +};
> +
> +void
> +test_explicit_copy_ctor()
> +{
> + Z z;
> + // The optimization that copies the fill value must use
> direct-initialization
> + // otherwise this case would be ill-formed due to the explicit constructor.
> + std::fill_n(&z, 1, z);
> +}
> +
> +int main()
> +{
> + test_identity_matters();
> + test_self_aliasing();
> + test_explicit_copy_ctor();
> +}
> --
> 2.45.2
>