Pushed to trunk now.

On Thu, 20 Jun 2024 at 16:28, Jonathan Wakely <jwak...@redhat.com> wrote:
>
> I think the new conditions are correct. They're certainly an improvment
> on just checking __is_scalar without considering what we're assigning it
> to.
>
> Tested x86_64-linux.
>
> -- >8 --
>
> As noted in the PR, the optimization used for scalar types in std::fill
> and std::fill_n is non-conforming, because it doesn't consider that
> assigning a scalar type might have non-trivial side effects which are
> affected by the optimization.
>
> By changing the condition under which the optimization is done we ensure
> it's only performed when safe to do so, and we also enable it for
> additional types, which was the original subject of the PR.
>
> Instead of two overloads using __enable_if<__is_scalar<T>::__value, R>
> we can combine them into one and create a local variable which is either
> a local copy of __value or another reference to it, depending on whether
> the optimization is allowed.
>
> This removes a use of std::__is_scalar, which is a step towards fixing
> PR 115497 by removing std::__is_pointer from <bits/cpp_type_traits.h>
>
> libstdc++-v3/ChangeLog:
>
>         PR libstdc++/109150
>         * include/bits/stl_algobase.h (__fill_a1): Combine the
>         !__is_scalar and __is_scalar overloads into one and rewrite the
>         condition used to decide whether to perform the load outside the
>         loop.
>         * testsuite/25_algorithms/fill/109150.cc: New test.
>         * testsuite/25_algorithms/fill_n/109150.cc: New test.
> ---
>  libstdc++-v3/include/bits/stl_algobase.h      | 75 ++++++++++++-------
>  .../testsuite/25_algorithms/fill/109150.cc    | 62 +++++++++++++++
>  .../testsuite/25_algorithms/fill_n/109150.cc  | 62 +++++++++++++++
>  3 files changed, 171 insertions(+), 28 deletions(-)
>  create mode 100644 libstdc++-v3/testsuite/25_algorithms/fill/109150.cc
>  create mode 100644 libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc
>
> diff --git a/libstdc++-v3/include/bits/stl_algobase.h 
> b/libstdc++-v3/include/bits/stl_algobase.h
> index d831e0e9883..1a0f8c14073 100644
> --- a/libstdc++-v3/include/bits/stl_algobase.h
> +++ b/libstdc++-v3/include/bits/stl_algobase.h
> @@ -929,28 +929,39 @@ _GLIBCXX_END_NAMESPACE_CONTAINER
>  #define _GLIBCXX_MOVE_BACKWARD3(_Tp, _Up, _Vp) std::copy_backward(_Tp, _Up, 
> _Vp)
>  #endif
>
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wc++17-extensions"
>    template<typename _ForwardIterator, typename _Tp>
>      _GLIBCXX20_CONSTEXPR
> -    inline typename
> -    __gnu_cxx::__enable_if<!__is_scalar<_Tp>::__value, void>::__type
> +    inline void
>      __fill_a1(_ForwardIterator __first, _ForwardIterator __last,
>               const _Tp& __value)
>      {
> -      for (; __first != __last; ++__first)
> -       *__first = __value;
> -    }
> +      // We can optimize this loop by moving the load from __value outside
> +      // the loop, but only if we know that making that copy is trivial,
> +      // and the assignment in the loop is also trivial (so that the identity
> +      // of the operand doesn't matter).
> +      const bool __load_outside_loop =
> +#if __has_builtin(__is_trivially_constructible) \
> +      && __has_builtin(__is_trivially_assignable)
> +           __is_trivially_constructible(_Tp, const _Tp&)
> +           && __is_trivially_assignable(__decltype(*__first), const _Tp&)
> +#else
> +           __is_trivially_copyable(_Tp)
> +           && __is_same(_Tp, __typeof__(*__first))
> +#endif
> +           && sizeof(_Tp) <= sizeof(long long);
>
> -  template<typename _ForwardIterator, typename _Tp>
> -    _GLIBCXX20_CONSTEXPR
> -    inline typename
> -    __gnu_cxx::__enable_if<__is_scalar<_Tp>::__value, void>::__type
> -    __fill_a1(_ForwardIterator __first, _ForwardIterator __last,
> -             const _Tp& __value)
> -    {
> -      const _Tp __tmp = __value;
> +      // When the condition is true, we use a copy of __value,
> +      // otherwise we just use another reference.
> +      typedef typename __gnu_cxx::__conditional_type<__load_outside_loop,
> +                                                    const _Tp,
> +                                                    const _Tp&>::__type _Up;
> +      _Up __val(__value);
>        for (; __first != __last; ++__first)
> -       *__first = __tmp;
> +       *__first = __val;
>      }
> +#pragma GCC diagnostic pop
>
>    // Specialization: for char types we can use memset.
>    template<typename _Tp>
> @@ -1079,28 +1090,36 @@ _GLIBCXX_END_NAMESPACE_CONTAINER
>    __size_to_integer(__float128 __n) { return (long long)__n; }
>  #endif
>
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wc++17-extensions"
>    template<typename _OutputIterator, typename _Size, typename _Tp>
>      _GLIBCXX20_CONSTEXPR
> -    inline typename
> -    __gnu_cxx::__enable_if<!__is_scalar<_Tp>::__value, 
> _OutputIterator>::__type
> +    inline _OutputIterator
>      __fill_n_a1(_OutputIterator __first, _Size __n, const _Tp& __value)
>      {
> -      for (; __n > 0; --__n, (void) ++__first)
> -       *__first = __value;
> -      return __first;
> -    }
> +      // See std::__fill_a1 for explanation of this condition.
> +      const bool __load_outside_loop =
> +#if __has_builtin(__is_trivially_constructible) \
> +      && __has_builtin(__is_trivially_assignable)
> +           __is_trivially_constructible(_Tp, const _Tp&)
> +           && __is_trivially_assignable(__decltype(*__first), const _Tp&)
> +#else
> +           __is_trivially_copyable(_Tp)
> +           && __is_same(_Tp, __typeof__(*__first))
> +#endif
> +           && sizeof(_Tp) <= sizeof(long long);
>
> -  template<typename _OutputIterator, typename _Size, typename _Tp>
> -    _GLIBCXX20_CONSTEXPR
> -    inline typename
> -    __gnu_cxx::__enable_if<__is_scalar<_Tp>::__value, 
> _OutputIterator>::__type
> -    __fill_n_a1(_OutputIterator __first, _Size __n, const _Tp& __value)
> -    {
> -      const _Tp __tmp = __value;
> +      // When the condition is true, we use a copy of __value,
> +      // otherwise we just use another reference.
> +      typedef typename __gnu_cxx::__conditional_type<__load_outside_loop,
> +                                                    const _Tp,
> +                                                    const _Tp&>::__type _Up;
> +      _Up __val(__value);
>        for (; __n > 0; --__n, (void) ++__first)
> -       *__first = __tmp;
> +       *__first = __val;
>        return __first;
>      }
> +#pragma GCC diagnostic pop
>
>    template<typename _Ite, typename _Seq, typename _Cat, typename _Size,
>            typename _Tp>
> diff --git a/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc 
> b/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc
> new file mode 100644
> index 00000000000..9491a998ff0
> --- /dev/null
> +++ b/libstdc++-v3/testsuite/25_algorithms/fill/109150.cc
> @@ -0,0 +1,62 @@
> +// { dg-do run }
> +
> +// Test the problematic cases identified in PR libstdc++/109150
> +// where the previous std::fill was non-conforming.
> +
> +#include <algorithm>
> +#include <testsuite_hooks.h>
> +
> +const int global = 0;
> +
> +struct X {
> +  void operator=(const int& i) { VERIFY(&i == &global); }
> +};
> +
> +void
> +test_identity_matters()
> +{
> +  X x;
> +  // Assigning int to X has non-trivial side effects, so we cannot
> +  // hoist the load outside the loop, we have to do exactly what the
> +  // standard says to do.
> +  std::fill(&x, &x+1, global);
> +}
> +
> +struct Y {
> +  int i;
> +  void operator=(int ii) { i = ii + 1; }
> +};
> +
> +void
> +test_self_aliasing()
> +{
> +  Y y[2] = { };
> +  // Assigning int to X has non-trivial side effects, altering the value
> +  // used to fill the later elements. Must not load it outside the loop.
> +  std::fill(y, y+2, y[0].i);
> +  VERIFY(y[1].i == 2);
> +}
> +
> +struct Z
> +{
> +  Z() { }
> +#if __cplusplus >= 201103L
> +  explicit Z(const Z&) = default;
> +#endif
> +};
> +
> +void
> +test_explicit_copy_ctor()
> +{
> +  Z z;
> +  // The optimization that copies the fill value must use 
> direct-initialization
> +  // otherwise this case would be ill-formed due to the explicit constructor.
> +  std::fill(&z, &z, z);
> +}
> +
> +int main()
> +{
> +  test_identity_matters();
> +  test_self_aliasing();
> +  test_explicit_copy_ctor();
> +}
> diff --git a/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc 
> b/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc
> new file mode 100644
> index 00000000000..13bb31c9344
> --- /dev/null
> +++ b/libstdc++-v3/testsuite/25_algorithms/fill_n/109150.cc
> @@ -0,0 +1,62 @@
> +// { dg-do run }
> +
> +// Test the problematic cases identified in PR libstdc++/109150
> +// where the previous std::fill_n was non-conforming.
> +
> +#include <algorithm>
> +#include <testsuite_hooks.h>
> +
> +const int global = 0;
> +
> +struct X {
> +  void operator=(const int& i) { VERIFY(&i == &global); }
> +};
> +
> +void
> +test_identity_matters()
> +{
> +  X x;
> +  // Assigning int to X has non-trivial side effects, so we cannot
> +  // hoist the load outside the loop, we have to do exactly what the
> +  // standard says to do.
> +  std::fill_n(&x, 1, global);
> +}
> +
> +struct Y {
> +  int i;
> +  void operator=(int ii) { i = ii + 1; }
> +};
> +
> +void
> +test_self_aliasing()
> +{
> +  Y y[2] = { };
> +  // Assigning int to X has non-trivial side effects, altering the value
> +  // used to fill the later elements. Must not load it outside the loop.
> +  std::fill_n(y, 2, y[0].i);
> +  VERIFY(y[1].i == 2);
> +}
> +
> +struct Z
> +{
> +  Z() { }
> +#if __cplusplus >= 201103L
> +  explicit Z(const Z&) = default;
> +#endif
> +};
> +
> +void
> +test_explicit_copy_ctor()
> +{
> +  Z z;
> +  // The optimization that copies the fill value must use 
> direct-initialization
> +  // otherwise this case would be ill-formed due to the explicit constructor.
> +  std::fill_n(&z, 1, z);
> +}
> +
> +int main()
> +{
> +  test_identity_matters();
> +  test_self_aliasing();
> +  test_explicit_copy_ctor();
> +}
> --
> 2.45.2
>

Reply via email to