Reproducer

% clang++-16 -std=c++17 -Wfatal-errors -Wall -Wextra -Werror -O1 -o
fails 
'-DHWY_DISABLED_TARGETS=(HWY_NEON|HWY_SVE|HWY_SVE2|HWY_SVE_256|HWY_SVE2_128)'
math_test4.cc -lhwy -lhwy_contrib -lhwy_test
% valgrind ./fails
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_

// Memory allocator with support for alignment and offsets.

#include <memory>
#include <utility>

#include "hwy/base.h"

namespace hwy {

// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
// requires a literal. This matches typical L1 cache line sizes, which prevents
// false sharing.
#define HWY_ALIGNMENT 64

// Pointers to functions equivalent to malloc/free with an opaque void* passed
// to them.
// AllocPtr receives the caller-supplied `opaque` pointer and the number of
// `bytes` requested; FreePtr receives `opaque` and the `memory` to release.
using AllocPtr = void* (*)(void* opaque, size_t bytes);
using FreePtr = void (*)(void* opaque, void* memory);

// Returns null or a pointer to at least `payload_size` (which can be zero)
// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
// the vector size. Calls `alloc_ptr` with the passed `opaque_ptr` to obtain
// memory, or malloc() if `alloc_ptr` is null.
HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
                                         AllocPtr alloc_ptr, void* opaque_ptr);

// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
// must have been returned from a previous call to `AllocateAlignedBytes`.
// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
// `free_ptr` function is null, uses the default free().
HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
                                    FreePtr free_ptr, void* opaque_ptr);

// Class that deletes the aligned pointer passed to operator() calling the
// destructor before freeing the pointer. This is equivalent to the
// std::default_delete but for aligned objects. For a similar deleter equivalent
// to free() for aligned memory see AlignedFreer().
class AlignedDeleter {
 public:
  AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {}
  AlignedDeleter(FreePtr free_ptr, void* opaque_ptr)
      : free_(free_ptr), opaque_ptr_(opaque_ptr) {}

  template <typename T>
  void operator()(T* aligned_pointer) const {
    return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_,
                              TypedArrayDeleter<T>);
  }

 private:
  template <typename T>
  static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) {
    size_t elems = size_in_bytes / sizeof(T);
    for (size_t i = 0; i < elems; i++) {
      // Explicitly call the destructor on each element.
      (static_cast<T*>(ptr) + i)->~T();
    }
  }

  // Function prototype that calls the destructor for each element in a typed
  // array. TypeArrayDeleter<T> would match this prototype.
  using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);

  HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer,
                                               FreePtr free_ptr,
                                               void* opaque_ptr,
                                               ArrayDeleter deleter);

  FreePtr free_;
  void* opaque_ptr_;
};

// Unique pointer to T with custom aligned deleter. T can be a single element U
// or, if T is U[], an array of elements. The custom aligned deleter will call
// the destructor on U, or on each element of the array in the U[] case.
template <typename T>
using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;

// Aligned memory equivalent of make_unique<T> using the custom allocators
// alloc/free with the passed `opaque` pointer. Constructs a T from `args` in
// the aligned storage; the destructor of the object is called when the
// AlignedUniquePtr is destroyed. Returns an empty (null) AlignedUniquePtr if
// the allocation fails.
template <typename T, typename... Args>
AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free,
                                               void* opaque, Args&&... args) {
  void* mem = AllocateAlignedBytes(sizeof(T), alloc, opaque);
  if (mem == nullptr) {
    // AllocateAlignedBytes may return null; placement new on a null pointer is
    // undefined behavior, so report the failure instead of constructing.
    return AlignedUniquePtr<T>(static_cast<T*>(nullptr),
                               AlignedDeleter(free, opaque));
  }
  return AlignedUniquePtr<T>(new (mem) T(std::forward<Args>(args)...),
                             AlignedDeleter(free, opaque));
}

// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free
// functions. Returns an empty (null) AlignedUniquePtr if the allocation fails.
template <typename T, typename... Args>
AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
  void* mem = AllocateAlignedBytes(sizeof(T), /*alloc_ptr=*/nullptr,
                                   /*opaque_ptr=*/nullptr);
  if (mem == nullptr) {
    // Placement new on a null pointer is undefined behavior; hand back an
    // empty pointer so the caller can detect the failed allocation.
    return AlignedUniquePtr<T>(static_cast<T*>(nullptr), AlignedDeleter());
  }
  return AlignedUniquePtr<T>(new (mem) T(std::forward<Args>(args)...),
                             AlignedDeleter());
}

// Helpers for array allocators (avoids overflow)
namespace detail {

// Returns the number of times n can be halved before reaching 1 (0 for n <= 1).
// For a power of two n this is the x with (1u << x) == n.
static inline constexpr size_t ShiftCount(size_t n) {
  return (n > 1) ? ShiftCount(n >> 1) + 1 : 0;
}

// Obtains aligned storage for `items` objects of type T from
// AllocateAlignedBytes without constructing them. Returns nullptr if
// `items * sizeof(T)` does not fit in size_t; otherwise returns whatever
// AllocateAlignedBytes returns.
template <typename T>
T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
  constexpr size_t kItemSize = sizeof(T);
  constexpr bool kIsPow2 = (kItemSize & (kItemSize - 1)) == 0;
  constexpr size_t kShift = ShiftCount(kItemSize);
  static_assert(!kIsPow2 || (1ull << kShift) == kItemSize,
                "ShiftCount is incorrect");

  // Scale by sizeof(T), using a shift when the size is a power of two.
  const size_t total_bytes =
      kIsPow2 ? (items << kShift) : (items * kItemSize);
  // Undo the scaling; a mismatch means the computation above wrapped around.
  const size_t recovered =
      kIsPow2 ? (total_bytes >> kShift) : (total_bytes / kItemSize);
  if (recovered == items) {
    return static_cast<T*>(
        AllocateAlignedBytes(total_bytes, alloc_ptr, opaque_ptr));
  }
  return nullptr;  // overflowed
}

}  // namespace detail

// Aligned memory equivalent of make_unique<T[]> for array types using the
// custom allocators alloc/free. This function calls the constructor with the
// passed Args... on every created item. The destructor of each element will be
// called when the AlignedUniquePtr is destroyed. Returns a null pointer (with
// the deleter still attached) if the allocation fails or the size overflows.
template <typename T, typename... Args>
AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
    size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
  T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque);
  if (ptr != nullptr) {
    for (size_t i = 0; i < items; i++) {
      // Pass `args` as lvalues rather than forwarding them: forwarding inside
      // the loop would let the first element's constructor move from rvalue
      // arguments, so later elements would be built from moved-from values.
      new (ptr + i) T(args...);
    }
  }
  return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
}

// Same as MakeUniqueAlignedArrayWithAlloc, but passes null for the allocation
// function, the free function and the opaque pointer.
template <typename T, typename... Args>
AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) {
  const AllocPtr default_alloc = nullptr;
  const FreePtr default_free = nullptr;
  void* const no_opaque = nullptr;
  return MakeUniqueAlignedArrayWithAlloc<T, Args...>(
      items, default_alloc, default_free, no_opaque,
      std::forward<Args>(args)...);
}

// Custom deleter for std::unique_ptr equivalent to using free() as a deleter
// but for aligned memory.
class AlignedFreer {
 public:
  // Pass address of this to ctor to skip deleting externally-owned memory.
  static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {}

  AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {}
  AlignedFreer(FreePtr free_ptr, void* opaque_ptr)
      : free_(free_ptr), opaque_ptr_(opaque_ptr) {}

  template <typename T>
  void operator()(T* aligned_pointer) const {
    // TODO(deymo): assert that we are using a POD type T.
    FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
  }

 private:
  FreePtr free_;
  void* opaque_ptr_;
};

// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD
// data use AlignedUniquePtr.
template <typename T>
using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;

// Returns a unique_ptr owning an aligned, uninitialized array of `items` POD
// values obtained through `alloc`; the array is released via `free` (both are
// given `opaque`) when the unique_ptr is destroyed. The pointer is null if
// detail::AllocateAlignedItems returns null.
template <typename T>
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
                                          FreePtr free, void* opaque) {
  T* const storage = detail::AllocateAlignedItems<T>(items, alloc, opaque);
  return AlignedFreeUniquePtr<T[]>(storage, AlignedFreer(free, opaque));
}

// Overload of AllocateAligned() that passes null for the allocation function,
// the free function and the opaque pointer, i.e. uses the defaults.
template <typename T>
AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
  const AllocPtr default_alloc = nullptr;
  const FreePtr default_free = nullptr;
  void* const no_opaque = nullptr;
  return AllocateAligned<T>(items, default_alloc, default_free, no_opaque);
}

}  // namespace hwy
#endif  // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Main header required before using vector types.

// IWYU pragma: begin_exports
#include "hwy/base.h"
#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"
#include "hwy/targets.h"
// IWYU pragma: end_exports

// This include guard is checked by foreach_target, so avoid the usual _H_
// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
// after/outside this include guard.
#ifndef HWY_HIGHWAY_INCLUDED
#define HWY_HIGHWAY_INCLUDED

namespace hwy {

// API version (https://semver.org/); keep in sync with CMakeLists.txt.
#define HWY_MAJOR 1
#define HWY_MINOR 0
#define HWY_PATCH 7

//------------------------------------------------------------------------------
// Shorthand for tags (defined in shared-inl.h) used to select overloads.
// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
// HWY_CAPPED(T, N).

// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
// registers in the group, and is ignored on targets that do not support groups.
#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
#define HWY_FULL2(T, LMUL) \
  hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
// Workaround for MSVC grouping __VA_ARGS__ into a single argument
#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
// Trailing comma avoids -pedantic false alarm
#define HWY_CHOOSE_FULL(...) \
  HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)

// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>

//------------------------------------------------------------------------------
// Export user functions for static/dynamic dispatch

// Evaluates to 0 inside a translation unit if it is generating anything but the
// static target (the last one if multiple targets are enabled). Used to prevent
// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
// compile once anyway, so this is 1 unless it is or has been included.
#ifndef HWY_ONCE
#define HWY_ONCE 1
#endif

// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
// defined), and can be used to deduce the return type of Choose*.
// Each branch maps one value of HWY_STATIC_TARGET to its N_<TARGET> namespace.
// NOTE: there is no #else branch, so HWY_STATIC_DISPATCH stays undefined if
// HWY_STATIC_TARGET equals none of the targets listed here.
#if HWY_STATIC_TARGET == HWY_SCALAR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_EMU128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_RVV
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_WASM
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON_WITHOUT_AES
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON_WITHOUT_AES::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_NEON
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE_256
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SVE2_128
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC8
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC9
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC9::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_PPC10
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC10::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSSE3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_SSE4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX2
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_DL
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_ZEN4
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_ZEN4::FUNC_NAME
#elif HWY_STATIC_TARGET == HWY_AVX3_SPR
#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_SPR::FUNC_NAME
#endif

// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
// nullptr if that target was not compiled.
// HWY_CHOOSE_FALLBACK checks HWY_EMU128 first, then HWY_SCALAR.
#if HWY_TARGETS & HWY_EMU128
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
#elif HWY_TARGETS & HWY_SCALAR
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
#else
// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
#endif

// One HWY_CHOOSE_<TARGET> macro per target: the address of
// N_<TARGET>::FUNC_NAME when that target's bit is set in HWY_TARGETS, and
// nullptr otherwise.
#if HWY_TARGETS & HWY_WASM_EMU256
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
#else
#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_WASM
#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
#else
#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_RVV
#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
#else
#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_NEON_WITHOUT_AES
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) &N_NEON_WITHOUT_AES::FUNC_NAME
#else
#define HWY_CHOOSE_NEON_WITHOUT_AES(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_NEON
#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
#else
#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE
#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
#else
#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE2
#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE_256
#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
#else
#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SVE2_128
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
#else
#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_PPC8
#define HWY_CHOOSE_PPC8(FUNC_NAME) &N_PPC8::FUNC_NAME
#else
#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_PPC9
#define HWY_CHOOSE_PPC9(FUNC_NAME) &N_PPC9::FUNC_NAME
#else
#define HWY_CHOOSE_PPC9(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_PPC10
#define HWY_CHOOSE_PPC10(FUNC_NAME) &N_PPC10::FUNC_NAME
#else
#define HWY_CHOOSE_PPC10(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSE2
#define HWY_CHOOSE_SSE2(FUNC_NAME) &N_SSE2::FUNC_NAME
#else
#define HWY_CHOOSE_SSE2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSSE3
#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
#else
#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_SSE4
#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
#else
#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX2
#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
#else
#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3
#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3_DL
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3_ZEN4
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) &N_AVX3_ZEN4::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_ZEN4(FUNC_NAME) nullptr
#endif

#if HWY_TARGETS & HWY_AVX3_SPR
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) &N_AVX3_SPR::FUNC_NAME
#else
#define HWY_CHOOSE_AVX3_SPR(FUNC_NAME) nullptr
#endif

// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
// apparently cannot be an array. Use a function pointer instead, which has the
// disadvantage that we call the static (not best) target on the first call to
// any HWY_DYNAMIC_DISPATCH.
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
#define HWY_DISPATCH_WORKAROUND 1
#else
#define HWY_DISPATCH_WORKAROUND 0
#endif

// Provides a static member function which is what is called during the first
// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
// this function are the first entry in the tables created by HWY_EXPORT.
template <typename RetType, typename... Args>
struct FunctionCache {
 public:
  typedef RetType(FunctionType)(Args...);

#if HWY_DISPATCH_WORKAROUND
  template <FunctionType* const func>
  static RetType ChooseAndCall(Args... args) {
    ChosenTarget& chosen_target = GetChosenTarget();
    chosen_target.Update(SupportedTargets());
    return (*func)(args...);
  }
#else
  // A template function that when instantiated has the same signature as the
  // function being called. This function initializes the bit array of targets
  // supported by the current CPU and then calls the appropriate entry within
  // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
  // exported functions, even those defined by different translation units,
  // will dispatch directly to the best available target.
  template <FunctionType* const table[]>
  static RetType ChooseAndCall(Args... args) {
    ChosenTarget& chosen_target = GetChosenTarget();
    chosen_target.Update(SupportedTargets());
    return (table[chosen_target.GetIndex()])(args...);
  }
#endif  // HWY_DISPATCH_WORKAROUND
};

// Deduces RetType and Args from a function pointer and returns the matching
// (empty) FunctionCache; HWY_EXPORT uses it inside decltype to name that type.
template <typename RetType, typename... Args>
FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
  return {};
}

// Identifier of the dispatch table for FUNC_NAME, formed by passing FUNC_NAME
// and HighwayDispatchTable to HWY_CONCAT.
#define HWY_DISPATCH_TABLE(FUNC_NAME) \
  HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)

// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
// static array must be defined at the same namespace level as the function
// it is exporting.
// After being exported, it can be called from other parts of the same source
// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
// like in the following example:
//
//   #include "hwy/highway.h"
//   HWY_BEFORE_NAMESPACE();
//   namespace skeleton {
//   namespace HWY_NAMESPACE {
//
//   void MyFunction(int a, char b, const char* c) { ... }
//
//   // NOLINTNEXTLINE(google-readability-namespace-comments)
//   }  // namespace HWY_NAMESPACE
//   }  // namespace skeleton
//   HWY_AFTER_NAMESPACE();
//
//   namespace skeleton {
//   HWY_EXPORT(MyFunction);  // Defines the dispatch table in this scope.
//
//   void MyFunction(int a, char b, const char* c) {
//     return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
//   }
//   }  // namespace skeleton
//

// (HWY_TARGETS & (HWY_TARGETS - 1)) == 0 holds when at most one bit is set in
// HWY_TARGETS, i.e. only a single target is being compiled.
#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)

// Simplified version for IDE or the dynamic dispatch case with only one target.
// This case still uses a table, although of a single element, to provide the
// same compile error conditions as with the dynamic dispatch case when multiple
// targets are being compiled.
#define HWY_EXPORT(FUNC_NAME)                                             \
  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
  HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
// With a single target, "dynamic" dispatch is the static target's function.
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
#define HWY_DYNAMIC_POINTER(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)

#else

// Simplified version for MSVC 2017: function pointer instead of table.
#if HWY_DISPATCH_WORKAROUND

#define HWY_EXPORT(FUNC_NAME)                                                \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
      /* The first entry in the table initializes the global cache and       \
       * calls the function from HWY_STATIC_TARGET. */                       \
      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
          FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>,      \
      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
  }

#else

// Dynamic dispatch case with one entry per dynamic target plus the fallback
// target and the initialization wrapper.
#define HWY_EXPORT(FUNC_NAME)                                                \
  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
      /* The first entry in the table initializes the global cache and       \
       * calls the appropriate function. */                                  \
      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
          FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>,        \
      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
  }

#endif  // HWY_DISPATCH_WORKAROUND

// HWY_DYNAMIC_DISPATCH yields the function (dereferenced table entry) selected
// by hwy::GetChosenTarget().GetIndex(); HWY_DYNAMIC_POINTER yields the table
// entry itself, i.e. the function pointer.
#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
#define HWY_DYNAMIC_POINTER(FUNC_NAME) \
  (HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()])

#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)

// DEPRECATED names; please use HWY_HAVE_* instead.
#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64

}  // namespace hwy

#endif  // HWY_HIGHWAY_INCLUDED

//------------------------------------------------------------------------------

// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
// to include them once per target, which is ensured by the toggle check.
// Because ops/*.h are included under it, they do not need their own guard.
// Entered only while HWY_HIGHWAY_PER_TARGET and HWY_TARGET_TOGGLE are both
// defined or both undefined. The section then flips HWY_HIGHWAY_PER_TARGET, so
// it is skipped on later inclusions until HWY_TARGET_TOGGLE flips as well.
#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
#ifdef HWY_HIGHWAY_PER_TARGET
#undef HWY_HIGHWAY_PER_TARGET
#else
#define HWY_HIGHWAY_PER_TARGET
#endif

// These define ops inside namespace hwy::HWY_NAMESPACE.
// Exactly one ops header is selected by the current HWY_TARGET.
#if HWY_TARGET == HWY_SSE2 || HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
#include "hwy/ops/x86_128-inl.h"
#elif HWY_TARGET == HWY_AVX2
#include "hwy/ops/x86_256-inl.h"
#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL || \
    HWY_TARGET == HWY_AVX3_ZEN4 || HWY_TARGET == HWY_AVX3_SPR
#include "hwy/ops/x86_512-inl.h"
#elif HWY_TARGET == HWY_PPC8 || HWY_TARGET == HWY_PPC9 || \
    HWY_TARGET == HWY_PPC10
#include "hwy/ops/ppc_vsx-inl.h"
#elif HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES
#include "hwy/ops/arm_neon-inl.h"
#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
    HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
#include "hwy/ops/arm_sve-inl.h"
#elif HWY_TARGET == HWY_WASM_EMU256
#include "hwy/ops/wasm_256-inl.h"
#elif HWY_TARGET == HWY_WASM
#include "hwy/ops/wasm_128-inl.h"
#elif HWY_TARGET == HWY_RVV
#include "hwy/ops/rvv-inl.h"
#elif HWY_TARGET == HWY_EMU128
#include "hwy/ops/emu128-inl.h"
#elif HWY_TARGET == HWY_SCALAR
#include "hwy/ops/scalar-inl.h"
#else
// Unknown target: only a diagnostic message is emitted, no ops header.
#pragma message("HWY_TARGET does not match any known target")
#endif  // HWY_TARGET

// Included for every target, after the target-specific ops header.
#include "hwy/ops/generic_ops-inl.h"

#endif  // HWY_HIGHWAY_PER_TARGET
// Copyright 2022 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HWY_PRINT_H_
#define HWY_PRINT_H_

// Helpers for printing vector lanes.

#include <stddef.h>
#include <stdio.h>

#include "hwy/base.h"
#include "hwy/highway_export.h"

namespace hwy {

namespace detail {

// For implementing value comparisons etc. as type-erased functions to reduce
// template bloat.
// The fields are filled in by MakeTypeInfo<T>().
struct TypeInfo {
  // sizeof(T).
  size_t sizeof_t;
  // Result of IsFloat<T>().
  bool is_float;
  // Result of IsSigned<T>().
  bool is_signed;
  // Result of IsSame<T, bfloat16_t>().
  bool is_bf16;
};

// Returns the TypeInfo for T: its size plus the results of IsFloat<T>(),
// IsSigned<T>() and IsSame<T, bfloat16_t>().
template <typename T>
HWY_INLINE TypeInfo MakeTypeInfo() {
  TypeInfo type_info;
  type_info.is_bf16 = IsSame<T, bfloat16_t>();
  type_info.is_signed = IsSigned<T>();
  type_info.is_float = IsFloat<T>();
  type_info.sizeof_t = sizeof(T);
  return type_info;
}

// Type-erased helpers; only their declarations are visible here.
// NOTE(review): the `string100` name suggests a caller-provided buffer of 100
// chars (PrintValue passes char[100]) — confirm against the definitions.
HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100);
HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
                            char* string100);

// Type-erased array printer; `lane_u` defaults to 0 and `max_lanes` to 7.
HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
                              const void* array_void, size_t N,
                              size_t lane_u = 0, size_t max_lanes = 7);

}  // namespace detail

// Writes the string that detail::ToString produces for `value`, followed by a
// comma, to stderr.
template <typename T>
HWY_NOINLINE void PrintValue(T value) {
  const detail::TypeInfo type_info = hwy::detail::MakeTypeInfo<T>();
  char text[100];
  detail::ToString(type_info, &value, text);
  fprintf(stderr, "%s,", text);
}

// Typed front end for detail::PrintArray: passes an empty caption, `count` as
// both N and max_lanes, and 0 for lane_u.
template <typename T>
HWY_NOINLINE void PrintArray(const T* value, size_t count) {
  const detail::TypeInfo type_info = hwy::detail::MakeTypeInfo<T>();
  const char* const caption = "";
  const size_t lane_u = 0;
  detail::PrintArray(type_info, caption, value, count, lane_u, count);
}

}  // namespace hwy

#endif  // HWY_PRINT_H_
namespace hwy {
namespace N_NEON_WITHOUT_AES {
// Reduced Atan2 for the reproducer. Builds `t` by selecting `if_yinf` under
// the `y_inf` mask and zero (`t0`) otherwise, then returns CopySign(t, y).
// NOTE(review): this is reduced test code, not a full atan2; the notes below
// flag oddities that look like reduction artifacts — confirm before reuse.
template <class D, class V, class M = MFromD<D>> V Atan2(D d, V y, V x) {
  // NOTE(review): kPi2 is initialized from kHalf (0.5), not from pi/2.
  V kHalf = Set(d, 0.5), kPi = Set(d, 3.14159265358979323846264), kPi2 = kHalf,
    k0 = Zero(d);
  // `y_0` and `nan` are declared without an initializer and never used.
  M y_0, x_neg = Lt(x, k0), y_inf = IsInf(y), x_inf = IsInf(x), nan;
  V if_xneg_pi = IfThenElseZero(x_neg, kPi);
  V if_yinf = IfThenElse(x_inf, Add(kPi2, if_xneg_pi), kPi);
#if 1
  V t0 = Zero(d);
  V t = IfThenElse(y_inf, if_yinf, t0);
  (void)t;
#else
  // Disabled variant: `t` appears in its own initializer here.
  V t = IfThenElse(y_inf, if_yinf, t);
#endif
  return CopySign(t, y);
}
} // namespace N_NEON_WITHOUT_AES
} // namespace hwy
namespace hwy {
namespace detail {
// Forward declarations of type-erased test helpers that are defined elsewhere
// (the reproducer links against -lhwy_test). AssertEqual calls
// PrintMismatchAndAbort when IsEqual returns false.
bool IsEqual(const TypeInfo &, const void *, const void *);
void PrintMismatchAndAbort(const TypeInfo &, const void *, const void *,
                           const char *, const char *, int, size_t, size_t = 1);
} // namespace detail
// Passed by address to PrintMismatchAndAbort as the fourth argument.
char AssertEqual_target_name;
// Compares `expected` and `actual` through the type-erased IsEqual; on a
// mismatch, hands both values plus `filename`/`line` to PrintMismatchAndAbort.
template <typename T>
void AssertEqual(T expected, T actual, const char *filename, int line) {
  const auto type_info = detail::MakeTypeInfo<T>();
  if (IsEqual(type_info, &expected, &actual)) {
    return;
  }
  PrintMismatchAndAbort(type_info, &expected, &actual,
                        &AssertEqual_target_name, filename, line, 0);
}
namespace N_NEON_WITHOUT_AES {
// Stores the bits of masks `a` and `b` with StoreMaskBits and checks, via
// AssertEqual, that both calls returned the same byte count.
// NOTE(review): the stored bits themselves are never compared and `remainder`
// only feeds an empty `if` — this looks like a reduction artifact.
template <class D>
void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
                     const char *filename, int line) {
  size_t N = Lanes(d);
  Rebind<uint8_t, D> d8;
  size_t N8 = Lanes(d8);
  // NOTE(review): no parentheses follow AllocateAligned<uint8_t>; this is a
  // call only if HWY_MAX expands to a parenthesized expression — confirm.
  auto bits_a = AllocateAligned<uint8_t> HWY_MAX(size_t{}, N8);
  auto bits_b = AllocateAligned<uint8_t>(N8);
  // Neither allocation is checked for null before .get() is used.
  size_t num_bytes_a = StoreMaskBits(d, a, bits_a.get());
  size_t num_bytes_b = StoreMaskBits(d, b, bits_b.get());
  AssertEqual(num_bytes_a, num_bytes_b, filename, line);
  size_t remainder = N;
  if (remainder) {
  }
}
#define HWY_ASSERT_MASK_EQ(d, expected, actual)                                \
  AssertMaskEqual(d, expected, actual, __FILE__, __LINE__)
// Test functor: obtains inputs and expected outputs from Atan2TestCases, then
// processes them N lanes at a time, checking that the NaN masks of expected
// and actual agree and that every lane passes the `ok` mask.
struct TestAtan2 {
  template <typename T, class D> void operator()(T t, D d) {
    size_t N = Lanes(d);
    // Set by Atan2TestCases; the loop below loads `padded` elements from each
    // of the three arrays.
    size_t padded;
    AlignedFreeUniquePtr<T[]> in_y, in_x, expected;
    Atan2TestCases(t, d, padded, in_y, in_x, expected);
    // T() is value-initialized, so the tolerance vector is built from zero.
    Vec<D> tolerance = Set(d, T());
    for (size_t i = 0; i < padded; i += N) {
      Vec<D> y = Load(d, &in_y[i]);
      Vec<D> x = Load(d, &in_x[i]);
      Vec<D> actual = Atan2(d, y, x);
      Vec<D> vexpected = Load(d, &expected[i]);
      Mask<D> exp_nan = IsNaN(vexpected);
      Mask<D> act_nan = IsNaN(actual);
      HWY_ASSERT_MASK_EQ(d, exp_nan, act_nan);
      Mask<D> ge = Ge(actual, Sub(vexpected, tolerance));
      // NOTE(review): compares against `tolerance` alone rather than
      // vexpected + tolerance — looks like a reduction artifact; confirm.
      Mask<D> le = Le(actual, tolerance);
      Mask<D> ok = And(le, ge);
      if (!AllTrue(d, ok))
        HWY_ASSERT(0);
    }
  }
};
// Recursive driver: Do() runs TestAtan2 on a CappedTag<T, kMinArg> and then
// recurses with kMul / 2; the kMul == 0 specialization below ends the
// recursion. `min_lanes`/`max_lanes` are only passed along, never inspected.
template <typename T, size_t kMul, size_t kMinArg, int kPow2 = 0>
struct ForeachCappedR {
  static void Do(size_t min_lanes, size_t max_lanes) {
    CappedTag<T, kMinArg> d;
    TestAtan2()(double(), d);
    // NOTE(review): kPow2 is passed in the kMinArg position and the fourth
    // template argument is left at its default — likely a reduction artifact.
    ForeachCappedR<double, kMul / 2, kPow2>::Do(min_lanes, max_lanes);
  }
};
// Base case (kMul == 0): does nothing.
template <typename T, size_t kMinArg, int kPow2>
struct ForeachCappedR<T, 0, kMinArg, kPow2> {
  static void Do(size_t, size_t) {}
};
// Entry functor: ignores its argument and runs ForeachCappedR<double, 1, 1>
// with min_lanes == max_lanes == 1.
struct ForPartialVectors {
  void operator()(double /*t*/) { ForeachCappedR<double, 1, 1>::Do(1, 1); }
};
template <typename T, class D>
void Atan2TestCases(T, D d, size_t &padded, AlignedFreeUniquePtr<T[]> &out_y,
                    AlignedFreeUniquePtr<T[]> &out_x,
                    AlignedFreeUniquePtr<T[]> &out_expected) {
  struct YX {
    T y;
    T x;
    T expected;
  };
  T pos(1E5), neg(1E7), n0(0.0), inf = GetLane(Inf(d)), nan = 0,
                                 pi(3.141592653589793238);
  YX test_cases[]{{-inf, inf, -pi / 4}, {neg, inf, n0}, {pos, nan, nan}};
  size_t kNumTestCases = sizeof(0);
  size_t N = Lanes(d);
  padded = RoundUpTo(kNumTestCases, N);
  out_y = AllocateAligned<T>(padded);
  out_x = AllocateAligned<T>(padded);
  out_expected = AllocateAligned<T>(padded);
  size_t i = 0;
  out_y[i] = test_cases[i].y;
  out_x[i] = test_cases[i].x;
  out_expected[i] = test_cases[i].expected;
}
// Runs the Atan2 test once through ForPartialVectors.
void TestAllAtan2() {
  ForPartialVectors{}(0.0);
}
} // namespace N_NEON_WITHOUT_AES
} // namespace hwy
// Reproducer entry point: runs the reduced Atan2 test.
int main() {
  hwy::N_NEON_WITHOUT_AES::TestAllAtan2();
  return 0;
}

Reply via email to