https://github.com/bassiounix updated https://github.com/llvm/llvm-project/pull/175038
>From 0400f21353ef23e82e6515ead54f7e8ff2088a7c Mon Sep 17 00:00:00 2001 From: bassiounix <[email protected]> Date: Thu, 8 Jan 2026 19:44:47 +0200 Subject: [PATCH] [libc][wctype] Upstream immintrin storage from PtrHash-cc prototype to LLVM libc --- .../wctype/conversion/random/CMakeLists.txt | 22 ++ .../__support/wctype/conversion/random/imm.h | 268 ++++++++++++++ .../wctype/conversion/random/vec128_storage.h | 76 ++++ .../conversion/random/vec256_storage.cpp | 60 +++ .../wctype/conversion/random/vec256_storage.h | 63 ++++ .../conversion/random/vec512_storage.cpp | 342 ++++++++++++++++++ .../wctype/conversion/random/vec512_storage.h | 82 +++++ 7 files changed, 913 insertions(+) create mode 100644 libc/src/__support/wctype/conversion/random/imm.h create mode 100644 libc/src/__support/wctype/conversion/random/vec128_storage.h create mode 100644 libc/src/__support/wctype/conversion/random/vec256_storage.cpp create mode 100644 libc/src/__support/wctype/conversion/random/vec256_storage.h create mode 100644 libc/src/__support/wctype/conversion/random/vec512_storage.cpp create mode 100644 libc/src/__support/wctype/conversion/random/vec512_storage.h diff --git a/libc/src/__support/wctype/conversion/random/CMakeLists.txt b/libc/src/__support/wctype/conversion/random/CMakeLists.txt index dd9d577e4cd8b..a7cab77016033 100644 --- a/libc/src/__support/wctype/conversion/random/CMakeLists.txt +++ b/libc/src/__support/wctype/conversion/random/CMakeLists.txt @@ -5,3 +5,25 @@ add_header_library( DEPENDS libc.src.__support.wctype.conversion.utils.utils ) + +add_header_library( + vec128_storage + HDRS + vec128_storage.h + DEPENDS + libc.src.__support.CPP.array + libc.src.__support.wctype.conversion.utils.slice +) + +add_object_library( + vector_storage + HDRS + imm.h + vec256_storage.h + vec512_storage.h + SRCS + vec512_storage.cpp + vec256_storage.cpp + DEPENDS + .vec128_storage +) diff --git a/libc/src/__support/wctype/conversion/random/imm.h b/libc/src/__support/wctype/conversion/random/imm.h new file mode 100644 index 0000000000000..e67cb1b5774b9 --- /dev/null +++ b/libc/src/__support/wctype/conversion/random/imm.h @@ -0,0 +1,268 @@ +//===-- Portable subset of <immintrin.h> ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Only little-endian is supported (runtime code is not affected by this). + +#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_IMM_H +#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_IMM_H + +#include "vec512_storage.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace wctype_internal { + +namespace random { + +namespace immintrin { + +using random::vector_storage::vec128_storage; +using random::vector_storage::vec256_storage; +using random::vector_storage::vec512_storage; + +LIBC_INLINE static constexpr vec256_storage +mm256_add_epi32(const vec256_storage &a, const vec256_storage &b) { + vec256_storage r{{}}; + for (int i = 0; i < 8; ++i) { + r.u32x8[i] = a.u32x8[i] + b.u32x8[i]; // modulo 2^32 + } + return r; +} + +LIBC_INLINE static constexpr vec512_storage +mm256_add_epi32(const vec512_storage &a, const vec512_storage &b) { + vec512_storage r{{}}; + for (int i = 0; i < 16; ++i) { + r.u32x16[i] = a.u32x16[i] + b.u32x16[i]; // modulo 2^32 + } + return r; +} + +LIBC_INLINE static constexpr vec256_storage +mm256_xor_si256(const cpp::array<uint32_t, 8> &a, + const cpp::array<uint32_t, 8> &b) { + cpp::array<uint32_t, 8> r{}; + for (int i = 0; i < 8; ++i) { + r[i] = a[i] ^ b[i]; + } + return r; +} + +LIBC_INLINE static constexpr vec512_storage +mm256_xor_si256(const cpp::array<uint32_t, 16> &a, + const cpp::array<uint32_t, 16> &b) { + vec512_storage r{.u32x16 = {}}; + for (int i = 0; i < 16; ++i) { + r.u32x16[i] = a[i] ^ b[i]; + } + return r; +} + +LIBC_INLINE static constexpr vec256_storage +mm256_shuffle_epi8(const vec256_storage &a, const vec256_storage &b) { + vec256_storage r{{}}; + for (size_t k = 0; k < 8; k++) { + r.u32x8[k] = 0; + } + + // Helper for 128-bit lane (16 bytes) + auto shuffle_128 = [](const uint32_t *src, const uint32_t *ctrl, + uint32_t *dst) { + // dst must be zero-initialized by caller + for (int i = 0; i < 16; ++i) { + uint8_t c = (ctrl[i / 4] >> ((i % 4) * 8)) & 0xFF; + + if (c & 0x80) { + // zero byte → already zero + continue; + } + + int k = c & 0x0F; + uint8_t byte = (src[k / 4] >> ((k % 4) * 8)) & 0xFF; + + dst[i / 4] |= static_cast<uint32_t>(byte) << ((i % 4) * 8); + } + }; + + // Shuffle lower 128-bit lane + shuffle_128(&a.u32x8[0], &b.u32x8[0], &r.u32x8[0]); + // Shuffle upper 128-bit lane + shuffle_128(&a.u32x8[4], &b.u32x8[4], &r.u32x8[4]); + + return r; +} + +LIBC_INLINE static constexpr vec256_storage +mm256_set_epi64x(long long a, long long b, long long c, long long d) { + vec256_storage v{{}}; + + // Lower 128-bit lane (d, c) + v.u32x8[0] = static_cast<uint32_t>(d); // d[31:0] + v.u32x8[1] = static_cast<uint32_t>(d >> 32); // d[63:32] + v.u32x8[2] = static_cast<uint32_t>(c); // c[31:0] + v.u32x8[3] = static_cast<uint32_t>(c >> 32); // c[63:32] + + // Upper 128-bit lane (b, a) + v.u32x8[4] = static_cast<uint32_t>(b); // b[31:0] + v.u32x8[5] = static_cast<uint32_t>(b >> 32); // b[63:32] + v.u32x8[6] = static_cast<uint32_t>(a); // a[31:0] + v.u32x8[7] = static_cast<uint32_t>(a >> 32); // a[63:32] + + return v; +} + +LIBC_INLINE static constexpr vec256_storage +mm256_or_si256(const vec256_storage &a, const vec256_storage &b) { + vec256_storage r{{}}; + for (int i = 0; i < 8; ++i) { + r.u32x8[i] = a.u32x8[i] | b.u32x8[i]; + } + return r; +} + +LIBC_INLINE static constexpr vec256_storage +mm256_srli_epi32(const vec256_storage &a, int count) { + vec256_storage r{{}}; + + // Cap the shift count at 31, as larger shifts produce zero + const int c = count & 0x1F; + + for (int i = 0; i < 8; ++i) { + r.u32x8[i] = a.u32x8[i] >> c; + } + + return r; +} + +LIBC_INLINE static constexpr vec256_storage +mm256_slli_epi32(const vec256_storage &a, int count) { + vec256_storage r{{}}; + + // Cap the shift count at 31, as larger shifts produce zero + const int c = count & 0x1F; + + for (int i = 0; i < 8; ++i) { + r.u32x8[i] = a.u32x8[i] << c; + } + + return r; +} + +LIBC_INLINE static constexpr vec256_storage +mm256_permute2x128_si256(const vec256_storage &V1, const vec256_storage &V2, + int M) { + vec256_storage r{{}}; + + // For each 128-bit destination half + for (int half = 0; half < 2; ++half) { + int control = (M >> (half * 4)) & 0xF; + int dst_base = half * 4; + + if (control & 0x8) { + // bit 3 set → zero this 128-bit half + for (int i = 0; i < 4; ++i) { + r.u32x8[dst_base + i] = 0; + } + } else { + // bits [1:0] select source half + const vec256_storage *src{}; + int src_base{}; + + switch (control & 0x3) { + case 0: // V1 lower + src = &V1; + src_base = 0; + break; + case 1: // V1 upper + src = &V1; + src_base = 4; + break; + case 2: // V2 lower + src = &V2; + src_base = 0; + break; + case 3: // V2 upper + src = &V2; + src_base = 4; + break; + } + + for (int i = 0; i < 4; ++i) { + r.u32x8[dst_base + i] = src->u32x8[src_base + i]; + } + } + } + + return r; +} + +// a_lo and a_hi are each 128-bit vectors represented as 4 x 32-bit integers +LIBC_INLINE static constexpr vec256_storage +mm256_setr_m128i(const vec128_storage &lo, const vec128_storage &hi) { + return vec256_storage{{ + lo.u32x4[0], + lo.u32x4[1], + lo.u32x4[2], + lo.u32x4[3], + hi.u32x4[0], + hi.u32x4[1], + hi.u32x4[2], + hi.u32x4[3], + }}; +} + +LIBC_INLINE static constexpr vec256_storage +mm256_shuffle_epi32(vec256_storage a, int imm) { + vec256_storage r{{}}; + + // lower half (elements 0..3) + for (int i = 0; i < 4; ++i) { + int src = (imm >> (2 * i)) & 0x3; + r.u32x8[i] = a.u32x8[src]; + } + + // upper half (elements 4..7) + for (int i = 0; i < 4; ++i) { + int src = (imm >> (2 * i)) & 0x3; + r.u32x8[4 + i] = a.u32x8[4 + src]; + } + + return r; +} + +LIBC_INLINE static constexpr vec128_storage +mm256_extracti128_si256(const vec256_storage &V, int M) { + const int base = (M & 1) * 4; + return {{V.u32x8[base + 0], V.u32x8[base + 1], V.u32x8[base + 2], + V.u32x8[base + 3]}}; +} + +LIBC_INLINE static constexpr vec128_storage +mm_add_epi64(const cpp::array<uint32_t, 4> &a, + const cpp::array<uint32_t, 4> &b) { + return {cpp::array<uint32_t, 4>{ + a[0] + b[0], + a[1] + b[1], + a[2] + b[2], + a[3] + b[3], + }}; +} + +LIBC_INLINE static constexpr cpp::array<uint32_t, 4> +add_epi64(const cpp::array<uint32_t, 4> &a, const cpp::array<uint32_t, 4> &b) { + return {a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]}; +} + +} // namespace immintrin + +} // namespace random + +} // namespace wctype_internal + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_IMM_H diff --git a/libc/src/__support/wctype/conversion/random/vec128_storage.h b/libc/src/__support/wctype/conversion/random/vec128_storage.h new file mode 100644 index 0000000000000..1416fe5b9a60a --- /dev/null +++ b/libc/src/__support/wctype/conversion/random/vec128_storage.h @@ -0,0 +1,76 @@ +//===-- 128-bit storage for StdRng - wctype conversion ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This is a portable implementation of a 128-bit vector storage implemented +// with static arrays, parallel of <immintrin.h>'s `__m128i` which works with +// `constexpr` code. +// Only little-endian is supported (runtime code is not affected by this). + +#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC128_STORAGE_H +#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC128_STORAGE_H + +#include "src/__support/CPP/array.h" +#include "src/__support/wctype/conversion/utils/slice.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace wctype_internal { + +namespace random { + +namespace vector_storage { + +union vec128_storage { + mutable cpp::array<uint32_t, 4> u32x4; + + LIBC_INLINE constexpr vec128_storage(cpp::array<uint32_t, 4> &&x) + : u32x4(x) {} + LIBC_INLINE constexpr vec128_storage(cpp::array<uint32_t, 4> &x) : u32x4(x) {} + LIBC_INLINE constexpr vec128_storage() : u32x4() {} + + LIBC_INLINE constexpr operator cpp::array<uint32_t, 4>() const { + return this->u32x4; + } + + LIBC_INLINE constexpr cpp::array<uint32_t, 4> to_lanes() const { + return this->u32x4; + } + + LIBC_INLINE static constexpr vec128_storage + from_lanes(cpp::array<uint32_t, 4> &&xs) { + return vec128_storage(xs); + } + + LIBC_INLINE static constexpr auto from_lanes(cpp::array<uint64_t, 2> &&xs) { + cpp::array<uint32_t, 4> x = { + static_cast<uint32_t>(xs[0]), static_cast<uint32_t>(xs[0] >> 32), + static_cast<uint32_t>(xs[1]), static_cast<uint32_t>(xs[1] >> 32)}; + return vec128_storage(x); + } + + LIBC_INLINE static constexpr auto + read_le(conversion_utils::Slice<uint8_t> x) { + LIBC_ASSERT(x.size() == 16); + vec128_storage v = cpp::array<uint32_t, 4>{0}; + uint32_t *dst = v.u32x4.data(); + uint8_t *src = x.data(); + for (uint8_t i = 0; i < 4; ++i) + dst[i] = src[i * 4] | (src[i * 4 + 1] << 8) | (src[i * 4 + 2] << 16) | + (src[i * 4 + 3] << 24); + return v; + } +}; + +} // namespace vector_storage + +} // namespace random + +} // namespace wctype_internal + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC128_STORAGE_H diff --git a/libc/src/__support/wctype/conversion/random/vec256_storage.cpp b/libc/src/__support/wctype/conversion/random/vec256_storage.cpp new file mode 100644 index 0000000000000..806a452b99db7 --- /dev/null +++ b/libc/src/__support/wctype/conversion/random/vec256_storage.cpp @@ -0,0 +1,60 @@ +//===-- 256-bit storage implementation --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This is a portable implementation of a 256-bit vector storage implemented +// with static arrays, parallel of <immintrin.h>'s AVX256 which works with +// `constexpr` code. +// Only little-endian is supported (runtime code is not affected by this). + +#include "vec256_storage.h" +#include "imm.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace wctype_internal { + +namespace random { + +namespace vector_storage { + +LIBC_INLINE constexpr vec256_storage +vec256_storage::shuffle_lane_words3012() const { + return immintrin::mm256_shuffle_epi32(*this, 0b0011'1001); +} + +LIBC_INLINE constexpr vec256_storage +vec256_storage::shuffle_lane_words2301() const { + return immintrin::mm256_shuffle_epi32(*this, 0b0100'1110); +} + +LIBC_INLINE constexpr vec256_storage +vec256_storage::shuffle_lane_words1230() const { + return immintrin::mm256_shuffle_epi32(*this, 0b1001'0011); +} + +LIBC_INLINE constexpr vec256_storage vec256_storage::to_lanes() const { + auto lo = immintrin::mm256_extracti128_si256(*this, 0); + auto hi = immintrin::mm256_extracti128_si256(*this, 1); + return vec256_storage{{ + lo.u32x4[0], + lo.u32x4[1], + lo.u32x4[2], + lo.u32x4[3], + hi.u32x4[0], + hi.u32x4[1], + hi.u32x4[2], + hi.u32x4[3], + }}; +} + +} // namespace vector_storage + +} // namespace random + +} // namespace wctype_internal + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/wctype/conversion/random/vec256_storage.h b/libc/src/__support/wctype/conversion/random/vec256_storage.h new file mode 100644 index 0000000000000..f3bd16b8731c5 --- /dev/null +++ b/libc/src/__support/wctype/conversion/random/vec256_storage.h @@ -0,0 +1,63 @@ +//===-- 256-bit storage for StdRng - wctype conversion ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This is a portable implementation of a 256-bit vector storage implemented +// with static arrays, parallel of <immintrin.h>'s AVX256 which works with +// `constexpr` code. +// Only little-endian is supported (runtime code is not affected by this). + +#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC256_STORAGE_H +#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC256_STORAGE_H + +#include "vec128_storage.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace wctype_internal { + +namespace random { + +namespace vector_storage { + +union vec256_storage { + mutable cpp::array<uint32_t, 8> u32x8; + + LIBC_INLINE constexpr operator cpp::array<uint32_t, 8>() const { + return this->u32x8; + } + + LIBC_INLINE constexpr vec256_storage() : u32x8() {} + LIBC_INLINE static constexpr vec256_storage + construct_from_vec128(vec128_storage &&lo, vec128_storage &&hi) { + vec256_storage r{{}}; + for (size_t i = 0; i < 4; i++) { + r.u32x8[i] = lo.u32x4[i]; + } + for (size_t i = 0; i < 4; i++) { + r.u32x8[i + 4] = hi.u32x4[i]; + } + return r; + } + + LIBC_INLINE constexpr vec256_storage(cpp::array<uint32_t, 8> &&x) + : u32x8(x) {} + + LIBC_INLINE constexpr vec256_storage shuffle_lane_words3012() const; + LIBC_INLINE constexpr vec256_storage shuffle_lane_words2301() const; + LIBC_INLINE constexpr vec256_storage shuffle_lane_words1230() const; + LIBC_INLINE constexpr vec256_storage to_lanes() const; +}; + +} // namespace vector_storage + +} // namespace random + +} // namespace wctype_internal + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC256_STORAGE_H diff --git a/libc/src/__support/wctype/conversion/random/vec512_storage.cpp b/libc/src/__support/wctype/conversion/random/vec512_storage.cpp new file mode 100644 index 0000000000000..e239703d08443 --- /dev/null +++ b/libc/src/__support/wctype/conversion/random/vec512_storage.cpp @@ -0,0 +1,342 @@ +//===-- 512-bit storage implementation --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "vec512_storage.h" +#include "imm.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace wctype_internal { + +namespace random { + +namespace vector_storage { + +LIBC_INLINE constexpr vec512_storage +vec512_storage::construct_from_vec256(const vec256_storage &lo, + const vec256_storage &hi) { + vec512_storage r{{}}; + for (size_t i = 0; i < 8; i++) { + r.u32x16[i] = lo.u32x8[i]; + } + for (size_t i = 0; i < 8; i++) { + r.u32x16[8 + i] = hi.u32x8[i]; + } + return r; +} + +LIBC_INLINE constexpr vec512_storage vec512_storage::new128(vec128_storage i, + vec128_storage j, + vec128_storage k, + vec128_storage l) { + vec512_storage r{{}}; + for (size_t a = 0; a < 4; a++) { + r.u32x16[a] = i.u32x4[a]; + r.u32x16[a + 4] = j.u32x4[a]; + r.u32x16[a + 8] = k.u32x4[a]; + r.u32x16[a + 12] = l.u32x4[a]; + } + return r; +} + +LIBC_INLINE constexpr const vec512_storage & +vec512_storage::operator+=(vec512_storage &rhs) const { + this->u32x16 = immintrin::mm256_add_epi32(*this, rhs).u32x16; + return *this; +} + +LIBC_INLINE constexpr vec512_storage +vec512_storage::operator+(const vec512_storage &rhs) const { + return immintrin::mm256_add_epi32(*this, rhs); +} + +LIBC_INLINE constexpr vec512_storage +vec512_storage::operator^(vec512_storage &rhs) const { + return immintrin::mm256_xor_si256(*this, rhs); +} + +LIBC_INLINE constexpr vec512_storage +vec512_storage::rotate_each_word_right16() const { + auto constexpr K0 = 0x0d0c'0f0e'0908'0b0a; + auto constexpr K1 = 0x0504'0706'0100'0302; + + vec256_storage lo{{}}; + vec256_storage hi{{}}; + + for (size_t i = 0; i < 8; i++) { + lo.u32x8[i] = this->u32x16[i]; + } + for (size_t i = 0; i < 8; i++) { + hi.u32x8[i] = this->u32x16[8 + i]; + } + lo = immintrin::mm256_shuffle_epi8( + lo, immintrin::mm256_set_epi64x(K0, K1, K0, K1)); + hi = immintrin::mm256_shuffle_epi8( + hi, immintrin::mm256_set_epi64x(K0, K1, K0, K1)); + + vec512_storage ret{{}}; + for (size_t i = 0; i < 8; i++) { + ret.u32x16[i] = lo.u32x8[i]; + } + + for (size_t i = 0; i < 8; i++) { + ret.u32x16[8 + i] = hi.u32x8[i]; + } + + return ret; +} + +LIBC_INLINE constexpr vec512_storage +vec512_storage::rotate_each_word_right20() const { + constexpr int32_t I = 20; + + vec256_storage lo{{}}; + vec256_storage hi{{}}; + + for (size_t i = 0; i < 8; i++) { + lo.u32x8[i] = this->u32x16[i]; + } + for (size_t i = 0; i < 8; i++) { + hi.u32x8[i] = this->u32x16[8 + i]; + } + + lo = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(lo, I), + immintrin::mm256_slli_epi32(lo, 32 - I)); + hi = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(hi, I), + immintrin::mm256_slli_epi32(hi, 32 - I)); + + vec512_storage r{{}}; + + for (size_t i = 0; i < 8; i++) { + r.u32x16[i] = lo.u32x8[i]; + } + for (size_t i = 0; i < 8; i++) { + r.u32x16[8 + i] = hi.u32x8[i]; + } + + return r; +} + +LIBC_INLINE constexpr vec512_storage +vec512_storage::rotate_each_word_right24() const { + auto constexpr K0 = 0x0e0d'0c0f'0a09'080b; + auto constexpr K1 = 0x0605'0407'0201'0003; + + vec256_storage lo{{}}; + vec256_storage hi{{}}; + + for (size_t i = 0; i < 8; i++) { + lo.u32x8[i] = this->u32x16[i]; + } + for (size_t i = 0; i < 8; i++) { + hi.u32x8[i] = this->u32x16[8 + i]; + } + + lo = immintrin::mm256_shuffle_epi8( + lo, immintrin::mm256_set_epi64x(K0, K1, K0, K1)); + hi = immintrin::mm256_shuffle_epi8( + hi, immintrin::mm256_set_epi64x(K0, K1, K0, K1)); + + vec512_storage r{{}}; + + for (size_t i = 0; i < 8; i++) { + r.u32x16[i] = lo.u32x8[i]; + } + for (size_t i = 0; i < 8; i++) { + r.u32x16[8 + i] = hi.u32x8[i]; + } + + return r; +} + +LIBC_INLINE constexpr vec512_storage +vec512_storage::rotate_each_word_right25() const { + constexpr int32_t I = 25; + vec256_storage lo{{}}; + vec256_storage hi{{}}; + + for (size_t i = 0; i < 8; i++) { + lo.u32x8[i] = this->u32x16[i]; + } + for (size_t i = 0; i < 8; i++) { + hi.u32x8[i] = this->u32x16[8 + i]; + } + + lo = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(lo, I), + immintrin::mm256_slli_epi32(lo, 32 - I)); + hi = immintrin::mm256_or_si256(immintrin::mm256_srli_epi32(hi, I), + immintrin::mm256_slli_epi32(hi, 32 - I)); + + vec512_storage r{{}}; + + for (size_t i = 0; i < 8; i++) { + r.u32x16[i] = lo.u32x8[i]; + } + for (size_t i = 0; i < 8; i++) { + r.u32x16[8 + i] = hi.u32x8[i]; + } + + return r; +} + +LIBC_INLINE constexpr vec512_storage +vec512_storage::shuffle_lane_words3012() const { + vec256_storage lo{{}}; + vec256_storage hi{{}}; + + for (size_t i = 0; i < 8; i++) { + lo.u32x8[i] = this->u32x16[i]; + } + for (size_t i = 0; i < 8; i++) { + hi.u32x8[i] = this->u32x16[8 + i]; + } + lo = lo.shuffle_lane_words3012(); + hi = hi.shuffle_lane_words3012(); + + vec512_storage r{{}}; + + for (size_t i = 0; i < 8; i++) { + r.u32x16[i] = lo.u32x8[i]; + } + for (size_t i = 0; i < 8; i++) { + r.u32x16[8 + i] = hi.u32x8[i]; + } + + return r; +} + +LIBC_INLINE constexpr vec512_storage +vec512_storage::shuffle_lane_words2301() const { + vec256_storage lo{{}}; + vec256_storage hi{{}}; + + for (size_t i = 0; i < 8; i++) { + lo.u32x8[i] = this->u32x16[i]; + } + for (size_t i = 0; i < 8; i++) { + hi.u32x8[i] = this->u32x16[8 + i]; + } + lo = lo.shuffle_lane_words2301(); + hi = hi.shuffle_lane_words2301(); + + vec512_storage r{{}}; + + for (size_t i = 0; i < 8; i++) { + r.u32x16[i] = lo.u32x8[i]; + } + for (size_t i = 0; i < 8; i++) { + r.u32x16[8 + i] = hi.u32x8[i]; + } + + return r; +} + +LIBC_INLINE constexpr vec512_storage +vec512_storage::shuffle_lane_words1230() const { + vec256_storage lo{{}}; + vec256_storage hi{{}}; + + for (size_t i = 0; i < 8; i++) { + lo.u32x8[i] = this->u32x16[i]; + } + for (size_t i = 0; i < 8; i++) { + hi.u32x8[i] = this->u32x16[8 + i]; + } + + lo = lo.shuffle_lane_words1230(); + hi = hi.shuffle_lane_words1230(); + + vec512_storage r{{}}; + + for (size_t i = 0; i < 8; i++) { + r.u32x16[i] = lo.u32x8[i]; + } + for (size_t i = 0; i < 8; i++) { + r.u32x16[8 + i] = hi.u32x8[i]; + } + + return r; +} + +LIBC_INLINE constexpr cpp::array<vec512_storage, 4> +vec512_storage::transpose4(const vec512_storage &a, const vec512_storage &b, + const vec512_storage &c, const vec512_storage &d) { + /* + * a00:a01 a10:a11 + * b00:b01 b10:b11 + * c00:c01 c10:c11 + * d00:d01 d10:d11 + * => + * a00:b00 c00:d00 + * a01:b01 c01:d01 + * a10:b10 c10:d10 + * a11:b11 c11:d11 + */ + vec256_storage a_lo{{}}; + vec256_storage b_lo{{}}; + + for (size_t i = 0; i < 8; i++) { + a_lo.u32x8[i] = a.u32x16[i]; + } + for (size_t i = 0; i < 8; i++) { + b_lo.u32x8[i] = b.u32x16[i]; + } + auto const ab00 = immintrin::mm256_permute2x128_si256(a_lo, b_lo, 0x20); + auto const ab01 = immintrin::mm256_permute2x128_si256(a_lo, b_lo, 0x31); + + vec256_storage a_hi{{}}; + vec256_storage b_hi{{}}; + + for (size_t i = 0; i < 8; i++) { + a_hi.u32x8[i] = a.u32x16[8 + i]; + } + for (size_t i = 0; i < 8; i++) { + b_hi.u32x8[i] = b.u32x16[8 + i]; + } + auto const ab10 = immintrin::mm256_permute2x128_si256(a_hi, b_hi, 0x20); + auto const ab11 = immintrin::mm256_permute2x128_si256(a_hi, b_hi, 0x31); + + vec256_storage c_lo{{}}; + vec256_storage d_lo{{}}; + + for (size_t i = 0; i < 8; i++) { + c_lo.u32x8[i] = c.u32x16[i]; + } + for (size_t i = 0; i < 8; i++) { + d_lo.u32x8[i] = d.u32x16[i]; + } + auto const cd00 = immintrin::mm256_permute2x128_si256(c_lo, d_lo, 0x20); + auto const cd01 = immintrin::mm256_permute2x128_si256(c_lo, d_lo, 0x31); + + vec256_storage c_hi{{}}; + vec256_storage d_hi{{}}; + + for (size_t i = 0; i < 8; i++) { + c_hi.u32x8[i] = c.u32x16[8 + i]; + } + for (size_t i = 0; i < 8; i++) { + d_hi.u32x8[i] = d.u32x16[8 + i]; + } + auto const cd10 = immintrin::mm256_permute2x128_si256(c_hi, d_hi, 0x20); + auto const cd11 = immintrin::mm256_permute2x128_si256(c_hi, d_hi, 0x31); + + auto r1 = vec512_storage::construct_from_vec256(ab00, cd00); + auto r2 = vec512_storage::construct_from_vec256(ab01, cd01); + auto r3 = vec512_storage::construct_from_vec256(ab10, cd10); + auto r4 = vec512_storage::construct_from_vec256(ab11, cd11); + + return cpp::array<vec512_storage, 4>{r1, r2, r3, r4}; +} + +} // namespace vector_storage + +} // namespace random + +} // namespace wctype_internal + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/wctype/conversion/random/vec512_storage.h b/libc/src/__support/wctype/conversion/random/vec512_storage.h new file mode 100644 index 0000000000000..de80d4d0442d2 --- /dev/null +++ b/libc/src/__support/wctype/conversion/random/vec512_storage.h @@ -0,0 +1,82 @@ +//===-- 512-bit storage for StdRng - wctype conversion ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This is a portable implementation of a 512-bit vector storage implemented +// with static arrays, works with `constexpr` code. +// Only little-endian is supported (runtime code is not affected by this). + +#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC512_STORAGE_H +#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC512_STORAGE_H + +#include "vec256_storage.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace wctype_internal { + +namespace random { + +namespace vector_storage { + +union vec512_storage { + mutable cpp::array<uint32_t, 16> u32x16; + + LIBC_INLINE static constexpr vec512_storage + construct_from_vec256(const vec256_storage &lo, const vec256_storage &hi); + + LIBC_INLINE constexpr operator cpp::array<uint32_t, 16>() const { + return this->u32x16; + } + + LIBC_INLINE static constexpr vec512_storage + new128(cpp::array<uint32_t, 16> &&xs) { + return vec512_storage{xs}; + } + + LIBC_INLINE static constexpr vec512_storage new128(vec128_storage i, + vec128_storage j, + vec128_storage k, + vec128_storage l); + + LIBC_INLINE constexpr vec512_storage unpack() const { return *this; } + + LIBC_INLINE constexpr const vec512_storage & + operator+=(vec512_storage &rhs) const; + + LIBC_INLINE constexpr vec512_storage operator+(const vec512_storage &) const; + LIBC_INLINE constexpr vec512_storage operator^(vec512_storage &rhs) const; + + LIBC_INLINE constexpr vec512_storage rotate_each_word_right16() const; + LIBC_INLINE constexpr vec512_storage rotate_each_word_right20() const; + LIBC_INLINE constexpr vec512_storage rotate_each_word_right24() const; + LIBC_INLINE constexpr vec512_storage rotate_each_word_right25() const; + + LIBC_INLINE constexpr vec512_storage shuffle_lane_words3012() const; + LIBC_INLINE constexpr vec512_storage shuffle_lane_words2301() const; + LIBC_INLINE constexpr vec512_storage shuffle_lane_words1230() const; + + LIBC_INLINE static constexpr cpp::array<vec512_storage, 4> + transpose4(const vec512_storage &a, const vec512_storage &b, + const vec512_storage &c, const vec512_storage &d); + + LIBC_INLINE constexpr conversion_utils::Slice<uint32_t> to_scalars() const { + return conversion_utils::Slice<uint32_t>(this->u32x16.data(), + this->u32x16.size()); + } + + LIBC_INLINE constexpr vec512_storage to_lanes() const { return *this; } +}; + +} // namespace vector_storage + +} // namespace random + +} // namespace wctype_internal + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_WCTYPE_CONVERSION_RANDOM_VEC512_STORAGE_H _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
