一呼百應, "one call, a hundred responses"
Loading...
Searching...
No Matches
simd_data.hpp
Go to the documentation of this file.
1#pragma once
2
11#include <cstdint>
12#include <type_traits>
13#include <immintrin.h>
14#include "attributes/common.hpp"
15#include "numerics.hpp"
16#include "types.hpp"
17
19#ifdef __AVX512F__
20#define IF512(x,y) x
21#define ON512(...) __VA_ARGS__
22#else
23#define IF512(x,y) y
24#define ON512(...)
25#endif
26
27#if defined(__AVX512VL__) && defined(__AVX512FP16__)
28#define ON512FP16(...) __VA_ARGS__
29#define IF512FP16(x,y) x
30#else
31#define ON512FP16(...)
32#define IF512FP16(x,y) y
33#endif
35
36namespace ein {
39
41constexpr size_t max_simd_size
42#ifdef __AVX512F__
43 = 64;
44#else
45 = 32;
46#endif
47
48namespace detail {
49 template <typename T>
51 using type = T;
52 };
53 template <> struct storage_type_impl<bf16> { using type = __bf16; };
54 template <> struct storage_type_impl<fp16> { using type = _Float16; };
55}
56
59template <typename T>
61
62template <typename T>
63concept has_storage_type = requires {
64 typename storage_type<T>;
65 requires sizeof(T) == sizeof(storage_type<T>);
66 requires requires(T t, storage_type<T> s) {
67 std::bit_cast<storage_type<T>>(t);
68 std::bit_cast<T>(s);
69 };
70};
71
72namespace detail {
73 template <typename T> struct has_lifted_operations_impl : std::false_type {};
74 template <> struct has_lifted_operations_impl<bf16> : std::true_type {};
75 template <> struct has_lifted_operations_impl<fp16> : std::true_type {};
76}
77
81template <typename T>
83
85template <typename T, size_t N>
88 && std::is_pod_v<storage_type<T>>
89 && std::is_pod_v<T>
90 && one_of<sizeof(T),1,2,4,8>
91 && one_of<sizeof(T)*N,16,32,64>
92 && sizeof(T)*N <= max_simd_size;
93
94// \brief unadulterated clang/gcc vector extension type
95template <typename T, size_t N>
96requires has_simd_type<T,N>
97using simd_data_t = storage_type<T> __attribute__((__vector_size__(N*sizeof(storage_type<T>)),__aligned__(N*sizeof(storage_type<T>))));
98
100template <typename U, typename T, size_t N>
109
111// \todo only add __fp16 and _Float16 on Sapphire Rapids
112template <typename T>
114
115namespace detail {
116template <size_t N> struct si {};
117template <size_t N> struct ps {};
118template <size_t N> struct pd {};
119
120template <> struct si<128> { using type = __m128i; };
121template <> struct ps<128> { using type = __m128; };
122template <> struct pd<128> { using type = __m128d; };
123
124template <> struct si<256> { using type = __m256i; };
125template <> struct ps<256> { using type = __m256; };
126template <> struct pd<256> { using type = __m256d; };
127
128#ifdef __AVX512F__
129template <> struct si<512> { using type = __m512i; };
130template <> struct ps<512> { using type = __m512; };
131template <> struct pd<512> { using type = __m512d; };
132#endif
133
134#if defined(__AVX512FP16__) && defined(__AVX512VL__)
135template <size_t N> struct ph {};
136template <> struct ph<128> { using type = __m128ph; };
137template <> struct ph<256> { using type = __m256ph; };
138template <> struct ph<512> { using type = __m512ph; };
139#endif
140
141template <typename T, size_t N> struct simd_intrinsic {};
142
143template <size_t N>
144struct simd_intrinsic<float,N> {
145 using type = typename ps<N*sizeof(float)*8>::type;
146};
147
148template <size_t N>
149struct simd_intrinsic<double,N> {
150 using type = typename pd<N*sizeof(double)*8>::type;
151};
152
153#if defined(__AVX512FP16__) && defined(__AVX512VL__)
154template <size_t N>
155struct simd_intrinsic<__fp16,N> {
156 using type = typename ph<N*sizeof(__fp16)*8>::type;
157};
158
159template <size_t N>
160struct simd_intrinsic<_Float16,N> {
161 using type = typename ph<N*sizeof(_Float16)*8>::type;
162};
163
164template <size_t N>
165struct simd_intrinsic<ein::fp16,N> {
166 using type = typename ph<N*sizeof(ein::fp16)*8>::type;
167};
168#endif
169
170template <
171#if defined(__AVX512FP16__) && defined(__AVX512VL__)
172 not_one_of_t<float,double,__fp16,_Float16,ein::fp16> T,
173#else
174 not_one_of_t<float,double> T,
175#endif
176 size_t N
177>
178struct simd_intrinsic<T,N> {
179 using type = typename ps<N*sizeof(T)*8>::type;
180};
181}
182
186template <typename T, size_t N>
187requires has_simd_type<T,N>
189
191// involving \p N values of type \p T at a time. Currently I prefer to use it if available, and if the `_mmaskN` type
192// matches the register size.
193template <typename T, size_t N>
195#if __AVX512F__
196 = ((has_simd_type<T,N>) && (N >= 8));
197#else
198 = false;
199#endif
200
201#ifdef __AVX512F__
202namespace {
203template <size_t N> struct mmask {};
204template <> struct mmask<8> { using type = __mmask8; };
205template <> struct mmask<16> { using type = __mmask16; };
206template <> struct mmask<32> { using type = __mmask32; };
207template <> struct mmask<64> { using type = __mmask64; };
208}
209
212template <size_t N>
215 = typename mmask<N>::type;
216#endif
217
220template <typename T, size_t N>
222using simd_mask_t =
223#ifdef __AVX512F__
224 std::conditional_t<has_mmask<T,N>, mmask<N>, simd_intrinsic_t<T,N>>;
225#else
227#endif
228
230__m128 cast_ps(__m128i a) noexcept { return _mm_castsi128_ps(a); }
231
233__m128 cast_ps(__m128 a) noexcept { return a; }
234
236__m256 cast_ps(__m256i a) noexcept { return _mm256_castsi256_ps(a); }
237
239__m256 cast_ps(__m256 a) noexcept { return a; }
240
242__m128d cast_pd(__m128i a) noexcept { return _mm_castsi128_pd(a); }
243
245__m128d cast_pd(__m128d a) noexcept { return a; }
246
248__m256d cast_pd(__m256i a) noexcept { return _mm256_castsi256_pd(a); }
249
251__m256d cast_pd(__m256d a) noexcept { return a; }
252
254__m128i cast_si(__m128 a) noexcept { return _mm_castps_si128(a); }
255
257__m128i cast_si(__m128d a) noexcept { return _mm_castpd_si128(a); }
258
260__m128i cast_si(__m128i a) noexcept { return a; }
261
263__m256i cast_si(__m256 a) noexcept { return _mm256_castps_si256(a); }
264
266__m256i cast_si(__m256d a) noexcept { return _mm256_castpd_si256(a); }
267
269__m256i cast_si(__m256i a) noexcept { return a; }
270
271#ifdef __AVX512F__
272
274__m512 cast_ps(__m512i a) noexcept { return _mm512_castsi512_ps(a); }
275
277__m512d cast_pd(__m512i a) noexcept { return _mm512_castsi512_pd(a); }
278
280__m512 cast_ps(__m512 a) noexcept { return a; }
281
283__m512d cast_pd(__m512d a) noexcept { return a; }
284
286__m512i cast_si(__m512 a) noexcept { return _mm512_castps_si512(a); }
287
289__m512i cast_si(__m512d a) noexcept { return _mm512_castpd_si512(a); }
290
292__m512i cast_si(__m512i a) noexcept { return a; }
293
294#if defined(__AVX512FP16__) && defined(__AVX512VL__)
295__m128i cast_si(__m128ph a) noexcept { return _mm_castph_si128(a); }
296__m128 cast_ps(__m128ph a) noexcept { return _mm_castph_ps(a); }
297__m128d cast_pd(__m128ph a) noexcept { return _mm_castph_pd(a); }
298__m128ph cast_ph(__m128ph a) noexcept { return a; }
299
300__m256i cast_si(__m256ph a) noexcept { return _mm256_castph_si256(a); }
301__m256 cast_ps(__m256ph a) noexcept { return _mm256_castph_ps(a); }
302__m256d cast_pd(__m256ph a) noexcept { return _mm256_castph_pd(a); }
303__m256ph cast_ph(__m256ph a) noexcept { return a; }
304
305__m512i cast_si(__m512ph a) noexcept { return _mm512_castph_si512(a); }
306__m512 cast_ps(__m512ph a) noexcept { return _mm512_castph_ps(a); }
307__m512d cast_pd(__m512ph a) noexcept { return _mm512_castph_pd(a); }
308__m512ph cast_ph(__m512ph a) noexcept { return a; }
309
310__m128ph cast_ph(__m128 a) noexcept { return _mm_castps_ph(a); }
311__m256ph cast_ph(__m256 a) noexcept { return _mm256_castps_ph(a); }
312__m512ph cast_ph(__m512 a) noexcept { return _mm512_castps_ph(a); }
313
314__m128ph cast_ph(__m128i a) noexcept { return _mm_castsi128_ph(a); }
315__m256ph cast_ph(__m256i a) noexcept { return _mm256_castsi256_ph(a); }
316__m512ph cast_ph(__m512i a) noexcept { return _mm512_castsi512_ph(a); }
317
318__m128ph cast_ph(__m128d a) noexcept { return _mm_castpd_ph(a); }
319__m256ph cast_ph(__m256d a) noexcept { return _mm256_castpd_ph(a); }
320__m512ph cast_ph(__m512d a) noexcept { return _mm512_castpd_ph(a); }
321#endif // __AVX512FP16__ && __AVX512VL__
322
323#endif // __AVX512F__
324
326} // namespace ein
327
328#if defined(EIN_TESTING) || defined(EIN_TESTING_SIMD_DATA)
329#include <string_view>
330#include "types.hpp"
331
332TEMPLATE_TEST_CASE("simd_data","[simd_data]",int8_t,uint8_t,int16_t,uint16_t,int32_t,uint32_t,int64_t,uint64_t,float,double) {
333 using namespace ein;
334
335 constexpr size_t N128 = 16/sizeof(TestType);
336 constexpr size_t N256 = 32/sizeof(TestType);
337#ifdef __AVX512F__
338 constexpr size_t N512 = 64/sizeof(TestType);
339#endif
340 SECTION("has_simd_data") {
341 STATIC_REQUIRE(has_simd_type<TestType,N128>);
342 STATIC_REQUIRE(has_simd_type<TestType,N256>);
343#ifdef __AVX512F__
344 STATIC_REQUIRE(has_simd_type<TestType,N512>);
345#endif
346 }
347 SECTION("simd_data_t") {
348 using d128 = simd_data_t<TestType,N128>;
349 using d256 = simd_data_t<TestType,N256>;
350#ifdef __AVX512F__
351 using d512 = simd_data_t<TestType,N512>;
352#endif
353 SECTION("has the right size") {
354 CHECK(sizeof(d128) == 16);
355 CHECK(sizeof(d256) == 32);
356#ifdef __AVX512F__
357 CHECK(sizeof(d512) == 64);
358#endif
359 }
360 d128 x128{TestType{}};
361 d256 x256{TestType{}};
362#ifdef __AVX512F__
363 d512 x512{TestType{}};
364#endif
365 SECTION("can be indexed at the right type") {
366 STATIC_REQUIRE(std::is_same_v<std::remove_cvref_t<decltype(x128[0])>, storage_type<TestType>>);
367 STATIC_REQUIRE(std::is_same_v<std::remove_cvref_t<decltype(x256[0])>, storage_type<TestType>>);
368#ifdef __AVX512F__
369 STATIC_REQUIRE(std::is_same_v<std::remove_cvref_t<decltype(x512[0])>, storage_type<TestType>>);
370#endif
371 }
372 SECTION("can be indexed with the right value") {
373 CHECK(x128[0] == storage_type<TestType>{});
374 CHECK(x256[0] == storage_type<TestType>{});
375#ifdef __AVX512F__
376 CHECK(x512[0] == storage_type<TestType>{});
377#endif
378 }
379 }
380
381 SECTION("simd_intrinsic_t") {
384#ifdef __AVX512F__
386#endif
387
388 [[maybe_unused]] t128 x128{};
389 [[maybe_unused]] t256 x256{};
390#ifdef __AVX512F__
391 [[maybe_unused]] t512 x512{};
392#endif
393
394// SECTION("cast_ps") {
395// CHECK(sizeof(cast_ps(x128)) == sizeof(x128));
396// CHECK(sizeof(cast_ps(x256)) == sizeof(x256));
397// #ifdef __AVX512F__
398// CHECK(sizeof(cast_ps(x512)) == sizeof(x512));
399// #endif
400// }
401
402// SECTION("cast_si") {
403// CHECK(sizeof(cast_si(x128)) == sizeof(x128));
404// CHECK(sizeof(cast_si(x256)) == sizeof(x256));
405// #ifdef __AVX512F__
406// CHECK(sizeof(cast_si(x512)) == sizeof(x512));
407// #endif
408// }
409
410// SECTION("cast_pd") {
411// CHECK(sizeof(cast_pd(x128)) == sizeof(x128));
412// CHECK(sizeof(cast_pd(x256)) == sizeof(x256));
413// #ifdef __AVX512F__
414// CHECK(sizeof(cast_pd(x512)) == sizeof(x512));
415// #endif
416// }
417 }
418}
419
420#endif
can we convert simd_data_t<U,N> -> simd_data_t<T,N> automatically using gcc vector extensions?
Does this type have operations that are semantically correct when lifted to the simd_data_t level?
Definition simd_data.hpp:82
Do we want to use AVX512's notion of an _mmask8, _mmask16, _mmask32, or _mmask64 for masking operatio...
ein::simd_data_t<T,N> is defined
Definition simd_data.hpp:86
type T is one of the candidates
Definition types.hpp:48
N is one of the candidates
Definition numerics.hpp:33
is this type one of the types that is handled well automatically by clang/gcc vector extensions?
#define ein_artificial
[[artificial]].
Definition common.hpp:220
#define ein_inline
inline [[always_inline]]
Definition common.hpp:188
#define ein_nodiscard
C++17 [[nodiscard]].
Definition common.hpp:165
#define ein_const
[[const]] is not const
Definition common.hpp:84
typename ps< N *sizeof(float) *8 >::type type
typename pd< N *sizeof(double) *8 >::type type
typename ps< N *sizeof(T) *8 >::type type
__m128d cast_pd(__m128i a) noexcept
typename mmask< N >::type mmask_t
If AVX512 is enabled, returns the type of an N-bit mmask.
__m128 cast_ps(__m128i a) noexcept
typename detail::simd_intrinsic< T, N >::type simd_intrinsic_t
Returns the Intel intrinsic type associated with a simd register full of N values of type T.
storage_type< T > __attribute__((__vector_size__(N *sizeof(storage_type< T >)), __aligned__(N *sizeof(storage_type< T >)))) simd_data_t
Definition simd_data.hpp:97
__m128i cast_si(__m128 a) noexcept
typename detail::storage_type_impl< T >::type storage_type
The type used to store T in a simd_data_t.
Definition simd_data.hpp:60
std::conditional_t< has_mmask< T, N >, mmask< N >, simd_intrinsic_t< T, N > > simd_mask_t
What type of mask should I use?
constexpr size_t max_simd_size
largest simd register width supported on this platform in bytes
Definition simd_data.hpp:43
const string_view type
returns the unmangled name of the type T
Definition types.hpp:30
Definition cpuid.cpp:16