Add some Intel intrinsics for ChaCha.

_mm_load1_ps
_mm_loadu_si128
_mm_movelh_ps
_mm_slli_epi32
_mm_storeu_si128
_mm_unpackhi_epi32
_mm_unpacklo_epi32
This commit is contained in:
riastradh 2020-07-25 22:45:10 +00:00
parent e9ec169980
commit 82803013ac
1 changed file with 67 additions and 3 deletions

View File

@ -1,4 +1,4 @@
/* $NetBSD: immintrin.h,v 1.4 2020/07/25 22:44:32 riastradh Exp $ */
/* $NetBSD: immintrin.h,v 1.5 2020/07/25 22:45:10 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
@ -102,6 +102,20 @@ _mm_add_epi32(__m128i __a, __m128i __b)
(__v16qi)(__m128i)(lo), (int)(bytes))
#endif
/*
 * _mm_load1_ps(p)
 *
 *	Read the float at p and return a vector whose four initializer
 *	slots all hold that value.
 */
_INTRINSATTR
static __inline __m128
_mm_load1_ps(const float *__p)
{
	const float __f = *__p;

	return __extension__ (__m128)(__v4sf) { __f, __f, __f, __f };
}
/*
 * _mm_loadu_si128(p)
 *
 *	Return the __m128i_u value stored at p.  The read goes through a
 *	single-member wrapper struct tagged _PACKALIAS rather than a
 *	direct dereference of p.
 *	NOTE(review): _PACKALIAS is presumably what permits unaligned,
 *	alias-safe access -- confirm against its definition.
 */
_INTRINSATTR
static __inline __m128i
_mm_loadu_si128(const __m128i_u *__p)
{
	struct __loadu_si128_box { __m128i_u __v; } _PACKALIAS;
	const struct __loadu_si128_box *__src =
	    (const struct __loadu_si128_box *)__p;

	return __src->__v;
}
_INTRINSATTR
static __inline __m128i
_mm_loadu_si32(const void *__p)
@ -132,8 +146,18 @@ _mm_movehl_ps(__m128 __v0, __m128 __v1)
#if defined(__GNUC__) && !defined(__clang__)
return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1,
6, 7, 2, 3);
return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
#endif
}
/*
 * _mm_movelh_ps(v0, v1)
 *
 *	Combine two float vectors.  On Clang this is a shuffle selecting
 *	elements 0,1 of v0 followed by elements 0,1 of v1 (indices 4,5 of
 *	the concatenation); on GCC it is the movlhps builtin.
 */
_INTRINSATTR
static __inline __m128
_mm_movelh_ps(__m128 __v0, __m128 __v1)
{
#if defined(__clang__)
	/* Clang also defines __GNUC__, so it must be tested first. */
	return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1,
	    0, 1, 4, 5);
#elif defined(__GNUC__)
	return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
#endif
}
@ -203,6 +227,13 @@ _mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
(__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x), \
(__v4sf)(__m128)(y), (int)(m)) \
/*
 * _mm_slli_epi32(v, bits)
 *
 *	Apply the pslldi128 builtin to v, viewed as a __v4si, with the
 *	shift count bits widened to int, and return the result as an
 *	__m128i.
 */
_INTRINSATTR
static __inline __m128i
_mm_slli_epi32(__m128i __v, uint8_t __bits)
{
	const int __count = __bits;

	return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, __count);
}
_INTRINSATTR
static __inline __m128i
_mm_slli_epi64(__m128i __v, uint8_t __bits)
@ -243,6 +274,13 @@ _mm_srli_epi64(__m128i __v, uint8_t __bits)
(int)(bytes));
#endif
/*
 * _mm_storeu_si128(p, v)
 *
 *	Store v at p.  The write goes through a single-member wrapper
 *	struct tagged _PACKALIAS rather than a direct dereference of p.
 *	NOTE(review): _PACKALIAS is presumably what permits unaligned,
 *	alias-safe access -- confirm against its definition.
 */
_INTRINSATTR
static __inline void
_mm_storeu_si128(__m128i_u *__p, __m128i __v)
{
	struct __storeu_si128_box { __m128i_u __v; } _PACKALIAS;
	struct __storeu_si128_box *__dst = (struct __storeu_si128_box *)__p;

	__dst->__v = __v;
}
_INTRINSATTR
static __inline void
_mm_storeu_si32(void *__p, __m128i __v)
@ -271,6 +309,32 @@ _mm_sub_epi64(__m128i __x, __m128i __y)
return (__m128i)((__v2du)__x - (__v2du)__y);
}
/*
 * _mm_unpackhi_epi32(lo, hi)
 *
 *	Interleave elements of lo and hi viewed as __v4si.  On Clang the
 *	shuffle yields lo[2], hi[2], lo[3], hi[3] (indices 2,6,3,7 of the
 *	concatenation); on GCC it is the punpckhdq128 builtin.
 */
_INTRINSATTR
static __inline __m128i
_mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__clang__)
	/* Clang also defines __GNUC__, so it must be tested first. */
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    2, 6, 3, 7);
#elif defined(__GNUC__)
	return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
	    (__v4si)__hi);
#endif
}
/*
 * _mm_unpacklo_epi32(lo, hi)
 *
 *	Interleave elements of lo and hi viewed as __v4si.  On Clang the
 *	shuffle yields lo[0], hi[0], lo[1], hi[1] (indices 0,4,1,5 of the
 *	concatenation); on GCC it is the punpckldq128 builtin.
 */
_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__clang__)
	/* Clang also defines __GNUC__, so it must be tested first. */
	return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
	    0, 4, 1, 5);
#elif defined(__GNUC__)
	return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
	    (__v4si)__hi);
#endif
}
_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)