From 82803013acdc88ec9c0bf68f932aebf566e911c8 Mon Sep 17 00:00:00 2001 From: riastradh Date: Sat, 25 Jul 2020 22:45:10 +0000 Subject: [PATCH] Add some Intel intrinsics for ChaCha. _mm_load1_ps _mm_loadu_si128 _mm_movelh_ps _mm_slli_epi32 _mm_storeu_si128 _mm_unpackhi_epi32 _mm_unpacklo_epi32 --- sys/crypto/aes/arch/x86/immintrin.h | 70 +++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 3 deletions(-) diff --git a/sys/crypto/aes/arch/x86/immintrin.h b/sys/crypto/aes/arch/x86/immintrin.h index a8b5880faa3d..064ef14c8fd4 100644 --- a/sys/crypto/aes/arch/x86/immintrin.h +++ b/sys/crypto/aes/arch/x86/immintrin.h @@ -1,4 +1,4 @@ -/* $NetBSD: immintrin.h,v 1.4 2020/07/25 22:44:32 riastradh Exp $ */ +/* $NetBSD: immintrin.h,v 1.5 2020/07/25 22:45:10 riastradh Exp $ */ /*- * Copyright (c) 2020 The NetBSD Foundation, Inc. @@ -102,6 +102,20 @@ _mm_add_epi32(__m128i __a, __m128i __b) (__v16qi)(__m128i)(lo), (int)(bytes)) #endif +_INTRINSATTR +static __inline __m128 +_mm_load1_ps(const float *__p) +{ + return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p }; +} + +_INTRINSATTR +static __inline __m128i +_mm_loadu_si128(const __m128i_u *__p) +{ + return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v; +} + _INTRINSATTR static __inline __m128i _mm_loadu_si32(const void *__p) @@ -132,8 +146,18 @@ _mm_movehl_ps(__m128 __v0, __m128 __v1) #if defined(__GNUC__) && !defined(__clang__) return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1); #elif defined(__clang__) - return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, - 6, 7, 2, 3); + return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3); +#endif +} + +_INTRINSATTR +static __inline __m128 +_mm_movelh_ps(__m128 __v0, __m128 __v1) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1); +#elif defined(__clang__) + return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5); #endif } @@ -203,6 +227,13 @@ _mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx) (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x), \ (__v4sf)(__m128)(y), (int)(m)) \ +_INTRINSATTR +static __inline __m128i +_mm_slli_epi32(__m128i __v, uint8_t __bits) +{ + return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits); +} + _INTRINSATTR static __inline __m128i _mm_slli_epi64(__m128i __v, uint8_t __bits) @@ -243,6 +274,13 @@ _mm_srli_epi64(__m128i __v, uint8_t __bits) (int)(bytes)); #endif +_INTRINSATTR +static __inline void +_mm_storeu_si128(__m128i_u *__p, __m128i __v) +{ + ((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v; +} + _INTRINSATTR static __inline void _mm_storeu_si32(void *__p, __m128i __v) @@ -271,6 +309,32 @@ _mm_sub_epi64(__m128i __x, __m128i __y) return (__m128i)((__v2du)__x - (__v2du)__y); } +_INTRINSATTR +static __inline __m128i +_mm_unpackhi_epi32(__m128i __lo, __m128i __hi) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo, + (__v4si)__hi); +#elif defined(__clang__) + return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi, + 2,6,3,7); +#endif +} + +_INTRINSATTR +static __inline __m128i +_mm_unpacklo_epi32(__m128i __lo, __m128i __hi) +{ +#if defined(__GNUC__) && !defined(__clang__) + return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo, + (__v4si)__hi); +#elif defined(__clang__) + return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi, + 0,4,1,5); +#endif +} + _INTRINSATTR static __inline __m128i _mm_unpacklo_epi64(__m128i __lo, __m128i __hi)