sys/crypto: Introduce arch/{arm,x86} to share common MD headers
Dedup between aes and chacha. No binary changes.
parent 8ee3d6ae37
commit d754abaff4
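The consumer-side change is mechanical: each per-algorithm private copy of a machine-dependent header gives way to a single shared copy under sys/crypto/arch, so an affected source file changes only its include line. For illustration (paths are the ones this commit introduces):

        /* before: per-algorithm private copy */
        #include "arm_neon.h"

        /* after: shared MD header */
        #include <crypto/arch/arm/arm_neon.h>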
sys/crypto/aes/arch/arm/aes_neon_impl.h

@@ -1,4 +1,4 @@
-/* $NetBSD: aes_neon_impl.h,v 1.3 2020/08/08 14:47:01 riastradh Exp $ */
+/* $NetBSD: aes_neon_impl.h,v 1.4 2023/08/07 01:07:35 rin Exp $ */

 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -31,8 +31,8 @@
 #include <sys/types.h>

-#include "arm_neon.h"
-#include "arm_neon_imm.h"
+#include <crypto/arch/arm/arm_neon.h>
+#include <crypto/arch/arm/arm_neon_imm.h>

 #include <crypto/aes/aes.h>
 #include <crypto/aes/arch/arm/aes_neon.h>
sys/crypto/aes/arch/x86/aes_sse2_impl.h

@@ -1,4 +1,4 @@
-/* $NetBSD: aes_sse2_impl.h,v 1.2 2020/06/29 23:50:05 riastradh Exp $ */
+/* $NetBSD: aes_sse2_impl.h,v 1.3 2023/08/07 01:07:36 rin Exp $ */

 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -33,8 +33,8 @@
 #include <crypto/aes/aes.h>
 #include <crypto/aes/arch/x86/aes_sse2.h>
-#include <crypto/aes/arch/x86/immintrin.h>
-#include <crypto/aes/arch/x86/immintrin_ext.h>
+#include <crypto/arch/x86/immintrin.h>
+#include <crypto/arch/x86/immintrin_ext.h>

 void aes_sse2_bitslice_Sbox(__m128i[static 4]);
 void aes_sse2_bitslice_invSbox(__m128i[static 4]);
sys/crypto/aes/arch/x86/aes_ssse3_impl.h

@@ -1,4 +1,4 @@
-/* $NetBSD: aes_ssse3_impl.h,v 1.1 2020/06/29 23:51:35 riastradh Exp $ */
+/* $NetBSD: aes_ssse3_impl.h,v 1.2 2023/08/07 01:07:36 rin Exp $ */

 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -33,8 +33,8 @@
 #include <crypto/aes/aes.h>
 #include <crypto/aes/arch/x86/aes_ssse3.h>
-#include <crypto/aes/arch/x86/immintrin.h>
-#include <crypto/aes/arch/x86/immintrin_ext.h>
+#include <crypto/arch/x86/immintrin.h>
+#include <crypto/arch/x86/immintrin_ext.h>

 __m128i aes_ssse3_enc1(const struct aesenc *, __m128i, unsigned);
 __m128i aes_ssse3_dec1(const struct aesdec *, __m128i, unsigned);
sys/crypto/aes/arch/arm/arm_neon.h → sys/crypto/arch/arm/arm_neon.h (renamed)

@@ -1,4 +1,4 @@
-/* $NetBSD: arm_neon.h,v 1.12 2023/08/07 00:58:35 rin Exp $ */
+/* $NetBSD: arm_neon.h,v 1.1 2023/08/07 01:07:36 rin Exp $ */

 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -26,8 +26,8 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */

-#ifndef _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H
-#define _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H
+#ifndef _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H
+#define _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H

 #if defined(__GNUC__) && !defined(__clang__)

@@ -714,4 +714,4 @@ vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)

 #endif /* !defined(__aarch64__) */

-#endif /* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_H */
+#endif /* _SYS_CRYPTO_ARCH_ARM_ARM_NEON_H */
sys/crypto/aes/arch/arm/arm_neon_imm.h → sys/crypto/arch/arm/arm_neon_imm.h (renamed)

@@ -1,4 +1,4 @@
-/* $NetBSD: arm_neon_imm.h,v 1.2 2020/08/09 01:59:04 riastradh Exp $ */
+/* $NetBSD: arm_neon_imm.h,v 1.1 2023/08/07 01:07:36 rin Exp $ */

 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -26,8 +26,8 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */

-#ifndef _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_IMM_H
-#define _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_IMM_H
+#ifndef _SYS_CRYPTO_ARCH_ARM_ARM_NEON_IMM_H
+#define _SYS_CRYPTO_ARCH_ARM_ARM_NEON_IMM_H

 /*
  * Non-standard macros for writing ARM NEON vector literals.  Needed
@@ -77,4 +77,4 @@
 #endif
 #endif

-#endif /* _SYS_CRYPTO_AES_ARCH_ARM_ARM_NEON_IMM_H */
+#endif /* _SYS_CRYPTO_ARCH_ARM_ARM_NEON_IMM_H */
sys/crypto/aes/arch/x86/immintrin.h → sys/crypto/arch/x86/immintrin.h (renamed)

@@ -1,4 +1,4 @@
-/* $NetBSD: immintrin.h,v 1.5 2020/07/25 22:45:10 riastradh Exp $ */
+/* $NetBSD: immintrin.h,v 1.1 2023/08/07 01:07:36 rin Exp $ */

 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -26,8 +26,8 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */

-#ifndef _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_H
-#define _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_H
+#ifndef _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H
+#define _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H

 #include <sys/types.h>

@@ -348,4 +348,4 @@ _mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
 #endif
 }

-#endif /* _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_H */
+#endif /* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_H */
sys/crypto/aes/arch/x86/immintrin_ext.h → sys/crypto/arch/x86/immintrin_ext.h (renamed)

@@ -1,4 +1,4 @@
-/* $NetBSD: immintrin_ext.h,v 1.1 2020/06/29 23:47:54 riastradh Exp $ */
+/* $NetBSD: immintrin_ext.h,v 1.1 2023/08/07 01:07:36 rin Exp $ */

 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -26,10 +26,10 @@
  * POSSIBILITY OF SUCH DAMAGE.
  */

-#ifndef _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_EXT_H
-#define _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_EXT_H
+#ifndef _SYS_CRYPTO_ARCH_X86_IMMINTRIN_EXT_H
+#define _SYS_CRYPTO_ARCH_X86_IMMINTRIN_EXT_H

-#include "immintrin.h"
+#include <crypto/arch/x86/immintrin.h>

 _INTRINSATTR
 static __inline __m128i
@@ -45,4 +45,4 @@ _mm_storeu_epi8(void *__p, __m128i __v)
         ((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
 }

-#endif /* _SYS_CRYPTO_AES_ARCH_X86_IMMINTRIN_EXT_H */
+#endif /* _SYS_CRYPTO_ARCH_X86_IMMINTRIN_EXT_H */
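The _mm_storeu_epi8 context above shows the _PACKALIAS idiom these headers rely on. A standalone sketch of the same trick against the system <emmintrin.h> (the helper name loadu_128 is hypothetical, not part of the commit): wrapping the access in a __packed__, __may_alias__ struct drops both the 16-byte alignment assumption and the strict-aliasing assumption, so the compiler emits an unaligned vector load (movdqu) rather than an aligned one.

        #include <emmintrin.h>

        static inline __m128i
        loadu_128(const void *p)
        {
                /* packed => possibly unaligned; may_alias => may alias anything */
                return ((const struct { __m128i v; }
                        __attribute__((__packed__, __may_alias__)) *)p)->v;
        }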
sys/crypto/chacha/arch/arm/arm_neon.h (file deleted)

@@ -1,717 +0,0 @@
/* $NetBSD: arm_neon.h,v 1.7 2020/09/07 18:06:13 jakllsch Exp $ */

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H
#define _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H

#if defined(__GNUC__) && !defined(__clang__)

#define _INTRINSATTR \
        __extension__ \
        __attribute__((__always_inline__, __gnu_inline__, __artificial__))

#ifdef __aarch64__
typedef __Int32x4_t int32x4_t;
typedef __Int64x2_t int64x2_t;
typedef __Int8x16_t int8x16_t;
typedef __Uint16x8_t uint16x8_t;
typedef __Uint32x4_t uint32x4_t;
typedef __Uint64x2_t uint64x2_t;
typedef __Uint8x16_t uint8x16_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#else
typedef __simd128_int32_t int32x4_t;
typedef __simd128_int64_t int64x2_t;
typedef __simd128_int8_t int8x16_t;
typedef __simd128_uint16_t uint16x8_t;
typedef __simd128_uint32_t uint32x4_t;
typedef __simd128_uint64_t uint64x2_t;
typedef __simd128_uint8_t uint8x16_t;

typedef __simd64_int8_t int8x8_t;
typedef __simd64_uint8_t uint8x8_t;
typedef __builtin_neon_udi uint64x1_t;
typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
#endif

#if defined(__AARCH64EB__)
#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - (__i))
#define __neon_laneq_index(__v, __i) (__arraycount(__v) - 1 - (__i))
#elif defined(__ARM_BIG_ENDIAN)
#define __neon_lane_index(__v, __i) ((__i) ^ (__arraycount(__v) - 1))
#define __neon_laneq_index(__v, __i) ((__i) ^ (__arraycount(__v)/2 - 1))
#else
#define __neon_lane_index(__v, __i) (__i)
#define __neon_laneq_index(__v, __i) (__i)
#endif

#elif defined(__clang__)

#define _INTRINSATTR \
        __attribute__((__always_inline__, __nodebug__))

typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
typedef __attribute__((neon_vector_type(2))) int64_t int64x2_t;
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;

typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t;

typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t;

typedef __attribute__((neon_vector_type(8))) uint8_t uint8x8_t;

typedef struct { uint8x8_t val[2]; } uint8x8x2_t;
typedef struct { uint8x16_t val[2]; } uint8x16x2_t;

#ifdef __LITTLE_ENDIAN__
#define __neon_lane_index(__v, __i) __i
#define __neon_laneq_index(__v, __i) __i
#else
#define __neon_lane_index(__v, __i) (__arraycount(__v) - 1 - __i)
#define __neon_laneq_index(__v, __i) (__arraycount(__v) - 1 - __i)
#endif

#else

#error Teach me how to neon in your compile!

#endif

_INTRINSATTR
static __inline uint32x4_t
vaddq_u32(uint32x4_t __v0, uint32x4_t __v1)
{
        return __v0 + __v1;
}

_INTRINSATTR
static __inline uint32x4_t
vcltq_s32(int32x4_t __v0, int32x4_t __v1)
{
        return (uint32x4_t)(__v0 < __v1);
}

_INTRINSATTR
static __inline int32x4_t
vdupq_n_s32(int32_t __x)
{
        return (int32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint32x4_t
vdupq_n_u32(uint32_t __x)
{
        return (uint32x4_t) { __x, __x, __x, __x };
}

_INTRINSATTR
static __inline uint8x16_t
vdupq_n_u8(uint8_t __x)
{
        return (uint8x16_t) {
                __x, __x, __x, __x, __x, __x, __x, __x,
                __x, __x, __x, __x, __x, __x, __x, __x,
        };
}

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vextq_u32(uint32x4_t __lo, uint32x4_t __hi, uint8_t __i)
{
#if defined(__AARCH64EB__) || defined(__ARM_BIG_ENDIAN)
        return __builtin_shuffle(__hi, __lo,
            (uint32x4_t) { 4 - __i, 5 - __i, 6 - __i, 7 - __i });
#else
        return __builtin_shuffle(__lo, __hi,
            (uint32x4_t) { __i + 0, __i + 1, __i + 2, __i + 3 });
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define vextq_u32(__lo, __hi, __i) \
        (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
            (int8x16_t)(__hi), (__i), 50)
#else
#define vextq_u32(__lo, __hi, __i) ( \
{ \
        uint32x4_t __tlo = (__lo); \
        uint32x4_t __thi = (__hi); \
        uint32x4_t __lo_r = __builtin_shufflevector(__tlo, __tlo, 3,2,1,0); \
        uint32x4_t __hi_r = __builtin_shufflevector(__thi, __thi, 3,2,1,0); \
        uint32x4_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
            (int8x16_t)__hi_r, __i, 50); \
        __builtin_shufflevector(__r, __r, 3,2,1,0); \
})
#endif /* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vextq_u8(uint8x16_t __lo, uint8x16_t __hi, uint8_t __i)
{
#ifdef __aarch64__
#if defined(__AARCH64EB__)
        return __builtin_shuffle(__hi, __lo,
            (uint8x16_t) {
                16 - __i, 17 - __i, 18 - __i, 19 - __i,
                20 - __i, 21 - __i, 22 - __i, 23 - __i,
                24 - __i, 25 - __i, 26 - __i, 27 - __i,
                28 - __i, 29 - __i, 30 - __i, 31 - __i,
        });
#else
        return __builtin_shuffle(__lo, __hi,
            (uint8x16_t) {
                __i + 0, __i + 1, __i + 2, __i + 3,
                __i + 4, __i + 5, __i + 6, __i + 7,
                __i + 8, __i + 9, __i + 10, __i + 11,
                __i + 12, __i + 13, __i + 14, __i + 15,
        });
#endif
#else
        return (uint8x16_t)__builtin_neon_vextv16qi((int8x16_t)__lo,
            (int8x16_t)__hi, __i);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define vextq_u8(__lo, __hi, __i) \
        (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)(__lo), \
            (int8x16_t)(__hi), (__i), 48)
#else
#define vextq_u8(__lo, __hi, __i) ( \
{ \
        uint8x16_t __tlo = (__lo); \
        uint8x16_t __thi = (__hi); \
        uint8x16_t __lo_r = __builtin_shufflevector(__tlo, __tlo, \
            15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
        uint8x16_t __hi_r = __builtin_shufflevector(__thi, __thi, \
            15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
        uint8x16_t __r = __builtin_neon_vextq_v((int8x16_t)__lo_r, \
            (int8x16_t)__hi_r, (__i), 48); \
        __builtin_shufflevector(__r, __r, \
            15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); \
})
#endif /* __LITTLE_ENDIAN */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32_t
vgetq_lane_u32(uint32x4_t __v, uint8_t __i)
{
#ifdef __aarch64__
        return __v[__neon_laneq_index(__v, __i)];
#else
        return (uint32_t)__builtin_neon_vget_laneuv4si((int32x4_t)__v, __i);
#endif
}
#elif defined(__clang__)
#define vgetq_lane_u32(__v, __i) \
        (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)(__v), \
            __neon_laneq_index(__v, __i))
#endif

_INTRINSATTR
static __inline uint32x4_t
vld1q_u32(const uint32_t *__p32)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
        const __builtin_aarch64_simd_si *__p =
            (const __builtin_aarch64_simd_si *)__p32;

        return (uint32x4_t)__builtin_aarch64_ld1v4si(__p);
#else
        const __builtin_neon_si *__p = (const __builtin_neon_si *)__p32;

        return (uint32x4_t)__builtin_neon_vld1v4si(__p);
#endif
#elif defined(__clang__)
        uint32x4_t __v = (uint32x4_t)__builtin_neon_vld1q_v(__p32, 50);
#ifndef __LITTLE_ENDIAN__
        __v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
        return __v;
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vld1q_u8(const uint8_t *__p8)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
        const __builtin_aarch64_simd_qi *__p =
            (const __builtin_aarch64_simd_qi *)__p8;

        return (uint8x16_t)__builtin_aarch64_ld1v16qi(__p);
#else
        const __builtin_neon_qi *__p = (const __builtin_neon_qi *)__p8;

        return (uint8x16_t)__builtin_neon_vld1v16qi(__p);
#endif
#elif defined(__clang__)
        uint8x16_t __v = (uint8x16_t)__builtin_neon_vld1q_v(__p8, 48);
#ifndef __LITTLE_ENDIAN__
        __v = __builtin_shufflevector(__v, __v,
            15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
        return __v;
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vqtbl1q_u8(uint8x16_t __tab, uint8x16_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
        uint8x16_t __res;
        __asm__("tbl %0.16b, {%1.16b}, %2.16b"
            : "=w"(__res) : "w"(__tab), "w"(__idx));
        return __res;
#else
        /*
         * No native ARMv7 NEON instruction for this, so do it via two
         * half-width TBLs instead (vtbl2_u8 equivalent).
         */
        uint64x2_t __tab64 = (uint64x2_t)__tab;
        uint8x8_t __tablo = (uint8x8_t)__tab64[0];
        uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
        uint8x8x2_t __tab8x8x2 = { { __tablo, __tabhi } };
        union {
                uint8x8x2_t __u8x8x2;
                __builtin_neon_ti __ti;
        } __u = { __tab8x8x2 };
        uint64x2_t __idx64, __out64;
        int8x8_t __idxlo, __idxhi, __outlo, __outhi;

        __idx64 = (uint64x2_t)__idx;
        __idxlo = (int8x8_t)__idx64[0];
        __idxhi = (int8x8_t)__idx64[1];
        __outlo = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxlo);
        __outhi = (int8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, __idxhi);
        __out64 = (uint64x2_t) { (uint64x1_t)__outlo, (uint64x1_t)__outhi };

        return (uint8x16_t)__out64;
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
        __tab = __builtin_shufflevector(__tab, __tab,
            15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
        __idx = __builtin_shufflevector(__idx, __idx,
            15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
        uint8x16_t __r;
#ifdef __aarch64__
        __r = __builtin_neon_vqtbl1q_v((int8x16_t)__tab, (int8x16_t)__idx, 48);
#else
        uint64x2_t __tab64 = (uint64x2_t)__tab;
        uint8x8_t __tablo = (uint8x8_t)__tab64[0];
        uint8x8_t __tabhi = (uint8x8_t)__tab64[1];
        uint64x2_t __idx64, __out64;
        int8x8_t __idxlo, __idxhi, __outlo, __outhi;

        __idx64 = (uint64x2_t)__idx;
        __idxlo = (int8x8_t)__idx64[0];
        __idxhi = (int8x8_t)__idx64[1];
        __outlo = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
            (int8x8_t)__tabhi, (int8x8_t)__idxlo, 16);
        __outhi = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tablo,
            (int8x8_t)__tabhi, (int8x8_t)__idxhi, 16);
        __out64 = (uint64x2_t) { (uint64_t)__outlo, (uint64_t)__outhi };
        __r = (uint8x16_t)__out64;
#endif
#ifndef __LITTLE_ENDIAN__
        __r = __builtin_shufflevector(__r, __r,
            15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
        return __r;
#endif
}

_INTRINSATTR
static __inline int32x4_t
vreinterpretq_s32_u8(uint8x16_t __v)
{
        return (int32x4_t)__v;
}

_INTRINSATTR
static __inline uint16x8_t
vreinterpretq_u16_u32(uint32x4_t __v)
{
        return (uint16x8_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u16(uint16x8_t __v)
{
        return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u64(uint64x2_t __v)
{
        return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint32x4_t
vreinterpretq_u32_u8(uint8x16_t __v)
{
        return (uint32x4_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u32(uint32x4_t __v)
{
        return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint64x2_t
vreinterpretq_u64_u8(uint8x16_t __v)
{
        return (uint64x2_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_s32(int32x4_t __v)
{
        return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u32(uint32x4_t __v)
{
        return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint8x16_t
vreinterpretq_u8_u64(uint64x2_t __v)
{
        return (uint8x16_t)__v;
}

_INTRINSATTR
static __inline uint16x8_t
vrev32q_u16(uint16x8_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
        return __builtin_shuffle(__v, (uint16x8_t) { 1,0, 3,2, 5,4, 7,6 });
#elif defined(__clang__)
        return __builtin_shufflevector(__v, __v, 1,0, 3,2, 5,4, 7,6);
#endif
}

_INTRINSATTR
static __inline uint8x16_t
vrev32q_u8(uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
        return __builtin_shuffle(__v,
            (uint8x16_t) { 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12 });
#elif defined(__clang__)
        return __builtin_shufflevector(__v, __v,
            3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
#endif
}

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsetq_lane_u32(uint32_t __x, uint32x4_t __v, uint8_t __i)
{
        __v[__neon_laneq_index(__v, __i)] = __x;
        return __v;
}
#elif defined(__clang__)
#define vsetq_lane_u32(__x, __v, __i) \
        (uint32x4_t)__builtin_neon_vsetq_lane_i32((__x), (int32x4_t)(__v), \
            __neon_laneq_index(__v, __i))
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint64x2_t
vsetq_lane_u64(uint64_t __x, uint64x2_t __v, uint8_t __i)
{
        __v[__neon_laneq_index(__v, __i)] = __x;
        return __v;
}
#elif defined(__clang__)
#define vsetq_lane_u64(__x, __v, __i) \
        (uint64x2_t)__builtin_neon_vsetq_lane_i64((__x), (int64x2_t)(__v), \
            __neon_laneq_index(__v, __i));
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vshlq_n_s32(int32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
        return (int32x4_t)__builtin_aarch64_ashlv4si(__v, __bits);
#else
        return (int32x4_t)__builtin_neon_vshl_nv4si(__v, __bits);
#endif
}
#elif defined(__clang__)
#define vshlq_n_s32(__v, __bits) \
        (int32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 34)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshlq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
        return (uint32x4_t)__builtin_aarch64_ashlv4si((int32x4_t)__v, __bits);
#else
        return (uint32x4_t)__builtin_neon_vshl_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define vshlq_n_u32(__v, __bits) \
        (uint32x4_t)__builtin_neon_vshlq_n_v((int32x4_t)(__v), (__bits), 50)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vshrq_n_u32(uint32x4_t __v, uint8_t __bits)
{
#ifdef __aarch64__
        return (uint32x4_t)__builtin_aarch64_lshrv4si((int32x4_t)__v, __bits);
#else
        return (uint32x4_t)__builtin_neon_vshru_nv4si((int32x4_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define vshrq_n_u32(__v, __bits) \
        (uint32x4_t)__builtin_neon_vshrq_n_v((int32x4_t)(__v), (__bits), 50)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint8x16_t
vshrq_n_u8(uint8x16_t __v, uint8_t __bits)
{
#ifdef __aarch64__
        return (uint8x16_t)__builtin_aarch64_lshrv16qi((int8x16_t)__v, __bits);
#else
        return (uint8x16_t)__builtin_neon_vshru_nv16qi((int8x16_t)__v, __bits);
#endif
}
#elif defined(__clang__)
#define vshrq_n_u8(__v, __bits) \
        (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)(__v), (__bits), 48)
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline int32x4_t
vsliq_n_s32(int32x4_t __vins, int32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
        return (int32x4_t)__builtin_aarch64_ssli_nv4si(__vins, __vsh, __bits);
#else
        return (int32x4_t)__builtin_neon_vsli_nv4si(__vins, __vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define vsliq_n_s32(__vins, __vsh, __bits) \
        (int32x4_t)__builtin_neon_vsliq_n_v((int32x4_t)(__vins), \
            (int32x4_t)(__vsh), (__bits), 34)
#else
#define vsliq_n_s32(__vins, __vsh, __bits) ( \
{ \
        int32x4_t __tvins = (__vins); \
        int32x4_t __tvsh = (__vsh); \
        uint8_t __tbits = (__bits); \
        int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
            3,2,1,0); \
        int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
            3,2,1,0); \
        int32x4_t __r = __builtin_neon_vsliq_n_v(__tvins, __tvsh, __tbits, \
            34); \
        __builtin_shufflevector(__r, __r, 3,2,1,0); \
})
#endif /* __LITTLE_ENDIAN__ */
#endif

#if defined(__GNUC__) && !defined(__clang__)
_INTRINSATTR
static __inline uint32x4_t
vsriq_n_u32(uint32x4_t __vins, uint32x4_t __vsh, uint8_t __bits)
{
#ifdef __aarch64__
        return __builtin_aarch64_usri_nv4si_uuus(__vins, __vsh, __bits);
#else
        return (uint32x4_t)__builtin_neon_vsri_nv4si((int32x4_t)__vins,
            (int32x4_t)__vsh, __bits);
#endif
}
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define vsriq_n_u32(__vins, __vsh, __bits) \
        (int32x4_t)__builtin_neon_vsriq_n_v((int32x4_t)(__vins), \
            (int32x4_t)(__vsh), (__bits), 34)
#else
#define vsriq_n_s32(__vins, __vsh, __bits) ( \
{ \
        int32x4_t __tvins = (__vins); \
        int32x4_t __tvsh = (__vsh); \
        uint8_t __tbits = (__bits); \
        int32x4_t __vins_r = __builtin_shufflevector(__tvins, __tvins, \
            3,2,1,0); \
        int32x4_t __vsh_r = __builtin_shufflevector(__tvsh, __tvsh, \
            3,2,1,0); \
        int32x4_t __r = __builtin_neon_vsriq_n_v(__tvins, __tvsh, __tbits, \
            34); \
        __builtin_shufflevector(__r, __r, 3,2,1,0); \
})
#endif
#endif

_INTRINSATTR
static __inline void
vst1q_u32(uint32_t *__p32, uint32x4_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
        __builtin_aarch64_simd_si *__p = (__builtin_aarch64_simd_si *)__p32;

        __builtin_aarch64_st1v4si(__p, (int32x4_t)__v);
#else
        __builtin_neon_si *__p = (__builtin_neon_si *)__p32;

        __builtin_neon_vst1v4si(__p, (int32x4_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
        __v = __builtin_shufflevector(__v, __v, 3,2,1,0);
#endif
        __builtin_neon_vst1q_v(__p32, __v, 50);
#endif
}

_INTRINSATTR
static __inline void
vst1q_u8(uint8_t *__p8, uint8x16_t __v)
{
#if defined(__GNUC__) && !defined(__clang__)
#ifdef __aarch64__
        __builtin_aarch64_simd_qi *__p = (__builtin_aarch64_simd_qi *)__p8;

        __builtin_aarch64_st1v16qi(__p, (int8x16_t)__v);
#else
        __builtin_neon_qi *__p = (__builtin_neon_qi *)__p8;

        __builtin_neon_vst1v16qi(__p, (int8x16_t)__v);
#endif
#elif defined(__clang__)
#ifndef __LITTLE_ENDIAN__
        __v = __builtin_shufflevector(__v, __v,
            15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
#endif
        __builtin_neon_vst1q_v(__p8, __v, 48);
#endif
}

#ifndef __aarch64__ /* XXX */

_INTRINSATTR
static __inline uint8x8_t
vtbl1_u8(uint8x8_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
        return (uint8x8_t)__builtin_neon_vtbl1v8qi((int8x8_t)__tab,
            (int8x8_t)__idx);
#elif defined(__clang__)
        uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
        __tab = __builtin_shufflevector(__tab, __tab, 7,6,5,4,3,2,1,0);
        __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
        __ret = (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__tab,
            (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
        __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
        return __ret;
#endif
}

_INTRINSATTR
static __inline uint8x8_t
vtbl2_u8(uint8x8x2_t __tab, uint8x8_t __idx)
{
#if defined(__GNUC__) && !defined(__clang__)
        union {
                uint8x8x2_t __u8x8x82;
                __builtin_neon_ti __ti;
        } __u = { __tab };
        return (uint8x8_t)__builtin_neon_vtbl2v8qi(__u.__ti, (int8x8_t)__idx);
#elif defined(__clang__)
        uint8x8_t __ret;
#ifndef __LITTLE_ENDIAN__
        __tab.val[0] = __builtin_shufflevector(__tab.val[0], __tab.val[0],
            7,6,5,4,3,2,1,0);
        __tab.val[1] = __builtin_shufflevector(__tab.val[1], __tab.val[1],
            7,6,5,4,3,2,1,0);
        __idx = __builtin_shufflevector(__idx, __idx, 7,6,5,4,3,2,1,0);
#endif
        __ret = (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__tab.val[0],
            (int8x8_t)__tab.val[1], (int8x8_t)__idx, 16);
#ifndef __LITTLE_ENDIAN__
        __ret = __builtin_shufflevector(__ret, __ret, 7,6,5,4,3,2,1,0);
#endif
        return __ret;
#endif
}

#endif /* !defined(__aarch64__) */

#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_H */
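The file above duplicated the AES copy verbatim, which is what makes the dedup safe. As a quick sanity check of the semantics these wrappers implement, a minimal standalone sketch (assuming a little-endian AArch64 toolchain and the system <arm_neon.h>, not the kernel copy) exercising vextq_u32, which the ChaCha NEON code uses to rotate state words across lanes:

        #include <arm_neon.h>
        #include <stdio.h>

        int
        main(void)
        {
                uint32x4_t lo = { 0, 1, 2, 3 }, hi = { 4, 5, 6, 7 };
                /* lanes 1..3 of lo, then lane 0 of hi */
                uint32x4_t r = vextq_u32(lo, hi, 1);

                printf("%u %u %u %u\n", vgetq_lane_u32(r, 0),
                    vgetq_lane_u32(r, 1), vgetq_lane_u32(r, 2),
                    vgetq_lane_u32(r, 3));      /* prints: 1 2 3 4 */
                return 0;
        }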
sys/crypto/chacha/arch/arm/arm_neon_imm.h (file deleted)

@@ -1,80 +0,0 @@
/* $NetBSD: arm_neon_imm.h,v 1.2 2020/08/09 01:59:04 riastradh Exp $ */

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_IMM_H
#define _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_IMM_H

/*
 * Non-standard macros for writing ARM NEON vector literals.  Needed
 * because apparently GCC and Clang disagree wildly on the ordering of
 * vector literal components -- and both disagree with the
 * architectural indexing!
 */

#if defined(__GNUC__) && !defined(__clang__)
#if defined(__AARCH64EB__)
#define V_N_U8(a,b,c,d,e,f,g,h) \
        {h,g,f,e,d,c,b,a}
#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
        {p,o,n,m,l,k,j,i, h,g,f,e,d,c,b,a}
#define VQ_N_U32(a,b,c,d) \
        {d,c, b,a}
#elif defined(__ARM_BIG_ENDIAN)
#define V_N_U8(a,b,c,d,e,f,g,h) \
        {h,g,f,e,d,c,b,a}
#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
        {h,g,f,e,d,c,b,a, p,o,n,m,l,k,j,i}
#define VQ_N_U32(a,b,c,d) \
        {b,a, d,c}
#else
#define V_N_U8(a,b,c,d,e,f,g,h) \
        {a,b,c,d,e,f,g,h}
#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
        {a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p}
#define VQ_N_U32(a,b,c,d) \
        {a,b, c,d}
#endif
#elif defined(__clang__)
#ifdef __LITTLE_ENDIAN__
#define V_N_U8(a,b,c,d,e,f,g,h) \
        {a,b,c,d,e,f,g,h}
#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
        {a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p}
#define VQ_N_U32(a,b,c,d) \
        {a,b, c,d}
#else
#define V_N_U8(a,b,c,d,e,f,g,h) \
        {h,g,f,e,d,c,b,a}
#define VQ_N_U8(a,b,c,d,e,f,g,h, i,j,k,l,m,n,o,p) \
        {p,o,n,m,l,k,j,i, h,g,f,e,d,c,b,a}
#define VQ_N_U32(a,b,c,d) \
        {d,c, b,a}
#endif
#endif

#endif /* _SYS_CRYPTO_CHACHA_ARCH_ARM_ARM_NEON_IMM_H */
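This file, too, was an exact copy of the AES version. The practical effect of the macros: a 128-bit constant is written once in architectural lane order and comes out with lane 0 holding the first component under both compilers and both endiannesses. A sketch of typical use (assuming the typedefs and macros above are in scope; the constant shown is ChaCha's "expand 32-byte k" words, used only as an example):

        static const uint32x4_t chacha_const =
            VQ_N_U32(0x61707865, 0x3320646e, 0x79622d32, 0x6b206574);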
sys/crypto/chacha/arch/arm/chacha_neon.c

@@ -1,4 +1,4 @@
-/* $NetBSD: chacha_neon.c,v 1.8 2020/08/08 14:47:01 riastradh Exp $ */
+/* $NetBSD: chacha_neon.c,v 1.9 2023/08/07 01:07:36 rin Exp $ */

 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -29,8 +29,8 @@
 #include <sys/types.h>
 #include <sys/endian.h>

-#include "arm_neon.h"
-#include "arm_neon_imm.h"
+#include <crypto/arch/arm/arm_neon.h>
+#include <crypto/arch/arm/arm_neon_imm.h>
 #include "chacha_neon.h"

 /*
sys/crypto/chacha/arch/x86/chacha_sse2.c

@@ -1,4 +1,4 @@
-/* $NetBSD: chacha_sse2.c,v 1.2 2020/07/27 20:48:18 riastradh Exp $ */
+/* $NetBSD: chacha_sse2.c,v 1.3 2023/08/07 01:07:36 rin Exp $ */

 /*-
  * Copyright (c) 2020 The NetBSD Foundation, Inc.
@@ -29,7 +29,7 @@
 #include <sys/types.h>
 #include <sys/endian.h>

-#include "immintrin.h"
+#include <crypto/arch/x86/immintrin.h>

 #include "chacha_sse2.h"
sys/crypto/chacha/arch/x86/immintrin.h (file deleted)

@@ -1,351 +0,0 @@
/* $NetBSD: immintrin.h,v 1.1 2020/07/25 22:49:20 riastradh Exp $ */

/*-
 * Copyright (c) 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _SYS_CRYPTO_CHACHA_ARCH_X86_IMMINTRIN_H
#define _SYS_CRYPTO_CHACHA_ARCH_X86_IMMINTRIN_H

#include <sys/types.h>

/*
 * This kludgerous header file provides definitions for the Intel
 * intrinsics that work with GCC and Clang, because <immintrin.h> is
 * not available during the kernel build and arranging to make it
 * available is complicated.  Please fix this properly!
 */

#if defined(__GNUC__) && !defined(__clang__)

#define _INTRINSATTR \
        __attribute__((__gnu_inline__, __always_inline__, __artificial__))
#define _PACKALIAS

typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef long long __m128i_u
        __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#elif defined(__clang__)

typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i
        __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i_u
        __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

#define _INTRINSATTR \
        __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
            __min_vector_width__(128)))
#define _PACKALIAS \
        __attribute__((__packed__, __may_alias__))

#else

#error Please teach me how to do Intel intrinsics for your compiler!

#endif

#define _SSSE3_ATTR __attribute__((target("ssse3")))

_INTRINSATTR
static __inline __m128i
_mm_add_epi32(__m128i __a, __m128i __b)
{
        return (__m128i)((__v4su)__a + (__v4su)__b);
}

#if defined(__GNUC__) && !defined(__clang__)
#define _mm_alignr_epi8(hi,lo,bytes) \
        (__m128i)__builtin_ia32_palignr128((__v2di)(__m128i)(hi), \
            (__v2di)(__m128i)(lo), 8*(int)(bytes))
#elif defined(__clang__)
#define _mm_alignr_epi8(hi,lo,bytes) \
        (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(hi), \
            (__v16qi)(__m128i)(lo), (int)(bytes))
#endif

_INTRINSATTR
static __inline __m128
_mm_load1_ps(const float *__p)
{
        return __extension__ (__m128)(__v4sf) { *__p, *__p, *__p, *__p };
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si128(const __m128i_u *__p)
{
        return ((const struct { __m128i_u __v; } _PACKALIAS *)__p)->__v;
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si32(const void *__p)
{
        int32_t __v = ((const struct { int32_t __v; } _PACKALIAS *)__p)->__v;
        return __extension__ (__m128i)(__v4si){ __v, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_loadu_si64(const void *__p)
{
        int64_t __v = ((const struct { int64_t __v; } _PACKALIAS *)__p)->__v;
        return __extension__ (__m128i)(__v2di){ __v, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_load_si128(const __m128i *__p)
{
        return *__p;
}

_INTRINSATTR
static __inline __m128
_mm_movehl_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
        return (__m128)__builtin_ia32_movhlps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
        return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 6,7,2,3);
#endif
}

_INTRINSATTR
static __inline __m128
_mm_movelh_ps(__m128 __v0, __m128 __v1)
{
#if defined(__GNUC__) && !defined(__clang__)
        return (__m128)__builtin_ia32_movlhps((__v4sf)__v0, (__v4sf)__v1);
#elif defined(__clang__)
        return __builtin_shufflevector((__v4sf)__v0, (__v4sf)__v1, 0,1,4,5);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi16(int16_t __v)
{
        return __extension__ (__m128i)(__v8hi){
            __v, __v, __v, __v, __v, __v, __v, __v
        };
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi32(int32_t __v)
{
        return __extension__ (__m128i)(__v4si){ __v, __v, __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set1_epi64x(int64_t __v)
{
        return __extension__ (__m128i)(__v2di){ __v, __v };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi32(int32_t __v3, int32_t __v2, int32_t __v1, int32_t __v0)
{
        return __extension__ (__m128i)(__v4si){ __v0, __v1, __v2, __v3 };
}

_INTRINSATTR
static __inline __m128i
_mm_set_epi64x(int64_t __v1, int64_t __v0)
{
        return __extension__ (__m128i)(__v2di){ __v0, __v1 };
}

_INTRINSATTR
static __inline __m128
_mm_setzero_ps(void)
{
        return __extension__ (__m128){ 0, 0, 0, 0 };
}

_INTRINSATTR
static __inline __m128i
_mm_setzero_si128(void)
{
        return _mm_set1_epi64x(0);
}

_INTRINSATTR _SSSE3_ATTR
static __inline __m128i
_mm_shuffle_epi8(__m128i __vtbl, __m128i __vidx)
{
        return (__m128i)__builtin_ia32_pshufb128((__v16qi)__vtbl,
            (__v16qi)__vidx);
}

#define _mm_shuffle_epi32(v,m) \
        (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(v), (int)(m))

#define _mm_shuffle_ps(x,y,m) \
        (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(x), \
            (__v4sf)(__m128)(y), (int)(m)) \

_INTRINSATTR
static __inline __m128i
_mm_slli_epi32(__m128i __v, uint8_t __bits)
{
        return (__m128i)__builtin_ia32_pslldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_slli_epi64(__m128i __v, uint8_t __bits)
{
        return (__m128i)__builtin_ia32_psllqi128((__v2di)__v, (int)__bits);
}

#if defined(__GNUC__) && !defined(__clang__)
#define _mm_slli_si128(v,bytes) \
        (__m128i)__builtin_ia32_pslldqi128((__v2di)(__m128i)(v), \
            8*(int)(bytes))
#elif defined(__clang__)
#define _mm_slli_si128(v,bytes) \
        (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(v), \
            (int)(bytes))
#endif

_INTRINSATTR
static __inline __m128i
_mm_srli_epi32(__m128i __v, uint8_t __bits)
{
        return (__m128i)__builtin_ia32_psrldi128((__v4si)__v, (int)__bits);
}

_INTRINSATTR
static __inline __m128i
_mm_srli_epi64(__m128i __v, uint8_t __bits)
{
        return (__m128i)__builtin_ia32_psrlqi128((__v2di)__v, (int)__bits);
}

#if defined(__GNUC__) && !defined(__clang__)
#define _mm_srli_si128(v,bytes) \
        (__m128i)__builtin_ia32_psrldqi128((__m128i)(v), 8*(int)(bytes))
#elif defined(__clang__)
#define _mm_srli_si128(v,bytes) \
        (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(v), \
            (int)(bytes));
#endif

_INTRINSATTR
static __inline void
_mm_storeu_si128(__m128i_u *__p, __m128i __v)
{
        ((struct { __m128i_u __v; } _PACKALIAS *)__p)->__v = __v;
}

_INTRINSATTR
static __inline void
_mm_storeu_si32(void *__p, __m128i __v)
{
        ((struct { int32_t __v; } _PACKALIAS *)__p)->__v = ((__v4si)__v)[0];
}

_INTRINSATTR
static __inline void
_mm_storeu_si64(void *__p, __m128i __v)
{
        ((struct { int64_t __v; } _PACKALIAS *)__p)->__v = ((__v2di)__v)[0];
}

_INTRINSATTR
static __inline void
_mm_store_si128(__m128i *__p, __m128i __v)
{
        *__p = __v;
}

_INTRINSATTR
static __inline __m128i
_mm_sub_epi64(__m128i __x, __m128i __y)
{
        return (__m128i)((__v2du)__x - (__v2du)__y);
}

_INTRINSATTR
static __inline __m128i
_mm_unpackhi_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
        return (__m128i)__builtin_ia32_punpckhdq128((__v4si)__lo,
            (__v4si)__hi);
#elif defined(__clang__)
        return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
            2,6,3,7);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi32(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
        return (__m128i)__builtin_ia32_punpckldq128((__v4si)__lo,
            (__v4si)__hi);
#elif defined(__clang__)
        return (__m128i)__builtin_shufflevector((__v4si)__lo, (__v4si)__hi,
            0,4,1,5);
#endif
}

_INTRINSATTR
static __inline __m128i
_mm_unpacklo_epi64(__m128i __lo, __m128i __hi)
{
#if defined(__GNUC__) && !defined(__clang__)
        return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)__lo,
            (__v2di)__hi);
#elif defined(__clang__)
        return (__m128i)__builtin_shufflevector((__v2di)__lo, (__v2di)__hi,
            0,2);
#endif
}

#endif /* _SYS_CRYPTO_CHACHA_ARCH_X86_IMMINTRIN_H */
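The polyfill above covers exactly the handful of SSE2/SSSE3 operations the bitsliced AES and ChaCha code needs; notably there is no _mm_or_si128, because the vector typedefs support the | operator directly under both GCC and Clang. A hedged sketch (the helper name rol32x4 is hypothetical, not from the tree) of how ChaCha's 32-bit left rotation can be composed from these intrinsics:

        static inline __m128i
        rol32x4(__m128i x, int n)
        {
                /* vector-extension | stands in for the missing _mm_or_si128 */
                return _mm_slli_epi32(x, n) | _mm_srli_epi32(x, 32 - n);
        }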