commit 13d6b16081

Merge tag 'pull-crypto-20230915' of https://gitlab.com/rth7680/qemu into staging

Unify implementation of carry-less multiply.
Accelerate carry-less multiply for 64x64->128.

# -----BEGIN PGP SIGNATURE-----
#
# iQFRBAABCgA7FiEEekgeeIaLTbaoWgXAZN846K9+IV8FAmUEiPodHHJpY2hhcmQu
# aGVuZGVyc29uQGxpbmFyby5vcmcACgkQZN846K9+IV/akgf/XkiIeErWJr1YXSbS
# YPQtCsDAfIrqn3RiyQ2uwSn2eeuwVqTFFPGER04YegRDK8dyO874JBfvOwmBT70J
# I/aU8Z4BbRyNu9nfaCtFMlXQH9KArAKcAds1PnshfcnI5T2yBloZ1sAU97IuJFZk
# Uuz96H60+ohc4wzaUiPqPhXQStgZeSYwwAJB0s25DhCckdea0udRCAJ1tQTVpxkM
# wIFef1SHPoM6DtMzFKHLLUH6VivSlHjqx8GqFusa7pVqfQyDzNBfwvDl1F/bkE07
# yTocQEkV3QnZvIplhqUxAaZXIFZr9BNk7bDimMjHW6z3pNPN3T8zRn4trNjxbgPV
# jqzAtg==
# =8nnk
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 15 Sep 2023 12:40:26 EDT
# gpg: using RSA key 7A481E78868B4DB6A85A05C064DF38E8AF7E215F
# gpg: issuer "richard.henderson@linaro.org"
# gpg: Good signature from "Richard Henderson <richard.henderson@linaro.org>" [full]
# Primary key fingerprint: 7A48 1E78 868B 4DB6 A85A 05C0 64DF 38E8 AF7E 215F

* tag 'pull-crypto-20230915' of https://gitlab.com/rth7680/qemu:
  host/include/aarch64: Implement clmul.h
  host/include/i386: Implement clmul.h
  target/ppc: Use clmul_64
  target/s390x: Use clmul_64
  target/i386: Use clmul_64
  target/arm: Use clmul_64
  crypto: Add generic 64-bit carry-less multiply routine
  target/ppc: Use clmul_32* routines
  target/s390x: Use clmul_32* routines
  target/arm: Use clmul_32* routines
  crypto: Add generic 32-bit carry-less multiply routines
  target/ppc: Use clmul_16* routines
  target/s390x: Use clmul_16* routines
  target/arm: Use clmul_16* routines
  crypto: Add generic 16-bit carry-less multiply routines
  target/ppc: Use clmul_8* routines
  target/s390x: Use clmul_8* routines
  target/arm: Use clmul_8* routines
  crypto: Add generic 8-bit carry-less multiply routines

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
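For readers new to the operation: a carry-less (polynomial) multiply is ordinary long multiplication in which the partial products are combined with XOR instead of addition, i.e. multiplication of polynomials over GF(2). The sketch below is illustrative only and is not part of the pull request; it shows the operation at 8x8->16 width.

#include <stdint.h>
#include <stdio.h>

/* Reference 8x8->16 carry-less multiply: XOR together the shifted copies
 * of m selected by the set bits of n; no carries propagate between columns. */
static uint16_t clmul_8x8_ref(uint8_t n, uint8_t m)
{
    uint16_t r = 0;

    for (int i = 0; i < 8; ++i) {
        if (n & (1u << i)) {
            r ^= (uint16_t)m << i;
        }
    }
    return r;
}

int main(void)
{
    /* 0b0101 x 0b0011 = 0b0101 ^ 0b1010 = 0b1111 */
    printf("0x%x\n", clmul_8x8_ref(0x05, 0x03));    /* prints 0xf */
    return 0;
}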
crypto/clmul.c (new file)
@@ -0,0 +1,111 @@
/*
 * Carry-less multiply operations.
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * Copyright (C) 2023 Linaro, Ltd.
 */

#include "qemu/osdep.h"
#include "crypto/clmul.h"

uint64_t clmul_8x8_low(uint64_t n, uint64_t m)
{
    uint64_t r = 0;

    for (int i = 0; i < 8; ++i) {
        uint64_t mask = (n & 0x0101010101010101ull) * 0xff;
        r ^= m & mask;
        m = (m << 1) & 0xfefefefefefefefeull;
        n >>= 1;
    }
    return r;
}

static uint64_t clmul_8x4_even_int(uint64_t n, uint64_t m)
{
    uint64_t r = 0;

    for (int i = 0; i < 8; ++i) {
        uint64_t mask = (n & 0x0001000100010001ull) * 0xffff;
        r ^= m & mask;
        n >>= 1;
        m <<= 1;
    }
    return r;
}

uint64_t clmul_8x4_even(uint64_t n, uint64_t m)
{
    n &= 0x00ff00ff00ff00ffull;
    m &= 0x00ff00ff00ff00ffull;
    return clmul_8x4_even_int(n, m);
}

uint64_t clmul_8x4_odd(uint64_t n, uint64_t m)
{
    return clmul_8x4_even(n >> 8, m >> 8);
}

static uint64_t unpack_8_to_16(uint64_t x)
{
    return  (x & 0x000000ff)
         | ((x & 0x0000ff00) << 8)
         | ((x & 0x00ff0000) << 16)
         | ((x & 0xff000000) << 24);
}

uint64_t clmul_8x4_packed(uint32_t n, uint32_t m)
{
    return clmul_8x4_even_int(unpack_8_to_16(n), unpack_8_to_16(m));
}

uint64_t clmul_16x2_even(uint64_t n, uint64_t m)
{
    uint64_t r = 0;

    n &= 0x0000ffff0000ffffull;
    m &= 0x0000ffff0000ffffull;

    for (int i = 0; i < 16; ++i) {
        uint64_t mask = (n & 0x0000000100000001ull) * 0xffffffffull;
        r ^= m & mask;
        n >>= 1;
        m <<= 1;
    }
    return r;
}

uint64_t clmul_16x2_odd(uint64_t n, uint64_t m)
{
    return clmul_16x2_even(n >> 16, m >> 16);
}

uint64_t clmul_32(uint32_t n, uint32_t m32)
{
    uint64_t r = 0;
    uint64_t m = m32;

    for (int i = 0; i < 32; ++i) {
        r ^= n & 1 ? m : 0;
        n >>= 1;
        m <<= 1;
    }
    return r;
}

Int128 clmul_64_gen(uint64_t n, uint64_t m)
{
    uint64_t rl = 0, rh = 0;

    /* Bit 0 can only influence the low 64-bit result. */
    if (n & 1) {
        rl = m;
    }

    for (int i = 1; i < 64; ++i) {
        uint64_t mask = -((n >> i) & 1);
        rl ^= (m << i) & mask;
        rh ^= (m >> (64 - i)) & mask;
    }
    return int128_make128(rl, rh);
}
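As a mental model for clmul_64_gen() above, the same 64x64->128 product can be written directly with the Int128 helpers QEMU already provides (int128_make64, int128_lshift, int128_xor, all of which appear elsewhere in this series). This is only an illustrative, slower equivalent and is not code from the commit:

#include "qemu/int128.h"

/* Bit-at-a-time reference: XOR in (m << i) as a 128-bit value for every
 * set bit i of n.  clmul_64_gen() computes the same product with the low
 * and high 64-bit halves accumulated separately. */
static Int128 clmul_64_ref(uint64_t n, uint64_t m)
{
    Int128 r = int128_zero();

    for (int i = 0; i < 64; ++i) {
        if ((n >> i) & 1) {
            r = int128_xor(r, int128_lshift(int128_make64(m), i));
        }
    }
    return r;
}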
crypto/meson.build
@@ -48,9 +48,12 @@ if have_afalg
 endif
 crypto_ss.add(when: gnutls, if_true: files('tls-cipher-suites.c'))

-util_ss.add(files('sm4.c'))
-util_ss.add(files('aes.c'))
-util_ss.add(files('init.c'))
+util_ss.add(files(
+  'aes.c',
+  'clmul.c',
+  'init.c',
+  'sm4.c',
+))
 if gnutls.found()
   util_ss.add(gnutls)
 endif
host/include/aarch64/host/cpuinfo.h
@@ -10,6 +10,7 @@
 #define CPUINFO_LSE (1u << 1)
 #define CPUINFO_LSE2 (1u << 2)
 #define CPUINFO_AES (1u << 3)
+#define CPUINFO_PMULL (1u << 4)

 /* Initialized with a constructor. */
 extern unsigned cpuinfo;
host/include/aarch64/host/crypto/clmul.h (new file)
@@ -0,0 +1,41 @@
/*
 * AArch64 specific clmul acceleration.
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#ifndef AARCH64_HOST_CRYPTO_CLMUL_H
#define AARCH64_HOST_CRYPTO_CLMUL_H

#include "host/cpuinfo.h"
#include <arm_neon.h>

/*
 * 64x64->128 pmull is available with FEAT_PMULL.
 * Both FEAT_AES and FEAT_PMULL are covered under the same macro.
 */
#ifdef __ARM_FEATURE_AES
# define HAVE_CLMUL_ACCEL true
#else
# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PMULL)
#endif
#if !defined(__ARM_FEATURE_AES) && defined(CONFIG_ARM_AES_BUILTIN)
# define ATTR_CLMUL_ACCEL __attribute__((target("+crypto")))
#else
# define ATTR_CLMUL_ACCEL
#endif

static inline Int128 ATTR_CLMUL_ACCEL
clmul_64_accel(uint64_t n, uint64_t m)
{
    union { poly128_t v; Int128 s; } u;

#ifdef CONFIG_ARM_AES_BUILTIN
    u.v = vmull_p64((poly64_t)n, (poly64_t)m);
#else
    asm(".arch_extension aes\n\t"
        "pmull %0.1q, %1.1d, %2.1d" : "=w"(u.v) : "w"(n), "w"(m));
#endif
    return u.s;
}

#endif /* AARCH64_HOST_CRYPTO_CLMUL_H */
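A hedged standalone check of this accelerated path, assuming an AArch64 toolchain with the crypto extension enabled at compile time (for example -march=armv8-a+crypto); it is not part of the commit. The union mirrors the type punning used in clmul_64_accel() above, here against a plain uint64_t pair instead of Int128:

#include <arm_neon.h>
#include <stdint.h>
#include <assert.h>

static void check_pmull(uint64_t n, uint64_t m)
{
    union { poly128_t v; uint64_t d[2]; } u;   /* assumes little-endian lane order */
    uint64_t rl = 0, rh = 0;

    u.v = vmull_p64((poly64_t)n, (poly64_t)m);

    /* Bit-by-bit reference for the 64x64->128 carry-less product. */
    for (int i = 0; i < 64; ++i) {
        if ((n >> i) & 1) {
            rl ^= m << i;
            if (i) {
                rh ^= m >> (64 - i);
            }
        }
    }
    assert(u.d[0] == rl && u.d[1] == rh);
}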
host/include/generic/host/crypto/clmul.h (new file)
@@ -0,0 +1,15 @@
/*
 * No host specific carry-less multiply acceleration.
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#ifndef GENERIC_HOST_CRYPTO_CLMUL_H
#define GENERIC_HOST_CRYPTO_CLMUL_H

#define HAVE_CLMUL_ACCEL false
#define ATTR_CLMUL_ACCEL

Int128 clmul_64_accel(uint64_t, uint64_t)
    QEMU_ERROR("unsupported accel");

#endif /* GENERIC_HOST_CRYPTO_CLMUL_H */
host/include/i386/host/cpuinfo.h
@@ -27,6 +27,7 @@
 #define CPUINFO_ATOMIC_VMOVDQA (1u << 16)
 #define CPUINFO_ATOMIC_VMOVDQU (1u << 17)
 #define CPUINFO_AES (1u << 18)
+#define CPUINFO_PCLMUL (1u << 19)

 /* Initialized with a constructor. */
 extern unsigned cpuinfo;
host/include/i386/host/crypto/clmul.h (new file)
@@ -0,0 +1,29 @@
/*
 * x86 specific clmul acceleration.
 * SPDX-License-Identifier: GPL-2.0-or-later
 */

#ifndef X86_HOST_CRYPTO_CLMUL_H
#define X86_HOST_CRYPTO_CLMUL_H

#include "host/cpuinfo.h"
#include <immintrin.h>

#if defined(__PCLMUL__)
# define HAVE_CLMUL_ACCEL true
# define ATTR_CLMUL_ACCEL
#else
# define HAVE_CLMUL_ACCEL likely(cpuinfo & CPUINFO_PCLMUL)
# define ATTR_CLMUL_ACCEL __attribute__((target("pclmul")))
#endif

static inline Int128 ATTR_CLMUL_ACCEL
clmul_64_accel(uint64_t n, uint64_t m)
{
    union { __m128i v; Int128 s; } u;

    u.v = _mm_clmulepi64_si128(_mm_set_epi64x(0, n), _mm_set_epi64x(0, m), 0);
    return u.s;
}

#endif /* X86_HOST_CRYPTO_CLMUL_H */

host/include/x86_64/host/crypto/clmul.h (new file)
@@ -0,0 +1 @@
#include "host/include/i386/host/crypto/clmul.h"
include/crypto/clmul.h (new file)
@@ -0,0 +1,83 @@
/*
 * Carry-less multiply operations.
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * Copyright (C) 2023 Linaro, Ltd.
 */

#ifndef CRYPTO_CLMUL_H
#define CRYPTO_CLMUL_H

#include "qemu/int128.h"
#include "host/crypto/clmul.h"

/**
 * clmul_8x8_low:
 *
 * Perform eight 8x8->8 carry-less multiplies.
 */
uint64_t clmul_8x8_low(uint64_t, uint64_t);

/**
 * clmul_8x4_even:
 *
 * Perform four 8x8->16 carry-less multiplies.
 * The odd bytes of the inputs are ignored.
 */
uint64_t clmul_8x4_even(uint64_t, uint64_t);

/**
 * clmul_8x4_odd:
 *
 * Perform four 8x8->16 carry-less multiplies.
 * The even bytes of the inputs are ignored.
 */
uint64_t clmul_8x4_odd(uint64_t, uint64_t);

/**
 * clmul_8x4_packed:
 *
 * Perform four 8x8->16 carry-less multiplies.
 */
uint64_t clmul_8x4_packed(uint32_t, uint32_t);

/**
 * clmul_16x2_even:
 *
 * Perform two 16x16->32 carry-less multiplies.
 * The odd words of the inputs are ignored.
 */
uint64_t clmul_16x2_even(uint64_t, uint64_t);

/**
 * clmul_16x2_odd:
 *
 * Perform two 16x16->32 carry-less multiplies.
 * The even words of the inputs are ignored.
 */
uint64_t clmul_16x2_odd(uint64_t, uint64_t);

/**
 * clmul_32:
 *
 * Perform a 32x32->64 carry-less multiply.
 */
uint64_t clmul_32(uint32_t, uint32_t);

/**
 * clmul_64:
 *
 * Perform a 64x64->128 carry-less multiply.
 */
Int128 clmul_64_gen(uint64_t, uint64_t);

static inline Int128 clmul_64(uint64_t a, uint64_t b)
{
    if (HAVE_CLMUL_ACCEL) {
        return clmul_64_accel(a, b);
    } else {
        return clmul_64_gen(a, b);
    }
}

#endif /* CRYPTO_CLMUL_H */
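A minimal usage sketch for the new public API (not part of the commit): callers always go through clmul_64(), which dispatches to the host-accelerated clmul_64_accel() when HAVE_CLMUL_ACCEL is true and to clmul_64_gen() otherwise, and then split the Int128 result with the existing int128_getlo()/int128_gethi() helpers. The helper name below is hypothetical.

#include "qemu/osdep.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"

/* Hypothetical helper: one 64x64->128 carry-less multiply, returned as
 * two 64-bit halves the way the target helpers below consume it. */
static void clmul_64_split(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
    Int128 r = clmul_64(a, b);

    *lo = int128_getlo(r);
    *hi = int128_gethi(r);
}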
include/qemu/cpuid.h
@@ -25,6 +25,9 @@
 #endif

 /* Leaf 1, %ecx */
+#ifndef bit_PCLMUL
+#define bit_PCLMUL (1 << 1)
+#endif
 #ifndef bit_SSE4_1
 #define bit_SSE4_1 (1 << 19)
 #endif
target/arm/tcg/mve_helper.c
@@ -26,6 +26,7 @@
 #include "exec/exec-all.h"
 #include "tcg/tcg.h"
 #include "fpu/softfloat.h"
+#include "crypto/clmul.h"

 static uint16_t mve_eci_mask(CPUARMState *env)
 {
@@ -984,17 +985,10 @@ DO_2OP_L(vmulltuw, 1, 4, uint32_t, 8, uint64_t, DO_MUL)
  * Polynomial multiply. We can always do this generating 64 bits
  * of the result at a time, so we don't need to use DO_2OP_L.
  */
-#define VMULLPH_MASK 0x00ff00ff00ff00ffULL
-#define VMULLPW_MASK 0x0000ffff0000ffffULL
-#define DO_VMULLPBH(N, M) pmull_h((N) & VMULLPH_MASK, (M) & VMULLPH_MASK)
-#define DO_VMULLPTH(N, M) DO_VMULLPBH((N) >> 8, (M) >> 8)
-#define DO_VMULLPBW(N, M) pmull_w((N) & VMULLPW_MASK, (M) & VMULLPW_MASK)
-#define DO_VMULLPTW(N, M) DO_VMULLPBW((N) >> 16, (M) >> 16)
-
-DO_2OP(vmullpbh, 8, uint64_t, DO_VMULLPBH)
-DO_2OP(vmullpth, 8, uint64_t, DO_VMULLPTH)
-DO_2OP(vmullpbw, 8, uint64_t, DO_VMULLPBW)
-DO_2OP(vmullptw, 8, uint64_t, DO_VMULLPTW)
+DO_2OP(vmullpbh, 8, uint64_t, clmul_8x4_even)
+DO_2OP(vmullpth, 8, uint64_t, clmul_8x4_odd)
+DO_2OP(vmullpbw, 8, uint64_t, clmul_16x2_even)
+DO_2OP(vmullptw, 8, uint64_t, clmul_16x2_odd)

 /*
  * Because the computation type is at least twice as large as required,
target/arm/tcg/vec_helper.c
@@ -23,6 +23,7 @@
 #include "tcg/tcg-gvec-desc.h"
 #include "fpu/softfloat.h"
 #include "qemu/int128.h"
+#include "crypto/clmul.h"
 #include "vec_internal.h"

 /*
@@ -1986,21 +1987,11 @@ void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
  */
 void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
 {
-    intptr_t i, j, opr_sz = simd_oprsz(desc);
+    intptr_t i, opr_sz = simd_oprsz(desc);
     uint64_t *d = vd, *n = vn, *m = vm;

     for (i = 0; i < opr_sz / 8; ++i) {
-        uint64_t nn = n[i];
-        uint64_t mm = m[i];
-        uint64_t rr = 0;
-
-        for (j = 0; j < 8; ++j) {
-            uint64_t mask = (nn & 0x0101010101010101ull) * 0xff;
-            rr ^= mm & mask;
-            mm = (mm << 1) & 0xfefefefefefefefeull;
-            nn >>= 1;
-        }
-        d[i] = rr;
+        d[i] = clmul_8x8_low(n[i], m[i]);
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
@@ -2012,84 +2003,28 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
  */
 void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
 {
-    intptr_t i, j, opr_sz = simd_oprsz(desc);
+    intptr_t i, opr_sz = simd_oprsz(desc);
     intptr_t hi = simd_data(desc);
     uint64_t *d = vd, *n = vn, *m = vm;

     for (i = 0; i < opr_sz / 8; i += 2) {
-        uint64_t nn = n[i + hi];
-        uint64_t mm = m[i + hi];
-        uint64_t rhi = 0;
-        uint64_t rlo = 0;
-
-        /* Bit 0 can only influence the low 64-bit result. */
-        if (nn & 1) {
-            rlo = mm;
-        }
-
-        for (j = 1; j < 64; ++j) {
-            uint64_t mask = -((nn >> j) & 1);
-            rlo ^= (mm << j) & mask;
-            rhi ^= (mm >> (64 - j)) & mask;
-        }
-        d[i] = rlo;
-        d[i + 1] = rhi;
+        Int128 r = clmul_64(n[i + hi], m[i + hi]);
+        d[i] = int128_getlo(r);
+        d[i + 1] = int128_gethi(r);
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }

-/*
- * 8x8->16 polynomial multiply.
- *
- * The byte inputs are expanded to (or extracted from) half-words.
- * Note that neon and sve2 get the inputs from different positions.
- * This allows 4 bytes to be processed in parallel with uint64_t.
- */
-
-static uint64_t expand_byte_to_half(uint64_t x)
-{
-    return  (x & 0x000000ff)
-         | ((x & 0x0000ff00) << 8)
-         | ((x & 0x00ff0000) << 16)
-         | ((x & 0xff000000) << 24);
-}
-
-uint64_t pmull_w(uint64_t op1, uint64_t op2)
-{
-    uint64_t result = 0;
-    int i;
-    for (i = 0; i < 16; ++i) {
-        uint64_t mask = (op1 & 0x0000000100000001ull) * 0xffffffff;
-        result ^= op2 & mask;
-        op1 >>= 1;
-        op2 <<= 1;
-    }
-    return result;
-}
-
-uint64_t pmull_h(uint64_t op1, uint64_t op2)
-{
-    uint64_t result = 0;
-    int i;
-    for (i = 0; i < 8; ++i) {
-        uint64_t mask = (op1 & 0x0001000100010001ull) * 0xffff;
-        result ^= op2 & mask;
-        op1 >>= 1;
-        op2 <<= 1;
-    }
-    return result;
-}
-
 void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     int hi = simd_data(desc);
     uint64_t *d = vd, *n = vn, *m = vm;
     uint64_t nn = n[hi], mm = m[hi];

-    d[0] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+    d[0] = clmul_8x4_packed(nn, mm);
     nn >>= 32;
     mm >>= 32;
-    d[1] = pmull_h(expand_byte_to_half(nn), expand_byte_to_half(mm));
+    d[1] = clmul_8x4_packed(nn, mm);

     clear_tail(d, 16, simd_maxsz(desc));
 }
@@ -2102,25 +2037,10 @@ void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
     uint64_t *d = vd, *n = vn, *m = vm;

     for (i = 0; i < opr_sz / 8; ++i) {
-        uint64_t nn = (n[i] >> shift) & 0x00ff00ff00ff00ffull;
-        uint64_t mm = (m[i] >> shift) & 0x00ff00ff00ff00ffull;
-
-        d[i] = pmull_h(nn, mm);
+        d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
     }
 }

-static uint64_t pmull_d(uint64_t op1, uint64_t op2)
-{
-    uint64_t result = 0;
-    int i;
-
-    for (i = 0; i < 32; ++i) {
-        uint64_t mask = -((op1 >> i) & 1);
-        result ^= (op2 << i) & mask;
-    }
-    return result;
-}
-
 void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
 {
     intptr_t sel = H4(simd_data(desc));
@@ -2129,7 +2049,7 @@ void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
     uint64_t *d = vd;

     for (i = 0; i < opr_sz / 8; ++i) {
-        d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
+        d[i] = clmul_32(n[2 * i + sel], m[2 * i + sel]);
     }
 }
 #endif
target/arm/tcg/vec_internal.h
@@ -219,17 +219,6 @@ int16_t do_sqrdmlah_h(int16_t, int16_t, int16_t, bool, bool, uint32_t *);
 int32_t do_sqrdmlah_s(int32_t, int32_t, int32_t, bool, bool, uint32_t *);
 int64_t do_sqrdmlah_d(int64_t, int64_t, int64_t, bool, bool);

-/*
- * 8 x 8 -> 16 vector polynomial multiply where the inputs are
- * in the low 8 bits of each 16-bit element
- */
-uint64_t pmull_h(uint64_t op1, uint64_t op2);
-/*
- * 16 x 16 -> 32 vector polynomial multiply where the inputs are
- * in the low 16 bits of each 32-bit element
- */
-uint64_t pmull_w(uint64_t op1, uint64_t op2);
-
 /**
  * bfdotadd:
  * @sum: addend
target/i386/ops_sse.h
@@ -20,6 +20,7 @@

 #include "crypto/aes.h"
 #include "crypto/aes-round.h"
+#include "crypto/clmul.h"

 #if SHIFT == 0
 #define Reg MMXReg
@@ -2122,41 +2123,18 @@ target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)

 #endif

-#if SHIFT == 1
-static void clmulq(uint64_t *dest_l, uint64_t *dest_h,
-                   uint64_t a, uint64_t b)
-{
-    uint64_t al, ah, resh, resl;
-
-    ah = 0;
-    al = a;
-    resh = resl = 0;
-
-    while (b) {
-        if (b & 1) {
-            resl ^= al;
-            resh ^= ah;
-        }
-        ah = (ah << 1) | (al >> 63);
-        al <<= 1;
-        b >>= 1;
-    }
-
-    *dest_l = resl;
-    *dest_h = resh;
-}
-#endif
-
 void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
                                     uint32_t ctrl)
 {
-    uint64_t a, b;
-    int i;
+    int a_idx = (ctrl & 1) != 0;
+    int b_idx = (ctrl & 16) != 0;

-    for (i = 0; i < 1 << SHIFT; i += 2) {
-        a = v->Q(((ctrl & 1) != 0) + i);
-        b = s->Q(((ctrl & 16) != 0) + i);
-        clmulq(&d->Q(i), &d->Q(i + 1), a, b);
+    for (int i = 0; i < SHIFT; i++) {
+        uint64_t a = v->Q(2 * i + a_idx);
+        uint64_t b = s->Q(2 * i + b_idx);
+        Int128 *r = (Int128 *)&d->ZMM_X(i);
+
+        *r = clmul_64(a, b);
     }
 }

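For reference (illustration only, not from the commit): PCLMULQDQ's immediate selects which 64-bit half of each source feeds the multiply, bit 0 for the first operand and bit 4 for the second, which is exactly what a_idx and b_idx extract in the rewritten helper above.

#include <stdint.h>

static void pclmul_select(uint32_t ctrl, int *a_idx, int *b_idx)
{
    *a_idx = (ctrl & 1) != 0;    /* imm8 bit 0: low or high quadword of v */
    *b_idx = (ctrl & 16) != 0;   /* imm8 bit 4: low or high quadword of s */
}

/*
 * pclmul_select(0x00, ...) -> low  x low
 * pclmul_select(0x01, ...) -> high x low
 * pclmul_select(0x10, ...) -> low  x high
 * pclmul_select(0x11, ...) -> high x high
 */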
target/ppc/int_helper.c
@@ -25,6 +25,7 @@
 #include "exec/helper-proto.h"
 #include "crypto/aes.h"
 #include "crypto/aes-round.h"
+#include "crypto/clmul.h"
 #include "fpu/softfloat.h"
 #include "qapi/error.h"
 #include "qemu/guest-random.h"
@@ -1424,46 +1425,39 @@ void helper_vbpermq(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 #undef VBPERMQ_INDEX
 #undef VBPERMQ_DW

-#define PMSUM(name, srcfld, trgfld, trgtyp) \
-void helper_##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \
-{ \
-    int i, j; \
-    trgtyp prod[sizeof(ppc_avr_t) / sizeof(a->srcfld[0])]; \
- \
-    VECTOR_FOR_INORDER_I(i, srcfld) { \
-        prod[i] = 0; \
-        for (j = 0; j < sizeof(a->srcfld[0]) * 8; j++) { \
-            if (a->srcfld[i] & (1ull << j)) { \
-                prod[i] ^= ((trgtyp)b->srcfld[i] << j); \
-            } \
-        } \
-    } \
- \
-    VECTOR_FOR_INORDER_I(i, trgfld) { \
-        r->trgfld[i] = prod[2 * i] ^ prod[2 * i + 1]; \
-    } \
+/*
+ * There is no carry across the two doublewords, so their order does
+ * not matter. Nor is there partial overlap between registers.
+ */
+void helper_vpmsumb(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+    for (int i = 0; i < 2; ++i) {
+        uint64_t aa = a->u64[i], bb = b->u64[i];
+        r->u64[i] = clmul_8x4_even(aa, bb) ^ clmul_8x4_odd(aa, bb);
+    }
 }

-PMSUM(vpmsumb, u8, u16, uint16_t)
-PMSUM(vpmsumh, u16, u32, uint32_t)
-PMSUM(vpmsumw, u32, u64, uint64_t)
+void helper_vpmsumh(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+    for (int i = 0; i < 2; ++i) {
+        uint64_t aa = a->u64[i], bb = b->u64[i];
+        r->u64[i] = clmul_16x2_even(aa, bb) ^ clmul_16x2_odd(aa, bb);
+    }
+}
+
+void helper_vpmsumw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
+{
+    for (int i = 0; i < 2; ++i) {
+        uint64_t aa = a->u64[i], bb = b->u64[i];
+        r->u64[i] = clmul_32(aa, bb) ^ clmul_32(aa >> 32, bb >> 32);
+    }
+}

 void helper_VPMSUMD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
 {
-    int i, j;
-    Int128 tmp, prod[2] = {int128_zero(), int128_zero()};
-
-    for (j = 0; j < 64; j++) {
-        for (i = 0; i < ARRAY_SIZE(r->u64); i++) {
-            if (a->VsrD(i) & (1ull << j)) {
-                tmp = int128_make64(b->VsrD(i));
-                tmp = int128_lshift(tmp, j);
-                prod[i] = int128_xor(prod[i], tmp);
-            }
-        }
-    }
-
-    r->s128 = int128_xor(prod[0], prod[1]);
+    Int128 e = clmul_64(a->u64[0], b->u64[0]);
+    Int128 o = clmul_64(a->u64[1], b->u64[1]);
+    r->s128 = int128_xor(e, o);
 }

 #if HOST_BIG_ENDIAN
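An illustrative cross-check (not part of the commit) of the vpmsumb rewrite: per 64-bit doubleword, the instruction XORs the 8x8->16 carry-less products of the two bytes in each 16-bit lane, which is what clmul_8x4_even(aa, bb) ^ clmul_8x4_odd(aa, bb) computes above.

#include <stdint.h>

static uint16_t clmul8_ref(uint8_t n, uint8_t m)
{
    uint16_t r = 0;

    for (int i = 0; i < 8; ++i) {
        if (n & (1u << i)) {
            r ^= (uint16_t)m << i;
        }
    }
    return r;
}

/* Reference for one doubleword of vpmsumb: XOR the carry-less products
 * of the low and high byte of each 16-bit lane. */
static uint64_t vpmsumb_dword_ref(uint64_t a, uint64_t b)
{
    uint64_t r = 0;

    for (int lane = 0; lane < 4; ++lane) {
        uint8_t a_lo = a >> (lane * 16), a_hi = a >> (lane * 16 + 8);
        uint8_t b_lo = b >> (lane * 16), b_hi = b >> (lane * 16 + 8);
        uint16_t prod = clmul8_ref(a_lo, b_lo) ^ clmul8_ref(a_hi, b_hi);

        r |= (uint64_t)prod << (lane * 16);
    }
    return r;
}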
target/s390x/tcg/vec_int_helper.c
@@ -14,19 +14,13 @@
 #include "vec.h"
 #include "exec/helper-proto.h"
 #include "tcg/tcg-gvec-desc.h"
+#include "crypto/clmul.h"

 static bool s390_vec_is_zero(const S390Vector *v)
 {
     return !v->doubleword[0] && !v->doubleword[1];
 }

-static void s390_vec_xor(S390Vector *res, const S390Vector *a,
-                         const S390Vector *b)
-{
-    res->doubleword[0] = a->doubleword[0] ^ b->doubleword[0];
-    res->doubleword[1] = a->doubleword[1] ^ b->doubleword[1];
-}
-
 static void s390_vec_and(S390Vector *res, const S390Vector *a,
                          const S390Vector *b)
 {
@@ -164,117 +158,105 @@ DEF_VCTZ(8)
 DEF_VCTZ(16)

-/* like binary multiplication, but XOR instead of addition */
-#define DEF_GALOIS_MULTIPLY(BITS, TBITS) \
-static uint##TBITS##_t galois_multiply##BITS(uint##TBITS##_t a, \
-                                             uint##TBITS##_t b) \
-{ \
-    uint##TBITS##_t res = 0; \
- \
-    while (b) { \
-        if (b & 0x1) { \
-            res = res ^ a; \
-        } \
-        a = a << 1; \
-        b = b >> 1; \
-    } \
-    return res; \
-}
-DEF_GALOIS_MULTIPLY(8, 16)
-DEF_GALOIS_MULTIPLY(16, 32)
-DEF_GALOIS_MULTIPLY(32, 64)
-
-static S390Vector galois_multiply64(uint64_t a, uint64_t b)
+/*
+ * There is no carry across the two doublewords, so their order does
+ * not matter. Nor is there partial overlap between registers.
+ */
+static inline uint64_t do_gfma8(uint64_t n, uint64_t m, uint64_t a)
 {
-    S390Vector res = {};
-    S390Vector va = {
-        .doubleword[1] = a,
-    };
-    S390Vector vb = {
-        .doubleword[1] = b,
-    };
-
-    while (!s390_vec_is_zero(&vb)) {
-        if (vb.doubleword[1] & 0x1) {
-            s390_vec_xor(&res, &res, &va);
-        }
-        s390_vec_shl(&va, &va, 1);
-        s390_vec_shr(&vb, &vb, 1);
-    }
-    return res;
+    return clmul_8x4_even(n, m) ^ clmul_8x4_odd(n, m) ^ a;
 }

-#define DEF_VGFM(BITS, TBITS) \
-void HELPER(gvec_vgfm##BITS)(void *v1, const void *v2, const void *v3, \
-                             uint32_t desc) \
-{ \
-    int i; \
- \
-    for (i = 0; i < (128 / TBITS); i++) { \
-        uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \
-        uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \
-        uint##TBITS##_t d = galois_multiply##BITS(a, b); \
- \
-        a = s390_vec_read_element##BITS(v2, i * 2 + 1); \
-        b = s390_vec_read_element##BITS(v3, i * 2 + 1); \
-        d = d ^ galois_multiply32(a, b); \
-        s390_vec_write_element##TBITS(v1, i, d); \
-    } \
+void HELPER(gvec_vgfm8)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3;
+
+    q1[0] = do_gfma8(q2[0], q3[0], 0);
+    q1[1] = do_gfma8(q2[1], q3[1], 0);
+}
+
+void HELPER(gvec_vgfma8)(void *v1, const void *v2, const void *v3,
+                         const void *v4, uint32_t desc)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
+
+    q1[0] = do_gfma8(q2[0], q3[0], q4[0]);
+    q1[1] = do_gfma8(q2[1], q3[1], q4[1]);
+}
+
+static inline uint64_t do_gfma16(uint64_t n, uint64_t m, uint64_t a)
+{
+    return clmul_16x2_even(n, m) ^ clmul_16x2_odd(n, m) ^ a;
+}
+
+void HELPER(gvec_vgfm16)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3;
+
+    q1[0] = do_gfma16(q2[0], q3[0], 0);
+    q1[1] = do_gfma16(q2[1], q3[1], 0);
+}
+
+void HELPER(gvec_vgfma16)(void *v1, const void *v2, const void *v3,
+                          const void *v4, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
+
+    q1[0] = do_gfma16(q2[0], q3[0], q4[0]);
+    q1[1] = do_gfma16(q2[1], q3[1], q4[1]);
+}
+
+static inline uint64_t do_gfma32(uint64_t n, uint64_t m, uint64_t a)
+{
+    return clmul_32(n, m) ^ clmul_32(n >> 32, m >> 32) ^ a;
+}
+
+void HELPER(gvec_vgfm32)(void *v1, const void *v2, const void *v3, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3;
+
+    q1[0] = do_gfma32(q2[0], q3[0], 0);
+    q1[1] = do_gfma32(q2[1], q3[1], 0);
+}
+
+void HELPER(gvec_vgfma32)(void *v1, const void *v2, const void *v3,
+                          const void *v4, uint32_t d)
+{
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
+
+    q1[0] = do_gfma32(q2[0], q3[0], q4[0]);
+    q1[1] = do_gfma32(q2[1], q3[1], q4[1]);
 }
-DEF_VGFM(8, 16)
-DEF_VGFM(16, 32)
-DEF_VGFM(32, 64)

 void HELPER(gvec_vgfm64)(void *v1, const void *v2, const void *v3,
                          uint32_t desc)
 {
-    S390Vector tmp1, tmp2;
-    uint64_t a, b;
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3;
+    Int128 r;

-    a = s390_vec_read_element64(v2, 0);
-    b = s390_vec_read_element64(v3, 0);
-    tmp1 = galois_multiply64(a, b);
-    a = s390_vec_read_element64(v2, 1);
-    b = s390_vec_read_element64(v3, 1);
-    tmp2 = galois_multiply64(a, b);
-    s390_vec_xor(v1, &tmp1, &tmp2);
+    r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1]));
+    q1[0] = int128_gethi(r);
+    q1[1] = int128_getlo(r);
 }

-#define DEF_VGFMA(BITS, TBITS) \
-void HELPER(gvec_vgfma##BITS)(void *v1, const void *v2, const void *v3, \
-                              const void *v4, uint32_t desc) \
-{ \
-    int i; \
- \
-    for (i = 0; i < (128 / TBITS); i++) { \
-        uint##BITS##_t a = s390_vec_read_element##BITS(v2, i * 2); \
-        uint##BITS##_t b = s390_vec_read_element##BITS(v3, i * 2); \
-        uint##TBITS##_t d = galois_multiply##BITS(a, b); \
- \
-        a = s390_vec_read_element##BITS(v2, i * 2 + 1); \
-        b = s390_vec_read_element##BITS(v3, i * 2 + 1); \
-        d = d ^ galois_multiply32(a, b); \
-        d = d ^ s390_vec_read_element##TBITS(v4, i); \
-        s390_vec_write_element##TBITS(v1, i, d); \
-    } \
-}
-DEF_VGFMA(8, 16)
-DEF_VGFMA(16, 32)
-DEF_VGFMA(32, 64)

 void HELPER(gvec_vgfma64)(void *v1, const void *v2, const void *v3,
                           const void *v4, uint32_t desc)
 {
-    S390Vector tmp1, tmp2;
-    uint64_t a, b;
+    uint64_t *q1 = v1;
+    const uint64_t *q2 = v2, *q3 = v3, *q4 = v4;
+    Int128 r;

-    a = s390_vec_read_element64(v2, 0);
-    b = s390_vec_read_element64(v3, 0);
-    tmp1 = galois_multiply64(a, b);
-    a = s390_vec_read_element64(v2, 1);
-    b = s390_vec_read_element64(v3, 1);
-    tmp2 = galois_multiply64(a, b);
-    s390_vec_xor(&tmp1, &tmp1, &tmp2);
-    s390_vec_xor(v1, &tmp1, v4);
+    r = int128_xor(clmul_64(q2[0], q3[0]), clmul_64(q2[1], q3[1]));
+    q1[0] = q4[0] ^ int128_gethi(r);
+    q1[1] = q4[1] ^ int128_getlo(r);
 }

 #define DEF_VMAL(BITS) \
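A similar illustrative reference (not from the commit) for the word-sized case handled by do_gfma32() above: each doubleword of VGFM/VGFMA at word width is the XOR of the two 32x32->64 carry-less products of its low and high words, with the accumulator folded in for the VGFMA forms.

#include <stdint.h>

static uint64_t clmul32_ref(uint32_t n, uint32_t m)
{
    uint64_t r = 0;

    for (int i = 0; i < 32; ++i) {
        if ((n >> i) & 1) {
            r ^= (uint64_t)m << i;
        }
    }
    return r;
}

/* Mirrors do_gfma32(): products of the low and high words, XORed,
 * plus the accumulator a. */
static uint64_t do_gfma32_ref(uint64_t n, uint64_t m, uint64_t a)
{
    return clmul32_ref(n, m) ^ clmul32_ref(n >> 32, m >> 32) ^ a;
}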
util/cpuinfo-aarch64.c
@@ -56,12 +56,14 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
     unsigned long hwcap = qemu_getauxval(AT_HWCAP);
     info |= (hwcap & HWCAP_ATOMICS ? CPUINFO_LSE : 0);
     info |= (hwcap & HWCAP_USCAT ? CPUINFO_LSE2 : 0);
-    info |= (hwcap & HWCAP_AES ? CPUINFO_AES: 0);
+    info |= (hwcap & HWCAP_AES ? CPUINFO_AES : 0);
+    info |= (hwcap & HWCAP_PMULL ? CPUINFO_PMULL : 0);
 #endif
 #ifdef CONFIG_DARWIN
     info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE") * CPUINFO_LSE;
     info |= sysctl_for_bool("hw.optional.arm.FEAT_LSE2") * CPUINFO_LSE2;
     info |= sysctl_for_bool("hw.optional.arm.FEAT_AES") * CPUINFO_AES;
+    info |= sysctl_for_bool("hw.optional.arm.FEAT_PMULL") * CPUINFO_PMULL;
 #endif

     cpuinfo = info;
util/cpuinfo-i386.c
@@ -39,6 +39,7 @@ unsigned __attribute__((constructor)) cpuinfo_init(void)
         info |= (c & bit_SSE4_1 ? CPUINFO_SSE4 : 0);
         info |= (c & bit_MOVBE ? CPUINFO_MOVBE : 0);
         info |= (c & bit_POPCNT ? CPUINFO_POPCNT : 0);
+        info |= (c & bit_PCLMUL ? CPUINFO_PCLMUL : 0);

         /* Our AES support requires PSHUFB as well. */
         info |= ((c & bit_AES) && (c & bit_SSSE3) ? CPUINFO_AES : 0);