VIA AES: Batch AES-XTS computation into eight blocks at a time.
Experimental -- performance improvement is not clearly worth the complexity.
This commit is contained in:
parent
937bd5f179
commit
1f8a993cb5
@ -1,4 +1,4 @@
|
||||
/* $NetBSD: aes_via.c,v 1.1 2020/06/29 23:39:30 riastradh Exp $ */
|
||||
/* $NetBSD: aes_via.c,v 1.2 2020/06/29 23:41:35 riastradh Exp $ */
|
||||
|
||||
/*-
|
||||
* Copyright (c) 2020 The NetBSD Foundation, Inc.
|
||||
@ -27,7 +27,7 @@
|
||||
*/
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
__KERNEL_RCSID(1, "$NetBSD: aes_via.c,v 1.1 2020/06/29 23:39:30 riastradh Exp $");
|
||||
__KERNEL_RCSID(1, "$NetBSD: aes_via.c,v 1.2 2020/06/29 23:41:35 riastradh Exp $");
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <sys/evcnt.h>
|
||||
@ -119,8 +119,8 @@ aesvia_setdeckey(struct aesdec *dec, const uint8_t *key, uint32_t nrounds)
|
||||
}
|
||||
|
||||
static inline void
|
||||
aesvia_enc1(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
uint8_t out[static 16], uint32_t cw0)
|
||||
aesvia_encN(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
uint8_t out[static 16], size_t nblocks, uint32_t cw0)
|
||||
{
|
||||
const uint32_t cw[4] __aligned(16) = {
|
||||
[0] = (cw0
|
||||
@ -128,7 +128,6 @@ aesvia_enc1(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
| C3_CRYPT_CWLO_ENCRYPT
|
||||
| C3_CRYPT_CWLO_NORMAL),
|
||||
};
|
||||
size_t nblocks = 1;
|
||||
|
||||
KASSERT(((uintptr_t)enc & 0xf) == 0);
|
||||
KASSERT(((uintptr_t)in & 0xf) == 0);
|
||||
@ -141,8 +140,8 @@ aesvia_enc1(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
}
|
||||
|
||||
static inline void
|
||||
aesvia_dec1(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
uint8_t out[static 16], uint32_t cw0)
|
||||
aesvia_decN(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
uint8_t out[static 16], size_t nblocks, uint32_t cw0)
|
||||
{
|
||||
const uint32_t cw[4] __aligned(16) = {
|
||||
[0] = (cw0
|
||||
@ -150,7 +149,6 @@ aesvia_dec1(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
| C3_CRYPT_CWLO_DECRYPT
|
||||
| C3_CRYPT_CWLO_NORMAL),
|
||||
};
|
||||
size_t nblocks = 1;
|
||||
|
||||
KASSERT(((uintptr_t)dec & 0xf) == 0);
|
||||
KASSERT(((uintptr_t)in & 0xf) == 0);
|
||||
@ -180,7 +178,7 @@ aesvia_enc(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
|
||||
((uintptr_t)in & 0xff0) != 0xff0) {
|
||||
enc_aligned_evcnt.ev_count++;
|
||||
aesvia_enc1(enc, in, out, cw0);
|
||||
aesvia_encN(enc, in, out, 1, cw0);
|
||||
} else {
|
||||
enc_unaligned_evcnt.ev_count++;
|
||||
/*
|
||||
@ -194,7 +192,7 @@ aesvia_enc(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
uint8_t outbuf[16] __aligned(16);
|
||||
|
||||
memcpy(inbuf, in, 16);
|
||||
aesvia_enc1(enc, inbuf, outbuf, cw0);
|
||||
aesvia_encN(enc, inbuf, outbuf, 1, cw0);
|
||||
memcpy(out, outbuf, 16);
|
||||
|
||||
explicit_memset(inbuf, 0, sizeof inbuf);
|
||||
@ -221,7 +219,7 @@ aesvia_dec(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0 &&
|
||||
((uintptr_t)in & 0xff0) != 0xff0) {
|
||||
dec_aligned_evcnt.ev_count++;
|
||||
aesvia_dec1(dec, in, out, cw0);
|
||||
aesvia_decN(dec, in, out, 1, cw0);
|
||||
} else {
|
||||
dec_unaligned_evcnt.ev_count++;
|
||||
/*
|
||||
@ -235,7 +233,7 @@ aesvia_dec(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
uint8_t outbuf[16] __aligned(16);
|
||||
|
||||
memcpy(inbuf, in, 16);
|
||||
aesvia_dec1(dec, inbuf, outbuf, cw0);
|
||||
aesvia_decN(dec, inbuf, outbuf, 1, cw0);
|
||||
memcpy(out, outbuf, 16);
|
||||
|
||||
explicit_memset(inbuf, 0, sizeof inbuf);
|
||||
@ -245,7 +243,7 @@ aesvia_dec(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
}
|
||||
|
||||
static inline void
|
||||
aesvia_cbc_enc1(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
aesvia_cbc_encN(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
uint8_t out[static 16], size_t nblocks, uint8_t **ivp, uint32_t cw0)
|
||||
{
|
||||
const uint32_t cw[4] __aligned(16) = {
|
||||
@ -274,7 +272,7 @@ aesvia_cbc_enc1(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
}
|
||||
|
||||
static inline void
|
||||
aesvia_cbc_dec1(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
aesvia_cbc_decN(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
uint8_t out[static 16], size_t nblocks, uint8_t iv[static 16],
|
||||
uint32_t cw0)
|
||||
{
|
||||
@ -340,7 +338,7 @@ aesvia_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
|
||||
cbcenc_aligned_evcnt.ev_count++;
|
||||
uint8_t *ivp = iv;
|
||||
aesvia_cbc_enc1(enc, in, out, nbytes/16, &ivp, cw0);
|
||||
aesvia_cbc_encN(enc, in, out, nbytes/16, &ivp, cw0);
|
||||
memcpy(iv, ivp, 16);
|
||||
} else {
|
||||
cbcenc_unaligned_evcnt.ev_count++;
|
||||
@ -351,7 +349,7 @@ aesvia_cbc_enc(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
for (; nbytes; nbytes -= 16, in += 16, out += 16) {
|
||||
memcpy(tmp, in, 16);
|
||||
xor128(tmp, tmp, cv);
|
||||
aesvia_enc1(enc, tmp, cv, cw0);
|
||||
aesvia_encN(enc, tmp, cv, 1, cw0);
|
||||
memcpy(out, cv, 16);
|
||||
}
|
||||
memcpy(iv, cv, 16);
|
||||
@ -381,7 +379,7 @@ aesvia_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
aesvia_reload_keys();
|
||||
if ((((uintptr_t)in | (uintptr_t)out | (uintptr_t)iv) & 0xf) == 0) {
|
||||
cbcdec_aligned_evcnt.ev_count++;
|
||||
aesvia_cbc_dec1(dec, in, out, nbytes/16, iv, cw0);
|
||||
aesvia_cbc_decN(dec, in, out, nbytes/16, iv, cw0);
|
||||
} else {
|
||||
cbcdec_unaligned_evcnt.ev_count++;
|
||||
uint8_t iv0[16] __aligned(16);
|
||||
@ -393,7 +391,7 @@ aesvia_cbc_dec(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
memcpy(iv, cv, 16);
|
||||
|
||||
for (;;) {
|
||||
aesvia_dec1(dec, cv, tmp, cw0);
|
||||
aesvia_decN(dec, cv, tmp, 1, cw0);
|
||||
if ((nbytes -= 16) == 0)
|
||||
break;
|
||||
memcpy(cv, in + nbytes - 16, 16);
|
||||
@ -480,6 +478,7 @@ aesvia_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
|
||||
xtsenc_aligned_evcnt.ev_count++;
|
||||
unsigned lastblock = 0;
|
||||
uint32_t buf[8*4] __aligned(16);
|
||||
|
||||
/*
|
||||
* Make sure the last block is not the last block of a
|
||||
@ -491,20 +490,43 @@ aesvia_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
|
||||
nbytes -= lastblock;
|
||||
|
||||
for (; nbytes; nbytes -= 16, in += 16, out += 16) {
|
||||
xor128(out, in, t);
|
||||
aesvia_enc1(enc, out, out, cw0);
|
||||
xor128(out, out, t);
|
||||
aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
|
||||
/*
|
||||
* Handle an odd number of initial blocks so we can
|
||||
* process the rest in eight-block (128-byte) chunks.
|
||||
*/
|
||||
if (nbytes % 128) {
|
||||
unsigned nbytes128 = nbytes % 128;
|
||||
|
||||
nbytes -= nbytes128;
|
||||
for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
|
||||
{
|
||||
xor128(out, in, t);
|
||||
aesvia_encN(enc, out, out, 1, cw0);
|
||||
xor128(out, out, t);
|
||||
aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
|
||||
}
|
||||
}
|
||||
|
||||
/* Process eight blocks at a time. */
|
||||
for (; nbytes; nbytes -= 128, in += 128, out += 128) {
|
||||
unsigned i;
|
||||
for (i = 0; i < 8; i++) {
|
||||
memcpy(buf + 4*i, t, 16);
|
||||
xor128(out + 4*i, in + 4*i, t);
|
||||
aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
|
||||
}
|
||||
aesvia_encN(enc, out, out, 8, cw0);
|
||||
for (i = 0; i < 8; i++)
|
||||
xor128(out + 4*i, in + 4*i, buf + 4*i);
|
||||
}
|
||||
|
||||
/* Handle the last block of a page, if necessary. */
|
||||
if (lastblock) {
|
||||
uint8_t buf[16] __aligned(16);
|
||||
xor128(buf, in, t);
|
||||
aesvia_enc1(enc, buf, out, cw0);
|
||||
explicit_memset(buf, 0, sizeof buf);
|
||||
aesvia_encN(enc, (const void *)buf, out, 1, cw0);
|
||||
}
|
||||
|
||||
explicit_memset(buf, 0, sizeof buf);
|
||||
} else {
|
||||
xtsenc_unaligned_evcnt.ev_count++;
|
||||
uint8_t buf[16] __aligned(16);
|
||||
@ -512,7 +534,7 @@ aesvia_xts_enc(const struct aesenc *enc, const uint8_t in[static 16],
|
||||
for (; nbytes; nbytes -= 16, in += 16, out += 16) {
|
||||
memcpy(buf, in, 16);
|
||||
xor128(buf, buf, t);
|
||||
aesvia_enc1(enc, buf, buf, cw0);
|
||||
aesvia_encN(enc, buf, buf, 1, cw0);
|
||||
xor128(buf, buf, t);
|
||||
memcpy(out, buf, 16);
|
||||
aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
|
||||
@ -550,6 +572,7 @@ aesvia_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
if ((((uintptr_t)in | (uintptr_t)out) & 0xf) == 0) {
|
||||
xtsdec_aligned_evcnt.ev_count++;
|
||||
unsigned lastblock = 0;
|
||||
uint32_t buf[8*4] __aligned(16);
|
||||
|
||||
/*
|
||||
* Make sure the last block is not the last block of a
|
||||
@ -561,20 +584,43 @@ aesvia_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
lastblock = 16*(((uintptr_t)(out + nbytes) & 0xfff) == 0);
|
||||
nbytes -= lastblock;
|
||||
|
||||
for (; nbytes; nbytes -= 16, in += 16, out += 16) {
|
||||
xor128(out, in, t);
|
||||
aesvia_dec1(dec, out, out, cw0);
|
||||
xor128(out, out, t);
|
||||
aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
|
||||
/*
|
||||
* Handle an odd number of initial blocks so we can
|
||||
* process the rest in eight-block (128-byte) chunks.
|
||||
*/
|
||||
if (nbytes % 128) {
|
||||
unsigned nbytes128 = nbytes % 128;
|
||||
|
||||
nbytes -= nbytes128;
|
||||
for (; nbytes128; nbytes128 -= 16, in += 16, out += 16)
|
||||
{
|
||||
xor128(out, in, t);
|
||||
aesvia_decN(dec, out, out, 1, cw0);
|
||||
xor128(out, out, t);
|
||||
aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
|
||||
}
|
||||
}
|
||||
|
||||
/* Process eight blocks at a time. */
|
||||
for (; nbytes; nbytes -= 128, in += 128, out += 128) {
|
||||
unsigned i;
|
||||
for (i = 0; i < 8; i++) {
|
||||
memcpy(buf + 4*i, t, 16);
|
||||
xor128(out + 4*i, in + 4*i, t);
|
||||
aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
|
||||
}
|
||||
aesvia_decN(dec, out, out, 8, cw0);
|
||||
for (i = 0; i < 8; i++)
|
||||
xor128(out + 4*i, in + 4*i, buf + 4*i);
|
||||
}
|
||||
|
||||
/* Handle the last block of a page, if necessary. */
|
||||
if (lastblock) {
|
||||
uint8_t buf[16] __aligned(16);
|
||||
xor128(buf, in, t);
|
||||
aesvia_dec1(dec, buf, out, cw0);
|
||||
explicit_memset(buf, 0, sizeof buf);
|
||||
aesvia_decN(dec, (const void *)buf, out, 1, cw0);
|
||||
}
|
||||
|
||||
explicit_memset(buf, 0, sizeof buf);
|
||||
} else {
|
||||
xtsdec_unaligned_evcnt.ev_count++;
|
||||
uint8_t buf[16] __aligned(16);
|
||||
@ -582,7 +628,7 @@ aesvia_xts_dec(const struct aesdec *dec, const uint8_t in[static 16],
|
||||
for (; nbytes; nbytes -= 16, in += 16, out += 16) {
|
||||
memcpy(buf, in, 16);
|
||||
xor128(buf, buf, t);
|
||||
aesvia_dec1(dec, buf, buf, cw0);
|
||||
aesvia_decN(dec, buf, buf, 1, cw0);
|
||||
xor128(buf, buf, t);
|
||||
memcpy(out, buf, 16);
|
||||
aesvia_xts_update(&t[0], &t[1], &t[2], &t[3]);
|
||||
|
Loading…
Reference in New Issue
Block a user