Merge pull request #7706 from SparkiDev/kyber_thumb2_asm

Kyber ASM ARMv7E-M/ARMv7-M: added assembly code
David Garske 2024-10-03 10:56:42 -07:00 committed by GitHub
commit afe5209427
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 7843 additions and 48 deletions

View File

@ -1192,6 +1192,15 @@ endif
if BUILD_WC_KYBER
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_kyber.c
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_kyber_poly.c
if BUILD_ARMASM
if BUILD_ARM_THUMB
if BUILD_ARMASM_INLINE
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-kyber-asm_c.c
else
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-kyber-asm.S
endif !BUILD_ARMASM_INLINE
endif BUILD_ARM_THUMB
endif BUILD_ARMASM
if !BUILD_X86_ASM
if BUILD_INTELASM
src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_kyber_asm.S
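Note: the automake block above builds the Thumb-2 Kyber assembly either as the separately assembled thumb2-kyber-asm.S or, under BUILD_ARMASM_INLINE, as thumb2-kyber-asm_c.c with the same instructions wrapped in GCC extended asm. A minimal sketch of that split, assuming the usual WOLFSSL_ARMASM_INLINE define mirrors the automake conditional and using a hypothetical helper name (the real generated files export the kyber_thumb2_* symbols declared in wc_kyber.h):

#ifdef WOLFSSL_ARMASM_INLINE
/* Inline-asm variant: the routine lives in a C translation unit. */
int thumb2_add_example(int a, int b)   /* hypothetical helper, not wolfSSL's */
{
    int r;
    __asm__ __volatile__ (
        "ADDS %[r], %[a], %[b]\n\t"    /* plain Thumb-2 add, sets flags */
        : [r] "=r" (r)
        : [a] "r" (a), [b] "r" (b)
        : "cc"
    );
    return r;
}
#else
/* Separate-assembly variant: the same symbol comes from the .S file. */
extern int thumb2_add_example(int a, int b);
#endif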

View File

@ -1511,7 +1511,7 @@ fe_cmov_table:
#endif /* WC_NO_CACHE_RESISTANT */
#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */
#endif /* HAVE_ED25519 */
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
.text
.align 4
.globl fe_mul_op
@ -2023,7 +2023,7 @@ fe_mul_op:
POP {pc}
/* Cycle Count = 239 */
.size fe_mul_op,.-fe_mul_op
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
.text
.align 4
.globl fe_mul
@ -2034,7 +2034,7 @@ fe_mul:
POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
/* Cycle Count = 24 */
.size fe_mul,.-fe_mul
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
.text
.align 4
.globl fe_sq_op
@ -2425,7 +2425,7 @@ fe_sq_op:
POP {pc}
/* Cycle Count = 179 */
.size fe_sq_op,.-fe_sq_op
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
.text
.align 4
.globl fe_sq
@ -2437,7 +2437,7 @@ fe_sq:
/* Cycle Count = 24 */
.size fe_sq,.-fe_sq
#ifdef HAVE_CURVE25519
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
.text
.align 4
.globl fe_mul121666
@ -2524,7 +2524,7 @@ fe_mul121666:
POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
/* Cycle Count = 69 */
.size fe_mul121666,.-fe_mul121666
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
#ifndef WC_NO_CACHE_RESISTANT
.text
.align 4
@ -3466,7 +3466,7 @@ L_fe_invert8:
POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
/* Cycle Count = 292 */
.size fe_invert,.-fe_invert
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
.text
.align 4
.globl fe_sq2
@ -3925,7 +3925,7 @@ fe_sq2:
POP {pc}
/* Cycle Count = 213 */
.size fe_sq2,.-fe_sq2
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
.text
.align 4
.globl fe_pow22523
@ -4535,7 +4535,7 @@ ge_sub:
POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
/* Cycle Count = 138 */
.size ge_sub,.-ge_sub
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
.text
.align 4
.globl sc_reduce
@ -5258,9 +5258,9 @@ sc_reduce:
POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
/* Cycle Count = 502 */
.size sc_reduce,.-sc_reduce
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
#ifdef HAVE_ED25519_SIGN
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
.text
.align 4
.globl sc_muladd
@ -6470,7 +6470,7 @@ sc_muladd:
POP {r4, r5, r6, r7, r8, r9, r10, r11, pc}
/* Cycle Count = 752 */
.size sc_muladd,.-sc_muladd
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
#endif /* HAVE_ED25519_SIGN */
#endif /* HAVE_ED25519 */
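Note: the hunks in this file replace the WOLFSSL_SP_NO_UMAAL guard with WOLFSSL_ARM_ARCH_7M, so the alternative fe_mul_op/fe_sq_op/sc_reduce/sc_muladd bodies are selected for ARMv7-M cores, which lack the UMAAL multiply-accumulate that ARMv7E-M provides. For reference, a portable C sketch of what a single UMAAL does (illustrative only; the assembly keeps these values in registers):

#include <stdint.h>

/* UMAAL lo, hi, a, b computes {hi:lo} = a*b + lo + hi. The sum always fits
 * in 64 bits: (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1. Without UMAAL (ARMv7-M),
 * the same step needs a UMULL plus extra add/add-with-carry instructions. */
static void umaal_c(uint32_t* lo, uint32_t* hi, uint32_t a, uint32_t b)
{
    uint64_t t = (uint64_t)a * b + *lo + *hi;
    *lo = (uint32_t)t;
    *hi = (uint32_t)(t >> 32);
}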

View File

@ -1667,7 +1667,7 @@ void fe_cmov_table(fe* r, fe* base, signed char b)
#endif /* WC_NO_CACHE_RESISTANT */
#endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */
#endif /* HAVE_ED25519 */
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
void fe_mul_op(void);
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
void fe_mul_op()
@ -2193,7 +2193,7 @@ void fe_mul_op()
);
}
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
void fe_mul(fe r_p, const fe a_p, const fe b_p)
#else
@ -2214,7 +2214,7 @@ void fe_mul(fe r, const fe a, const fe b)
);
}
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
void fe_sq_op(void);
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
void fe_sq_op()
@ -2619,7 +2619,7 @@ void fe_sq_op()
);
}
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
void fe_sq(fe r_p, const fe a_p)
#else
@ -2640,7 +2640,7 @@ void fe_sq(fe r, const fe a)
}
#ifdef HAVE_CURVE25519
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
void fe_mul121666(fe r_p, fe a_p)
#else
@ -2745,7 +2745,7 @@ void fe_mul121666(fe r, fe a)
);
}
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
#ifndef WC_NO_CACHE_RESISTANT
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
int curve25519(byte* r_p, const byte* n_p, const byte* a_p)
@ -3907,7 +3907,7 @@ void fe_invert(fe r, const fe a)
);
}
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
void fe_sq2(fe r_p, const fe a_p)
#else
@ -4384,7 +4384,7 @@ void fe_sq2(fe r, const fe a)
);
}
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
void fe_pow22523(fe r_p, const fe a_p)
#else
@ -5126,7 +5126,7 @@ void ge_sub(ge_p1p1 * r, const ge_p3 * p, const ge_cached* q)
);
}
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
void sc_reduce(byte* s_p)
#else
@ -5865,9 +5865,9 @@ void sc_reduce(byte* s)
);
}
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
#ifdef HAVE_ED25519_SIGN
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p)
#else
@ -7099,7 +7099,7 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c)
);
}
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
#endif /* HAVE_ED25519_SIGN */
#endif /* HAVE_ED25519 */

File diff suppressed because it is too large.

File diff suppressed because it is too large.

View File

@ -67,17 +67,17 @@ L_poly1305_thumb2_16_loop:
ADCS r7, r7, r10
ADD r1, r1, #0x10
ADC r8, r8, r11
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
STM lr, {r4, r5, r6, r7, r8}
#else
/* h[0]-h[2] in r4-r6 for multiplication. */
STR r7, [lr, #12]
STR r8, [lr, #16]
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
STR r1, [sp, #16]
LDR r1, [sp, #12]
/* Multiply h by r */
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
/* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */
LDR r3, [r1]
EOR r0, r0, r0
@ -218,7 +218,7 @@ L_poly1305_thumb2_16_loop:
UMAAL r11, r12, r3, r5
/* DONE */
LDM sp, {r4, r5, r6}
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
/* r12 will be zero because r is masked. */
/* Load length */
LDR r2, [sp, #20]

View File

@ -93,17 +93,17 @@ void poly1305_blocks_thumb2_16(Poly1305* ctx, const byte* m, word32 len, int notLast)
"ADCS r7, r7, r10\n\t"
"ADD %[m], %[m], #0x10\n\t"
"ADC r8, r8, r11\n\t"
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
"STM lr, {r4, r5, r6, r7, r8}\n\t"
#else
/* h[0]-h[2] in r4-r6 for multiplication. */
"STR r7, [lr, #12]\n\t"
"STR r8, [lr, #16]\n\t"
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
"STR %[m], [sp, #16]\n\t"
"LDR %[m], [sp, #12]\n\t"
/* Multiply h by r */
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
/* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */
"LDR %[notLast], [%[m]]\n\t"
"EOR %[ctx], %[ctx], %[ctx]\n\t"
@ -244,7 +244,7 @@ void poly1305_blocks_thumb2_16(Poly1305* ctx, const byte* m, word32 len, int notLast)
"UMAAL r11, r12, %[notLast], r5\n\t"
/* DONE */
"LDM sp, {r4, r5, r6}\n\t"
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
/* r12 will be zero because r is masked. */
/* Load length */
"LDR %[len], [sp, #20]\n\t"

View File

@ -240,7 +240,7 @@ static void sp_2048_to_bin_64(sp_digit* r, byte* a)
#define sp_2048_norm_64(a)
#ifndef WOLFSSL_SP_SMALL
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
/* Multiply a and b into r. (r = a * b)
*
* r A single precision integer.
@ -736,7 +736,7 @@ SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
);
}
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
/* Add b to a into r. (r = a + b)
*
* r A single precision integer.
@ -1533,7 +1533,7 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
(void)sp_2048_add_32(r + 96, r + 96, a1);
}
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
/* Square a and put result in r. (r = a * a)
*
* r A single precision integer.
@ -1899,7 +1899,7 @@ SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
);
}
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
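Note: as with the multiply above, the WOLFSSL_ARM_ARCH_7M guard now ties the non-UMAAL sp_2048_sqr_8 to ARMv7-M targets. Squaring routines also exploit the symmetry of a*a; a generic column-sum sketch of that shortcut (16-bit limbs and 64-bit accumulators keep the sketch overflow-free; this is not wolfSSL's code):

#include <stdint.h>

/* Each cross term a[i]*a[j] with i != j appears twice in a*a, so it is
 * computed once and doubled; only the diagonal terms occur once. Carry
 * propagation between columns is left out here; the assembly keeps running
 * sums in registers and propagates carries as it goes. */
static void sqr_columns(uint64_t cols[16], const uint16_t a[8])
{
    int i, j;
    for (i = 0; i < 16; i++)
        cols[i] = 0;
    for (i = 0; i < 8; i++) {
        cols[2 * i] += (uint64_t)a[i] * a[i];               /* diagonal, once */
        for (j = i + 1; j < 8; j++)
            cols[i + j] += 2u * ((uint64_t)a[i] * a[j]);    /* cross, doubled */
    }
}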
/* Sub b from a into r. (r = a - b)
*
* r A single precision integer.
@ -31605,7 +31605,7 @@ static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
}
#else
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
/* Multiply a and b into r. (r = a * b)
*
* r A single precision integer.
@ -32101,7 +32101,7 @@ SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
);
}
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Square a and put result in r. (r = a * a)
@ -32222,7 +32222,7 @@ static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
}
#else
#ifdef WOLFSSL_SP_NO_UMAAL
#ifdef WOLFSSL_ARM_ARCH_7M
/* Square a and put result in r. (r = a * a)
*
* r A single precision integer.
@ -32588,7 +32588,7 @@ SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
);
}
#endif /* WOLFSSL_SP_NO_UMAAL */
#endif /* WOLFSSL_ARM_ARCH_7M */
#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)

View File

@ -173,8 +173,16 @@ const sword16 zetas_inv[KYBER_N / 2] = {
3127, 3042, 1907, 1836, 1517, 359, 758, 1441
};
#define KYBER_BARRETT(a) \
"SMULWB r10, r14, " #a "\n\t" \
"SMULWT r11, r14, " #a "\n\t" \
"SMULBT r10, r12, r10\n\t" \
"SMULBT r11, r12, r11\n\t" \
"PKHBT r10, r10, r11, LSL #16\n\t" \
"SSUB16 " #a ", " #a ", r10\n\t"
#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM))
#if !((defined(__thumb__) || defined(__aarch64__)) && defined(WOLFSSL_ARMASM))
/* Number-Theoretic Transform.
*
* @param [in, out] r Polynomial to transform.
@ -939,15 +947,16 @@ static void kyber_basemul(sword16* r, const sword16* a, const sword16* b,
*/
static void kyber_basemul_mont(sword16* r, const sword16* a, const sword16* b)
{
unsigned int i;
const sword16* zeta = zetas + 64;
#ifdef WOLFSSL_KYBER_SMALL
#if defined(WOLFSSL_KYBER_SMALL)
unsigned int i;
for (i = 0; i < KYBER_N; i += 4, zeta++) {
kyber_basemul(r + i + 0, a + i + 0, b + i + 0, zeta[0]);
kyber_basemul(r + i + 2, a + i + 2, b + i + 2, -zeta[0]);
}
#elif defined(WOLFSSL_KYBER_NO_LARGE_CODE)
unsigned int i;
for (i = 0; i < KYBER_N; i += 8, zeta += 2) {
kyber_basemul(r + i + 0, a + i + 0, b + i + 0, zeta[0]);
kyber_basemul(r + i + 2, a + i + 2, b + i + 2, -zeta[0]);
@ -955,6 +964,7 @@ static void kyber_basemul_mont(sword16* r, const sword16* a, const sword16* b)
kyber_basemul(r + i + 6, a + i + 6, b + i + 6, -zeta[1]);
}
#else
unsigned int i;
for (i = 0; i < KYBER_N; i += 16, zeta += 4) {
kyber_basemul(r + i + 0, a + i + 0, b + i + 0, zeta[0]);
kyber_basemul(r + i + 2, a + i + 2, b + i + 2, -zeta[0]);
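Note: each kyber_basemul call in these loops multiplies one degree-one pair (a[i] + a[i+1]X)(b[i] + b[i+1]X) modulo X^2 - zeta. A hedged sketch of that step in the style of the Kyber reference code, where fqmul is Montgomery multiplication mod q = 3329 (illustrative, not the wolfSSL kyber_basemul):

#include <stdint.h>

#define KYBER_Q    3329
#define KYBER_QINV 62209   /* q^-1 mod 2^16 */

/* Montgomery reduction: for |a| < q * 2^15, returns a * 2^-16 mod q. */
static int16_t montgomery_reduce(int32_t a)
{
    int16_t t = (int16_t)((int16_t)a * KYBER_QINV);
    return (int16_t)((a - (int32_t)t * KYBER_Q) >> 16);
}

static int16_t fqmul(int16_t a, int16_t b)
{
    return montgomery_reduce((int32_t)a * b);
}

/* (r0 + r1 X) = (a0 + a1 X)(b0 + b1 X) mod (X^2 - zeta), coefficients mod q. */
static void basemul_sketch(int16_t r[2], const int16_t a[2], const int16_t b[2],
                           int16_t zeta)
{
    r[0] = fqmul(a[1], b[1]);
    r[0] = fqmul(r[0], zeta);
    r[0] = (int16_t)(r[0] + fqmul(a[0], b[0]));
    r[1] = fqmul(a[0], b[1]);
    r[1] = (int16_t)(r[1] + fqmul(a[1], b[0]));
}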
@ -977,10 +987,10 @@ static void kyber_basemul_mont(sword16* r, const sword16* a, const sword16* b)
static void kyber_basemul_mont_add(sword16* r, const sword16* a,
const sword16* b)
{
unsigned int i;
const sword16* zeta = zetas + 64;
#ifdef WOLFSSL_KYBER_SMALL
#if defined(WOLFSSL_KYBER_SMALL)
unsigned int i;
for (i = 0; i < KYBER_N; i += 4, zeta++) {
sword16 t0[2];
sword16 t2[2];
@ -994,6 +1004,7 @@ static void kyber_basemul_mont_add(sword16* r, const sword16* a,
r[i + 3] += t2[1];
}
#elif defined(WOLFSSL_KYBER_NO_LARGE_CODE)
unsigned int i;
for (i = 0; i < KYBER_N; i += 8, zeta += 2) {
sword16 t0[2];
sword16 t2[2];
@ -1015,6 +1026,7 @@ static void kyber_basemul_mont_add(sword16* r, const sword16* a,
r[i + 7] += t6[1];
}
#else
unsigned int i;
for (i = 0; i < KYBER_N; i += 16, zeta += 4) {
sword16 t0[2];
sword16 t2[2];
@ -2142,7 +2154,7 @@ int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen)
}
#endif
#if !(defined(WOLFSSL_ARMASM) && defined(__aarch64__))
#if !(defined(WOLFSSL_ARMASM) && (defined(__aarch64__) || defined(__thumb__)))
/* Rejection sampling on uniform random bytes to generate uniform random
* integers mod q.
*
@ -3338,7 +3350,7 @@ int kyber_cmp(const byte* a, const byte* b, int sz)
/******************************************************************************/
#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM))
#if !((defined(__thumb__) || defined(__aarch64__)) && defined(WOLFSSL_ARMASM))
/* Conditional subtraction of q to each coefficient of a polynomial.
*
@ -3355,10 +3367,14 @@ static KYBER_NOINLINE void kyber_csubq_c(sword16* p)
}
}
#else
#elif defined(__aarch64__)
#define kyber_csubq_c kyber_csubq_neon
#else
#define kyber_csubq_c kyber_thumb2_csubq
#endif
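Note: kyber_csubq_c conditionally subtracts q from every coefficient without branching; the new #elif routes Thumb-2 builds to the kyber_thumb2_csubq assembly instead. The portable idea, in the standard constant-time form used by Kyber reference code (illustrative sketch only):

#include <stdint.h>

#define KYBER_Q 3329
#define KYBER_N 256

static void csubq_sketch(int16_t p[KYBER_N])
{
    unsigned int i;
    for (i = 0; i < KYBER_N; i++) {
        p[i] = (int16_t)(p[i] - KYBER_Q);
        /* the arithmetic shift turns the sign bit into an all-ones mask,
         * adding q back only when the subtraction went negative */
        p[i] = (int16_t)(p[i] + ((p[i] >> 15) & KYBER_Q));
    }
}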
/******************************************************************************/

View File

@ -310,6 +310,22 @@ WOLFSSL_LOCAL int kyber_cmp_neon(const byte* a, const byte* b, int sz);
WOLFSSL_LOCAL void kyber_csubq_neon(sword16* p);
WOLFSSL_LOCAL void kyber_from_msg_neon(sword16* p, const byte* msg);
WOLFSSL_LOCAL void kyber_to_msg_neon(byte* msg, sword16* p);
#elif defined(__thumb__) && defined(WOLFSSL_ARMASM)
#define kyber_ntt kyber_thumb2_ntt
#define kyber_invntt kyber_thumb2_invntt
#define kyber_basemul_mont kyber_thumb2_basemul_mont
#define kyber_basemul_mont_add kyber_thumb2_basemul_mont_add
#define kyber_rej_uniform_c kyber_thumb2_rej_uniform
WOLFSSL_LOCAL void kyber_thumb2_ntt(sword16* r);
WOLFSSL_LOCAL void kyber_thumb2_invntt(sword16* r);
WOLFSSL_LOCAL void kyber_thumb2_basemul_mont(sword16* r, const sword16* a,
const sword16* b);
WOLFSSL_LOCAL void kyber_thumb2_basemul_mont_add(sword16* r, const sword16* a,
const sword16* b);
WOLFSSL_LOCAL void kyber_thumb2_csubq(sword16* p);
WOLFSSL_LOCAL unsigned int kyber_thumb2_rej_uniform(sword16* p,
unsigned int len, const byte* r, unsigned int rLen);
#endif
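Note: with these mappings the generic code in wc_kyber_poly.c keeps calling the same names, and the preprocessor resolves them to the Thumb-2 implementations (or the AArch64/C versions on other builds). A hedged sketch of what such a call site looks like, assuming the surrounding wolfSSL types and headers (illustrative only, not an exact wolfSSL function):

/* Inside wc_kyber_poly.c-style code: the same four calls compile against
 * whichever backend the macros above select. */
static void poly_mul_sketch(sword16* r, sword16* a, sword16* b)
{
    kyber_ntt(a);                  /* forward NTT of both inputs */
    kyber_ntt(b);
    kyber_basemul_mont(r, a, b);   /* pointwise multiply in the NTT domain */
    kyber_invntt(r);               /* inverse NTT of the product */
}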
#ifdef __cplusplus