From d2047986d9dcb7702510b1dc18ca9e6b4efed398 Mon Sep 17 00:00:00 2001
From: Sean Parkinson
Date: Wed, 3 Jul 2024 17:30:34 +1000
Subject: [PATCH] Kyber ASM ARMv7E-M/ARMv7-M: added assembly code

Improved performance by rewriting kyber_ntt, kyber_invtt,
kyber_basemul_mont, kyber_basemul_mont_add and kyber_rej_uniform_c in
assembly.
Replaced WOLFSSL_SP_NO_UMAAL with WOLFSSL_ARM_ARCH_7M.
---
 src/include.am                               |    9 +
 wolfcrypt/src/port/arm/thumb2-curve25519.S   |   24 +-
 wolfcrypt/src/port/arm/thumb2-curve25519_c.c |   24 +-
 wolfcrypt/src/port/arm/thumb2-kyber-asm.S    | 3903 +++++++++++++++++
 wolfcrypt/src/port/arm/thumb2-kyber-asm_c.c  | 3851 ++++++++++++++++
 wolfcrypt/src/port/arm/thumb2-poly1305-asm.S |    8 +-
 .../src/port/arm/thumb2-poly1305-asm_c.c     |    8 +-
 wolfcrypt/src/sp_cortexm.c                   |   16 +-
 wolfcrypt/src/wc_kyber_poly.c                |   32 +-
 wolfssl/wolfcrypt/wc_kyber.h                 |   16 +
 10 files changed, 7843 insertions(+), 48 deletions(-)
 create mode 100644 wolfcrypt/src/port/arm/thumb2-kyber-asm.S
 create mode 100644 wolfcrypt/src/port/arm/thumb2-kyber-asm_c.c

diff --git a/src/include.am b/src/include.am
index 8e1f16f89..ee33cd956 100644
--- a/src/include.am
+++ b/src/include.am
@@ -1192,6 +1192,15 @@ endif
 if BUILD_WC_KYBER
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_kyber.c
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_kyber_poly.c
+if BUILD_ARMASM
+if BUILD_ARM_THUMB
+if BUILD_ARMASM_INLINE
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-kyber-asm_c.c
+else
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/port/arm/thumb2-kyber-asm.S
+endif !BUILD_ARMASM_INLINE
+endif BUILD_ARM_THUMB
+endif BUILD_ARMASM
 if !BUILD_X86_ASM
 if BUILD_INTELASM
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_kyber_asm.S
diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519.S b/wolfcrypt/src/port/arm/thumb2-curve25519.S
index 42da2f45f..239203e48 100644
--- a/wolfcrypt/src/port/arm/thumb2-curve25519.S
+++ b/wolfcrypt/src/port/arm/thumb2-curve25519.S
@@ -1511,7 +1511,7 @@ fe_cmov_table:
 #endif /* WC_NO_CACHE_RESISTANT */
 #endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */
 #endif /* HAVE_ED25519 */
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
 	.text
 	.align	4
 	.globl	fe_mul_op
@@ -2023,7 +2023,7 @@ fe_mul_op:
 	POP	{pc}
 	/* Cycle Count = 239 */
 	.size	fe_mul_op,.-fe_mul_op
-#endif /* WOLFSSL_SP_NO_UMAAL */
+#endif /* WOLFSSL_ARM_ARCH_7M */
 	.text
 	.align	4
 	.globl	fe_mul
@@ -2034,7 +2034,7 @@ fe_mul:
 	POP	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	/* Cycle Count = 24 */
 	.size	fe_mul,.-fe_mul
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
 	.text
 	.align	4
 	.globl	fe_sq_op
@@ -2425,7 +2425,7 @@ fe_sq_op:
 	POP	{pc}
 	/* Cycle Count = 179 */
 	.size	fe_sq_op,.-fe_sq_op
-#endif /* WOLFSSL_SP_NO_UMAAL */
+#endif /* WOLFSSL_ARM_ARCH_7M */
 	.text
 	.align	4
 	.globl	fe_sq
@@ -2437,7 +2437,7 @@ fe_sq:
 	/* Cycle Count = 24 */
 	.size	fe_sq,.-fe_sq
 #ifdef HAVE_CURVE25519
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
 	.text
 	.align	4
 	.globl	fe_mul121666
@@ -2524,7 +2524,7 @@ fe_mul121666:
 	POP	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	/* Cycle Count = 69 */
 	.size	fe_mul121666,.-fe_mul121666
-#endif /* WOLFSSL_SP_NO_UMAAL */
+#endif /* WOLFSSL_ARM_ARCH_7M */
 #ifndef WC_NO_CACHE_RESISTANT
 	.text
 	.align	4
@@ -3466,7 +3466,7 @@ L_fe_invert8:
 	POP	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
 	/* Cycle Count = 292 */
 	.size	fe_invert,.-fe_invert
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
 	.text
 	.align	4
 	.globl	fe_sq2
@@ -3925,7 +3925,7 @@ fe_sq2:
 	POP	{pc}
 	/* Cycle Count = 213 */
 	.size	fe_sq2,.-fe_sq2
-#endif
/* WOLFSSL_SP_NO_UMAAL */ +#endif /* WOLFSSL_ARM_ARCH_7M */ .text .align 4 .globl fe_pow22523 @@ -4535,7 +4535,7 @@ ge_sub: POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} /* Cycle Count = 138 */ .size ge_sub,.-ge_sub -#ifdef WOLFSSL_SP_NO_UMAAL +#ifdef WOLFSSL_ARM_ARCH_7M .text .align 4 .globl sc_reduce @@ -5258,9 +5258,9 @@ sc_reduce: POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} /* Cycle Count = 502 */ .size sc_reduce,.-sc_reduce -#endif /* WOLFSSL_SP_NO_UMAAL */ +#endif /* WOLFSSL_ARM_ARCH_7M */ #ifdef HAVE_ED25519_SIGN -#ifdef WOLFSSL_SP_NO_UMAAL +#ifdef WOLFSSL_ARM_ARCH_7M .text .align 4 .globl sc_muladd @@ -6470,7 +6470,7 @@ sc_muladd: POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} /* Cycle Count = 752 */ .size sc_muladd,.-sc_muladd -#endif /* WOLFSSL_SP_NO_UMAAL */ +#endif /* WOLFSSL_ARM_ARCH_7M */ #endif /* HAVE_ED25519_SIGN */ #endif /* HAVE_ED25519 */ diff --git a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c index 21ad67bac..d7ca98a6c 100644 --- a/wolfcrypt/src/port/arm/thumb2-curve25519_c.c +++ b/wolfcrypt/src/port/arm/thumb2-curve25519_c.c @@ -1667,7 +1667,7 @@ void fe_cmov_table(fe* r, fe* base, signed char b) #endif /* WC_NO_CACHE_RESISTANT */ #endif /* HAVE_ED25519_MAKE_KEY || HAVE_ED25519_SIGN */ #endif /* HAVE_ED25519 */ -#ifdef WOLFSSL_SP_NO_UMAAL +#ifdef WOLFSSL_ARM_ARCH_7M void fe_mul_op(void); #ifndef WOLFSSL_NO_VAR_ASSIGN_REG void fe_mul_op() @@ -2193,7 +2193,7 @@ void fe_mul_op() ); } -#endif /* WOLFSSL_SP_NO_UMAAL */ +#endif /* WOLFSSL_ARM_ARCH_7M */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG void fe_mul(fe r_p, const fe a_p, const fe b_p) #else @@ -2214,7 +2214,7 @@ void fe_mul(fe r, const fe a, const fe b) ); } -#ifdef WOLFSSL_SP_NO_UMAAL +#ifdef WOLFSSL_ARM_ARCH_7M void fe_sq_op(void); #ifndef WOLFSSL_NO_VAR_ASSIGN_REG void fe_sq_op() @@ -2619,7 +2619,7 @@ void fe_sq_op() ); } -#endif /* WOLFSSL_SP_NO_UMAAL */ +#endif /* WOLFSSL_ARM_ARCH_7M */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG void fe_sq(fe r_p, const fe a_p) #else @@ -2640,7 +2640,7 @@ void fe_sq(fe r, const fe a) } #ifdef HAVE_CURVE25519 -#ifdef WOLFSSL_SP_NO_UMAAL +#ifdef WOLFSSL_ARM_ARCH_7M #ifndef WOLFSSL_NO_VAR_ASSIGN_REG void fe_mul121666(fe r_p, fe a_p) #else @@ -2745,7 +2745,7 @@ void fe_mul121666(fe r, fe a) ); } -#endif /* WOLFSSL_SP_NO_UMAAL */ +#endif /* WOLFSSL_ARM_ARCH_7M */ #ifndef WC_NO_CACHE_RESISTANT #ifndef WOLFSSL_NO_VAR_ASSIGN_REG int curve25519(byte* r_p, const byte* n_p, const byte* a_p) @@ -3907,7 +3907,7 @@ void fe_invert(fe r, const fe a) ); } -#ifdef WOLFSSL_SP_NO_UMAAL +#ifdef WOLFSSL_ARM_ARCH_7M #ifndef WOLFSSL_NO_VAR_ASSIGN_REG void fe_sq2(fe r_p, const fe a_p) #else @@ -4384,7 +4384,7 @@ void fe_sq2(fe r, const fe a) ); } -#endif /* WOLFSSL_SP_NO_UMAAL */ +#endif /* WOLFSSL_ARM_ARCH_7M */ #ifndef WOLFSSL_NO_VAR_ASSIGN_REG void fe_pow22523(fe r_p, const fe a_p) #else @@ -5126,7 +5126,7 @@ void ge_sub(ge_p1p1 * r, const ge_p3 * p, const ge_cached* q) ); } -#ifdef WOLFSSL_SP_NO_UMAAL +#ifdef WOLFSSL_ARM_ARCH_7M #ifndef WOLFSSL_NO_VAR_ASSIGN_REG void sc_reduce(byte* s_p) #else @@ -5865,9 +5865,9 @@ void sc_reduce(byte* s) ); } -#endif /* WOLFSSL_SP_NO_UMAAL */ +#endif /* WOLFSSL_ARM_ARCH_7M */ #ifdef HAVE_ED25519_SIGN -#ifdef WOLFSSL_SP_NO_UMAAL +#ifdef WOLFSSL_ARM_ARCH_7M #ifndef WOLFSSL_NO_VAR_ASSIGN_REG void sc_muladd(byte* s_p, const byte* a_p, const byte* b_p, const byte* c_p) #else @@ -7099,7 +7099,7 @@ void sc_muladd(byte* s, const byte* a, const byte* b, const byte* c) ); } -#endif /* WOLFSSL_SP_NO_UMAAL */ +#endif /* WOLFSSL_ARM_ARCH_7M */ #endif /* 
HAVE_ED25519_SIGN */ #endif /* HAVE_ED25519 */ diff --git a/wolfcrypt/src/port/arm/thumb2-kyber-asm.S b/wolfcrypt/src/port/arm/thumb2-kyber-asm.S new file mode 100644 index 000000000..93e0a53e9 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-kyber-asm.S @@ -0,0 +1,3903 @@ +/* thumb2-kyber-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./kyber/kyber.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-kyber-asm.S + */ + +#ifdef HAVE_CONFIG_H + #include +#endif /* HAVE_CONFIG_H */ +#include + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#ifndef WOLFSSL_ARMASM_INLINE + .thumb + .syntax unified +#ifdef WOLFSSL_WC_KYBER + .text + .type L_kyber_thumb2_ntt_zetas, %object + .size L_kyber_thumb2_ntt_zetas, 256 + .align 4 +L_kyber_thumb2_ntt_zetas: + .short 0x8ed + .short 0xa0b + .short 0xb9a + .short 0x714 + .short 0x5d5 + .short 0x58e + .short 0x11f + .short 0xca + .short 0xc56 + .short 0x26e + .short 0x629 + .short 0xb6 + .short 0x3c2 + .short 0x84f + .short 0x73f + .short 0x5bc + .short 0x23d + .short 0x7d4 + .short 0x108 + .short 0x17f + .short 0x9c4 + .short 0x5b2 + .short 0x6bf + .short 0xc7f + .short 0xa58 + .short 0x3f9 + .short 0x2dc + .short 0x260 + .short 0x6fb + .short 0x19b + .short 0xc34 + .short 0x6de + .short 0x4c7 + .short 0x28c + .short 0xad9 + .short 0x3f7 + .short 0x7f4 + .short 0x5d3 + .short 0xbe7 + .short 0x6f9 + .short 0x204 + .short 0xcf9 + .short 0xbc1 + .short 0xa67 + .short 0x6af + .short 0x877 + .short 0x7e + .short 0x5bd + .short 0x9ac + .short 0xca7 + .short 0xbf2 + .short 0x33e + .short 0x6b + .short 0x774 + .short 0xc0a + .short 0x94a + .short 0xb73 + .short 0x3c1 + .short 0x71d + .short 0xa2c + .short 0x1c0 + .short 0x8d8 + .short 0x2a5 + .short 0x806 + .short 0x8b2 + .short 0x1ae + .short 0x22b + .short 0x34b + .short 0x81e + .short 0x367 + .short 0x60e + .short 0x69 + .short 0x1a6 + .short 0x24b + .short 0xb1 + .short 0xc16 + .short 0xbde + .short 0xb35 + .short 0x626 + .short 0x675 + .short 0xc0b + .short 0x30a + .short 0x487 + .short 0xc6e + .short 0x9f8 + .short 0x5cb + .short 0xaa7 + .short 0x45f + .short 0x6cb + .short 0x284 + .short 0x999 + .short 0x15d + .short 0x1a2 + .short 0x149 + .short 0xc65 + .short 0xcb6 + .short 0x331 + .short 0x449 + .short 0x25b + .short 0x262 + .short 0x52a + .short 0x7fc + .short 0x748 + .short 0x180 + .short 0x842 + .short 0xc79 + .short 0x4c2 + .short 0x7ca + .short 0x997 + .short 0xdc + .short 0x85e + .short 0x686 + .short 0x860 + .short 0x707 + .short 0x803 + .short 0x31a + .short 0x71b + .short 0x9ab + .short 0x99b + .short 0x1de + .short 0xc95 + .short 0xbcd + .short 0x3e4 + .short 0x3df + .short 0x3be + .short 0x74d + .short 0x5f2 + .short 0x65c + .text + .align 4 + .globl 
kyber_thumb2_ntt + .type kyber_thumb2_ntt, %function +kyber_thumb2_ntt: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x8 + ADR r1, L_kyber_thumb2_ntt_zetas +#ifndef WOLFSSL_ARM_ARCH_7M + MOV r12, #0xd01 + MOVT r12, #0xcff +#endif /* !WOLFSSL_ARM_ARCH_7M */ + MOV r2, #0x10 +L_kyber_thumb2_ntt_loop_123: + STR r2, [sp] + LDRH lr, [r1, #2] + LDR r2, [r0] + LDR r3, [r0, #64] + LDR r4, [r0, #128] + LDR r5, [r0, #192] + LDR r6, [r0, #256] + LDR r7, [r0, #320] + LDR r8, [r0, #384] + LDR r9, [r0, #448] +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r6 + SMULBT r6, lr, r6 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r6 + SMLABB r11, r12, r11, r6 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r6, r2, r10 + SADD16 r2, r2, r10 +#else + SBFX r10, r6, #0, #16 + SBFX r11, lr, #0, #16 + ASR r6, r6, #16 + MUL r10, r11, r10 + MUL r6, r11, r6 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r6, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r6 + SUB r6, r2, r11 + ADD r2, r2, r11 + SUB r11, r2, r10, LSR #16 + ADD r10, r2, r10, LSR #16 + BFI r6, r11, #0, #16 + BFI r2, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r7 + SMULBT r7, lr, r7 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r7 + SMLABB r11, r12, r11, r7 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r7, r3, r10 + SADD16 r3, r3, r10 +#else + SBFX r10, r7, #0, #16 + SBFX r11, lr, #0, #16 + ASR r7, r7, #16 + MUL r10, r11, r10 + MUL r7, r11, r7 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r7, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r7 + SUB r7, r3, r11 + ADD r3, r3, r11 + SUB r11, r3, r10, LSR #16 + ADD r10, r3, r10, LSR #16 + BFI r7, r11, #0, #16 + BFI r3, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r8 + SMULBT r8, lr, r8 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r8 + SMLABB r11, r12, r11, r8 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r8, r4, r10 + SADD16 r4, r4, r10 +#else + SBFX r10, r8, #0, #16 + SBFX r11, lr, #0, #16 + ASR r8, r8, #16 + MUL r10, r11, r10 + MUL r8, r11, r8 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r8, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r8 + SUB r8, r4, r11 + ADD r4, r4, r11 + SUB r11, r4, r10, LSR #16 + ADD r10, r4, r10, LSR #16 + BFI r8, r11, #0, #16 + BFI r4, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r9 + SMULBT r9, lr, r9 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r11, r12, r11, r9 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r9, r5, r10 + SADD16 r5, r5, r10 +#else + SBFX r10, r9, #0, #16 + SBFX r11, lr, #0, #16 + ASR r9, r9, #16 + MUL r10, r11, r10 + MUL r9, r11, r9 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r9 + SUB r9, r5, r11 + ADD r5, r5, r11 + SUB r11, r5, r10, LSR #16 + ADD r10, r5, r10, LSR #16 + BFI r9, r11, #0, #16 + BFI r5, r10, #0, #16 +#endif /* 
!WOLFSSL_ARM_ARCH_7M */ + LDR lr, [r1, #4] +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r4 + SMULBT r4, lr, r4 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r4 + SMLABB r11, r12, r11, r4 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r4, r2, r10 + SADD16 r2, r2, r10 +#else + SBFX r10, r4, #0, #16 + SBFX r11, lr, #0, #16 + ASR r4, r4, #16 + MUL r10, r11, r10 + MUL r4, r11, r4 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r4, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r4 + SUB r4, r2, r11 + ADD r2, r2, r11 + SUB r11, r2, r10, LSR #16 + ADD r10, r2, r10, LSR #16 + BFI r4, r11, #0, #16 + BFI r2, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r5 + SMULBT r5, lr, r5 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r5 + SMLABB r11, r12, r11, r5 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r5, r3, r10 + SADD16 r3, r3, r10 +#else + SBFX r10, r5, #0, #16 + SBFX r11, lr, #0, #16 + ASR r5, r5, #16 + MUL r10, r11, r10 + MUL r5, r11, r5 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r5, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r5 + SUB r5, r3, r11 + ADD r3, r3, r11 + SUB r11, r3, r10, LSR #16 + ADD r10, r3, r10, LSR #16 + BFI r5, r11, #0, #16 + BFI r3, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTB r10, lr, r8 + SMULTT r8, lr, r8 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r8 + SMLABB r11, r12, r11, r8 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r8, r6, r10 + SADD16 r6, r6, r10 +#else + SBFX r10, r8, #0, #16 + SBFX r11, lr, #16, #16 + ASR r8, r8, #16 + MUL r10, r11, r10 + MUL r8, r11, r8 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r8, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r8 + SUB r8, r6, r11 + ADD r6, r6, r11 + SUB r11, r6, r10, LSR #16 + ADD r10, r6, r10, LSR #16 + BFI r8, r11, #0, #16 + BFI r6, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTB r10, lr, r9 + SMULTT r9, lr, r9 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r11, r12, r11, r9 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r9, r7, r10 + SADD16 r7, r7, r10 +#else + SBFX r10, r9, #0, #16 + SBFX r11, lr, #16, #16 + ASR r9, r9, #16 + MUL r10, r11, r10 + MUL r9, r11, r9 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r9 + SUB r9, r7, r11 + ADD r7, r7, r11 + SUB r11, r7, r10, LSR #16 + ADD r10, r7, r10, LSR #16 + BFI r9, r11, #0, #16 + BFI r7, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [r1, #8] +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r3 + SMULBT r3, lr, r3 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r3 + SMLABB r11, r12, r11, r3 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r3, r2, r10 + SADD16 r2, r2, r10 +#else + SBFX r10, r3, #0, #16 + SBFX r11, lr, #0, #16 + ASR r3, r3, #16 + MUL r10, r11, r10 + MUL r3, r11, r3 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA 
r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r3, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r3 + SUB r3, r2, r11 + ADD r2, r2, r11 + SUB r11, r2, r10, LSR #16 + ADD r10, r2, r10, LSR #16 + BFI r3, r11, #0, #16 + BFI r2, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTB r10, lr, r5 + SMULTT r5, lr, r5 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r5 + SMLABB r11, r12, r11, r5 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r5, r4, r10 + SADD16 r4, r4, r10 +#else + SBFX r10, r5, #0, #16 + SBFX r11, lr, #16, #16 + ASR r5, r5, #16 + MUL r10, r11, r10 + MUL r5, r11, r5 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r5, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r5 + SUB r5, r4, r11 + ADD r4, r4, r11 + SUB r11, r4, r10, LSR #16 + ADD r10, r4, r10, LSR #16 + BFI r5, r11, #0, #16 + BFI r4, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [r1, #12] +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r7 + SMULBT r7, lr, r7 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r7 + SMLABB r11, r12, r11, r7 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r7, r6, r10 + SADD16 r6, r6, r10 +#else + SBFX r10, r7, #0, #16 + SBFX r11, lr, #0, #16 + ASR r7, r7, #16 + MUL r10, r11, r10 + MUL r7, r11, r7 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r7, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r7 + SUB r7, r6, r11 + ADD r6, r6, r11 + SUB r11, r6, r10, LSR #16 + ADD r10, r6, r10, LSR #16 + BFI r7, r11, #0, #16 + BFI r6, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTB r10, lr, r9 + SMULTT r9, lr, r9 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r11, r12, r11, r9 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r9, r8, r10 + SADD16 r8, r8, r10 +#else + SBFX r10, r9, #0, #16 + SBFX r11, lr, #16, #16 + ASR r9, r9, #16 + MUL r10, r11, r10 + MUL r9, r11, r9 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r9 + SUB r9, r8, r11 + ADD r8, r8, r11 + SUB r11, r8, r10, LSR #16 + ADD r10, r8, r10, LSR #16 + BFI r9, r11, #0, #16 + BFI r8, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + STR r2, [r0] + STR r3, [r0, #64] + STR r4, [r0, #128] + STR r5, [r0, #192] + STR r6, [r0, #256] + STR r7, [r0, #320] + STR r8, [r0, #384] + STR r9, [r0, #448] + LDR r2, [sp] + SUBS r2, r2, #0x1 + ADD r0, r0, #0x4 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_ntt_loop_123 +#else + BNE.N L_kyber_thumb2_ntt_loop_123 +#endif + SUB r0, r0, #0x40 + MOV r3, #0x0 +L_kyber_thumb2_ntt_loop_4_j: + STR r3, [sp, #4] + ADD lr, r1, r3, LSR #4 + MOV r2, #0x4 + LDR lr, [lr, #16] +L_kyber_thumb2_ntt_loop_4_i: + STR r2, [sp] + LDR r2, [r0] + LDR r3, [r0, #16] + LDR r4, [r0, #32] + LDR r5, [r0, #48] + LDR r6, [r0, #64] + LDR r7, [r0, #80] + LDR r8, [r0, #96] + LDR r9, [r0, #112] +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r4 + SMULBT r4, lr, r4 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r4 + SMLABB r11, r12, r11, r4 + PKHTB r10, r11, r10, ASR #16 + 
SSUB16 r4, r2, r10 + SADD16 r2, r2, r10 +#else + SBFX r10, r4, #0, #16 + SBFX r11, lr, #0, #16 + ASR r4, r4, #16 + MUL r10, r11, r10 + MUL r4, r11, r4 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r4, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r4 + SUB r4, r2, r11 + ADD r2, r2, r11 + SUB r11, r2, r10, LSR #16 + ADD r10, r2, r10, LSR #16 + BFI r4, r11, #0, #16 + BFI r2, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r5 + SMULBT r5, lr, r5 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r5 + SMLABB r11, r12, r11, r5 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r5, r3, r10 + SADD16 r3, r3, r10 +#else + SBFX r10, r5, #0, #16 + SBFX r11, lr, #0, #16 + ASR r5, r5, #16 + MUL r10, r11, r10 + MUL r5, r11, r5 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r5, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r5 + SUB r5, r3, r11 + ADD r3, r3, r11 + SUB r11, r3, r10, LSR #16 + ADD r10, r3, r10, LSR #16 + BFI r5, r11, #0, #16 + BFI r3, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTB r10, lr, r8 + SMULTT r8, lr, r8 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r8 + SMLABB r11, r12, r11, r8 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r8, r6, r10 + SADD16 r6, r6, r10 +#else + SBFX r10, r8, #0, #16 + SBFX r11, lr, #16, #16 + ASR r8, r8, #16 + MUL r10, r11, r10 + MUL r8, r11, r8 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r8, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r8 + SUB r8, r6, r11 + ADD r6, r6, r11 + SUB r11, r6, r10, LSR #16 + ADD r10, r6, r10, LSR #16 + BFI r8, r11, #0, #16 + BFI r6, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTB r10, lr, r9 + SMULTT r9, lr, r9 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r11, r12, r11, r9 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r9, r7, r10 + SADD16 r7, r7, r10 +#else + SBFX r10, r9, #0, #16 + SBFX r11, lr, #16, #16 + ASR r9, r9, #16 + MUL r10, r11, r10 + MUL r9, r11, r9 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r9 + SUB r9, r7, r11 + ADD r7, r7, r11 + SUB r11, r7, r10, LSR #16 + ADD r10, r7, r10, LSR #16 + BFI r9, r11, #0, #16 + BFI r7, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + STR r2, [r0] + STR r3, [r0, #16] + STR r4, [r0, #32] + STR r5, [r0, #48] + STR r6, [r0, #64] + STR r7, [r0, #80] + STR r8, [r0, #96] + STR r9, [r0, #112] + LDRD r2, r3, [sp] + SUBS r2, r2, #0x1 + ADD r0, r0, #0x4 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_ntt_loop_4_i +#else + BNE.N L_kyber_thumb2_ntt_loop_4_i +#endif + ADD r3, r3, #0x40 + RSBS r10, r3, #0x100 + ADD r0, r0, #0x70 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_ntt_loop_4_j +#else + BNE.N L_kyber_thumb2_ntt_loop_4_j +#endif + SUB r0, r0, #0x200 + MOV r3, #0x0 +L_kyber_thumb2_ntt_loop_567: + ADD lr, r1, r3, LSR #3 + STR r3, 
[sp, #4] + LDRH lr, [lr, #32] + LDR r2, [r0] + LDR r3, [r0, #4] + LDR r4, [r0, #8] + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r8, [r0, #24] + LDR r9, [r0, #28] +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r6 + SMULBT r6, lr, r6 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r6 + SMLABB r11, r12, r11, r6 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r6, r2, r10 + SADD16 r2, r2, r10 +#else + SBFX r10, r6, #0, #16 + SBFX r11, lr, #0, #16 + ASR r6, r6, #16 + MUL r10, r11, r10 + MUL r6, r11, r6 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r6, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r6 + SUB r6, r2, r11 + ADD r2, r2, r11 + SUB r11, r2, r10, LSR #16 + ADD r10, r2, r10, LSR #16 + BFI r6, r11, #0, #16 + BFI r2, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r7 + SMULBT r7, lr, r7 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r7 + SMLABB r11, r12, r11, r7 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r7, r3, r10 + SADD16 r3, r3, r10 +#else + SBFX r10, r7, #0, #16 + SBFX r11, lr, #0, #16 + ASR r7, r7, #16 + MUL r10, r11, r10 + MUL r7, r11, r7 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r7, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r7 + SUB r7, r3, r11 + ADD r3, r3, r11 + SUB r11, r3, r10, LSR #16 + ADD r10, r3, r10, LSR #16 + BFI r7, r11, #0, #16 + BFI r3, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r8 + SMULBT r8, lr, r8 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r8 + SMLABB r11, r12, r11, r8 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r8, r4, r10 + SADD16 r4, r4, r10 +#else + SBFX r10, r8, #0, #16 + SBFX r11, lr, #0, #16 + ASR r8, r8, #16 + MUL r10, r11, r10 + MUL r8, r11, r8 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r8, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r8 + SUB r8, r4, r11 + ADD r4, r4, r11 + SUB r11, r4, r10, LSR #16 + ADD r10, r4, r10, LSR #16 + BFI r8, r11, #0, #16 + BFI r4, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r9 + SMULBT r9, lr, r9 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r11, r12, r11, r9 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r9, r5, r10 + SADD16 r5, r5, r10 +#else + SBFX r10, r9, #0, #16 + SBFX r11, lr, #0, #16 + ASR r9, r9, #16 + MUL r10, r11, r10 + MUL r9, r11, r9 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r9 + SUB r9, r5, r11 + ADD r5, r5, r11 + SUB r11, r5, r10, LSR #16 + ADD r10, r5, r10, LSR #16 + BFI r9, r11, #0, #16 + BFI r5, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [sp, #4] + ADD lr, r1, lr, LSR #2 + LDR lr, [lr, #64] +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r4 + SMULBT r4, lr, r4 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r4 + SMLABB r11, r12, r11, r4 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r4, r2, r10 + SADD16 r2, r2, r10 +#else + SBFX 
r10, r4, #0, #16 + SBFX r11, lr, #0, #16 + ASR r4, r4, #16 + MUL r10, r11, r10 + MUL r4, r11, r4 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r4, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r4 + SUB r4, r2, r11 + ADD r2, r2, r11 + SUB r11, r2, r10, LSR #16 + ADD r10, r2, r10, LSR #16 + BFI r4, r11, #0, #16 + BFI r2, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r5 + SMULBT r5, lr, r5 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r5 + SMLABB r11, r12, r11, r5 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r5, r3, r10 + SADD16 r3, r3, r10 +#else + SBFX r10, r5, #0, #16 + SBFX r11, lr, #0, #16 + ASR r5, r5, #16 + MUL r10, r11, r10 + MUL r5, r11, r5 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r5, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r5 + SUB r5, r3, r11 + ADD r3, r3, r11 + SUB r11, r3, r10, LSR #16 + ADD r10, r3, r10, LSR #16 + BFI r5, r11, #0, #16 + BFI r3, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTB r10, lr, r8 + SMULTT r8, lr, r8 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r8 + SMLABB r11, r12, r11, r8 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r8, r6, r10 + SADD16 r6, r6, r10 +#else + SBFX r10, r8, #0, #16 + SBFX r11, lr, #16, #16 + ASR r8, r8, #16 + MUL r10, r11, r10 + MUL r8, r11, r8 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r8, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r8 + SUB r8, r6, r11 + ADD r6, r6, r11 + SUB r11, r6, r10, LSR #16 + ADD r10, r6, r10, LSR #16 + BFI r8, r11, #0, #16 + BFI r6, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTB r10, lr, r9 + SMULTT r9, lr, r9 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r11, r12, r11, r9 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r9, r7, r10 + SADD16 r7, r7, r10 +#else + SBFX r10, r9, #0, #16 + SBFX r11, lr, #16, #16 + ASR r9, r9, #16 + MUL r10, r11, r10 + MUL r9, r11, r9 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r9 + SUB r9, r7, r11 + ADD r7, r7, r11 + SUB r11, r7, r10, LSR #16 + ADD r10, r7, r10, LSR #16 + BFI r9, r11, #0, #16 + BFI r7, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [sp, #4] + ADD lr, r1, lr, LSR #1 + LDR lr, [lr, #128] +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r3 + SMULBT r3, lr, r3 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r3 + SMLABB r11, r12, r11, r3 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r3, r2, r10 + SADD16 r2, r2, r10 +#else + SBFX r10, r3, #0, #16 + SBFX r11, lr, #0, #16 + ASR r3, r3, #16 + MUL r10, r11, r10 + MUL r3, r11, r3 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r3, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r3 + SUB r3, r2, r11 + ADD r2, r2, r11 + SUB r11, r2, r10, LSR #16 + ADD r10, r2, r10, LSR #16 + BFI r3, r11, 
#0, #16 + BFI r2, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTB r10, lr, r5 + SMULTT r5, lr, r5 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r5 + SMLABB r11, r12, r11, r5 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r5, r4, r10 + SADD16 r4, r4, r10 +#else + SBFX r10, r5, #0, #16 + SBFX r11, lr, #16, #16 + ASR r5, r5, #16 + MUL r10, r11, r10 + MUL r5, r11, r5 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r5, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r5 + SUB r5, r4, r11 + ADD r4, r4, r11 + SUB r11, r4, r10, LSR #16 + ADD r10, r4, r10, LSR #16 + BFI r5, r11, #0, #16 + BFI r4, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [sp, #4] + ADD lr, r1, lr, LSR #1 + LDR lr, [lr, #132] +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r7 + SMULBT r7, lr, r7 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r7 + SMLABB r11, r12, r11, r7 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r7, r6, r10 + SADD16 r6, r6, r10 +#else + SBFX r10, r7, #0, #16 + SBFX r11, lr, #0, #16 + ASR r7, r7, #16 + MUL r10, r11, r10 + MUL r7, r11, r7 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r7, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r7 + SUB r7, r6, r11 + ADD r6, r6, r11 + SUB r11, r6, r10, LSR #16 + ADD r10, r6, r10, LSR #16 + BFI r7, r11, #0, #16 + BFI r6, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTB r10, lr, r9 + SMULTT r9, lr, r9 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r11, r12, r11, r9 + PKHTB r10, r11, r10, ASR #16 + SSUB16 r9, r8, r10 + SADD16 r8, r8, r10 +#else + SBFX r10, r9, #0, #16 + SBFX r11, lr, #16, #16 + ASR r9, r9, #16 + MUL r10, r11, r10 + MUL r9, r11, r9 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r11, r12, r11, r9 + SUB r9, r8, r11 + ADD r8, r8, r11 + SUB r11, r8, r10, LSR #16 + ADD r10, r8, r10, LSR #16 + BFI r9, r11, #0, #16 + BFI r8, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + MOV lr, #0xafc0 + MOVT lr, #0x13 +#else + MOV lr, #0x4ebf + MOV r12, #0xd01 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r2 + SMULWT r11, lr, r2 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r2, r2, r10 +#else + SBFX r10, r2, #0, #16 + SBFX r11, r2, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r2, r11, LSL #16 + SUB r2, r2, r10 + LSR r11, r11, #16 + BFI r2, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r3 + SMULWT r11, lr, r3 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r3, r3, r10 +#else + SBFX r10, r3, #0, #16 + SBFX r11, r3, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r3, r11, LSL #16 + SUB r3, r3, r10 + LSR r11, r11, #16 + BFI r3, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r4 + 
SMULWT r11, lr, r4 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r4, r4, r10 +#else + SBFX r10, r4, #0, #16 + SBFX r11, r4, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r4, r11, LSL #16 + SUB r4, r4, r10 + LSR r11, r11, #16 + BFI r4, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r5 + SMULWT r11, lr, r5 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r5, r5, r10 +#else + SBFX r10, r5, #0, #16 + SBFX r11, r5, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r5, r11, LSL #16 + SUB r5, r5, r10 + LSR r11, r11, #16 + BFI r5, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r6 + SMULWT r11, lr, r6 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r6, r6, r10 +#else + SBFX r10, r6, #0, #16 + SBFX r11, r6, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r6, r11, LSL #16 + SUB r6, r6, r10 + LSR r11, r11, #16 + BFI r6, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r7 + SMULWT r11, lr, r7 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r7, r7, r10 +#else + SBFX r10, r7, #0, #16 + SBFX r11, r7, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r7, r11, LSL #16 + SUB r7, r7, r10 + LSR r11, r11, #16 + BFI r7, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r8 + SMULWT r11, lr, r8 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r8, r8, r10 +#else + SBFX r10, r8, #0, #16 + SBFX r11, r8, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r8, r11, LSL #16 + SUB r8, r8, r10 + LSR r11, r11, #16 + BFI r8, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r9 + SMULWT r11, lr, r9 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r9, r9, r10 +#else + SBFX r10, r9, #0, #16 + SBFX r11, r9, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r9, r11, LSL #16 + SUB r9, r9, r10 + LSR r11, r11, #16 + BFI r9, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + MOV r12, #0xd01 + MOVT r12, #0xcff +#endif /* !WOLFSSL_ARM_ARCH_7M */ + STR r2, [r0] + STR r3, [r0, #4] + STR r4, [r0, #8] + STR r5, [r0, #12] + STR r6, [r0, #16] + STR r7, [r0, #20] + STR r8, [r0, #24] + STR r9, [r0, #28] + LDR r3, [sp, #4] + ADD r3, r3, #0x10 + RSBS r10, r3, #0x100 + ADD r0, r0, #0x20 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_ntt_loop_567 +#else + BNE.N L_kyber_thumb2_ntt_loop_567 +#endif + ADD sp, sp, #0x8 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 1270 */ + .size kyber_thumb2_ntt,.-kyber_thumb2_ntt + .text + .type L_kyber_thumb2_invntt_zetas_inv, %object + .size L_kyber_thumb2_invntt_zetas_inv, 256 + .align 4 +L_kyber_thumb2_invntt_zetas_inv: + .short 0x6a5 + 
.short 0x70f + .short 0x5b4 + .short 0x943 + .short 0x922 + .short 0x91d + .short 0x134 + .short 0x6c + .short 0xb23 + .short 0x366 + .short 0x356 + .short 0x5e6 + .short 0x9e7 + .short 0x4fe + .short 0x5fa + .short 0x4a1 + .short 0x67b + .short 0x4a3 + .short 0xc25 + .short 0x36a + .short 0x537 + .short 0x83f + .short 0x88 + .short 0x4bf + .short 0xb81 + .short 0x5b9 + .short 0x505 + .short 0x7d7 + .short 0xa9f + .short 0xaa6 + .short 0x8b8 + .short 0x9d0 + .short 0x4b + .short 0x9c + .short 0xbb8 + .short 0xb5f + .short 0xba4 + .short 0x368 + .short 0xa7d + .short 0x636 + .short 0x8a2 + .short 0x25a + .short 0x736 + .short 0x309 + .short 0x93 + .short 0x87a + .short 0x9f7 + .short 0xf6 + .short 0x68c + .short 0x6db + .short 0x1cc + .short 0x123 + .short 0xeb + .short 0xc50 + .short 0xab6 + .short 0xb5b + .short 0xc98 + .short 0x6f3 + .short 0x99a + .short 0x4e3 + .short 0x9b6 + .short 0xad6 + .short 0xb53 + .short 0x44f + .short 0x4fb + .short 0xa5c + .short 0x429 + .short 0xb41 + .short 0x2d5 + .short 0x5e4 + .short 0x940 + .short 0x18e + .short 0x3b7 + .short 0xf7 + .short 0x58d + .short 0xc96 + .short 0x9c3 + .short 0x10f + .short 0x5a + .short 0x355 + .short 0x744 + .short 0xc83 + .short 0x48a + .short 0x652 + .short 0x29a + .short 0x140 + .short 0x8 + .short 0xafd + .short 0x608 + .short 0x11a + .short 0x72e + .short 0x50d + .short 0x90a + .short 0x228 + .short 0xa75 + .short 0x83a + .short 0x623 + .short 0xcd + .short 0xb66 + .short 0x606 + .short 0xaa1 + .short 0xa25 + .short 0x908 + .short 0x2a9 + .short 0x82 + .short 0x642 + .short 0x74f + .short 0x33d + .short 0xb82 + .short 0xbf9 + .short 0x52d + .short 0xac4 + .short 0x745 + .short 0x5c2 + .short 0x4b2 + .short 0x93f + .short 0xc4b + .short 0x6d8 + .short 0xa93 + .short 0xab + .short 0xc37 + .short 0xbe2 + .short 0x773 + .short 0x72c + .short 0x5ed + .short 0x167 + .short 0x2f6 + .short 0x5a1 + .text + .align 4 + .globl kyber_thumb2_invntt + .type kyber_thumb2_invntt, %function +kyber_thumb2_invntt: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + SUB sp, sp, #0x8 + ADR r1, L_kyber_thumb2_invntt_zetas_inv +#ifndef WOLFSSL_ARM_ARCH_7M + MOV r12, #0xd01 + MOVT r12, #0xcff +#endif /* !WOLFSSL_ARM_ARCH_7M */ + MOV r3, #0x0 +L_kyber_thumb2_invntt_loop_765: + ADD lr, r1, r3, LSR #1 + STR r3, [sp, #4] + LDR r2, [r0] + LDR r3, [r0, #4] + LDR r4, [r0, #8] + LDR r5, [r0, #12] + LDR r6, [r0, #16] + LDR r7, [r0, #20] + LDR r8, [r0, #24] + LDR r9, [r0, #28] + LDR lr, [lr] +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r2, r3 + SADD16 r2, r2, r3 + SMULBT r3, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r3 + SMLABB r3, r12, r11, r3 + PKHTB r3, r3, r10, ASR #16 +#else + SUB r11, r2, r3 + ADD r12, r2, r3 + BFC r3, #0, #16 + BFC r2, #0, #16 + SUB r10, r2, r3 + ADD r2, r2, r3 + BFI r10, r11, #0, #16 + BFI r2, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r3, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r3, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r3, r12, r11, r3 + BFI r3, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r4, r5 + SADD16 r4, r4, r5 + SMULTT r5, lr, r10 + SMULTB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r5 + SMLABB r5, r12, r11, r5 + PKHTB r5, r5, r10, ASR #16 
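+	/* Annotation (descriptive comment, not generated output): the DSP
+	 * sequence above is a Gentleman-Sande butterfly on two packed 16-bit
+	 * coefficients: t = a - b, a' = a + b, b' = zeta * t reduced with
+	 * signed Montgomery reduction.  r12 holds q = 0xd01 (3329) in its low
+	 * half and -q^(-1) mod 2^16 = 0xcff in its high half: SMULTB
+	 * multiplies the low 16 bits of zeta*t by -q^(-1), SMLABB adds q times
+	 * the low 16 bits of that product so the low half of zeta*t cancels,
+	 * and PKHTB packs the Montgomery-reduced high halves.  The ARMv7-M
+	 * path below computes the same values using only MUL/MLA/SBFX/BFI. */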
+#else + SUB r11, r4, r5 + ADD r12, r4, r5 + BFC r5, #0, #16 + BFC r4, #0, #16 + SUB r10, r4, r5 + ADD r4, r4, r5 + BFI r10, r11, #0, #16 + BFI r4, r12, #0, #16 + SBFX r11, lr, #16, #16 + ASR r12, r10, #16 + MUL r5, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r5, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r5, r12, r11, r5 + BFI r5, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [sp, #4] + ADD lr, r1, lr, LSR #1 + LDR lr, [lr, #4] +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r6, r7 + SADD16 r6, r6, r7 + SMULBT r7, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r7 + SMLABB r7, r12, r11, r7 + PKHTB r7, r7, r10, ASR #16 +#else + SUB r11, r6, r7 + ADD r12, r6, r7 + BFC r7, #0, #16 + BFC r6, #0, #16 + SUB r10, r6, r7 + ADD r6, r6, r7 + BFI r10, r11, #0, #16 + BFI r6, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r7, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r7, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r7, r12, r11, r7 + BFI r7, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r8, r9 + SADD16 r8, r8, r9 + SMULTT r9, lr, r10 + SMULTB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r9, r12, r11, r9 + PKHTB r9, r9, r10, ASR #16 +#else + SUB r11, r8, r9 + ADD r12, r8, r9 + BFC r9, #0, #16 + BFC r8, #0, #16 + SUB r10, r8, r9 + ADD r8, r8, r9 + BFI r10, r11, #0, #16 + BFI r8, r12, #0, #16 + SBFX r11, lr, #16, #16 + ASR r12, r10, #16 + MUL r9, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r9, r12, r11, r9 + BFI r9, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [sp, #4] + ADD lr, r1, lr, LSR #2 + LDR lr, [lr, #128] +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r2, r4 + SADD16 r2, r2, r4 + SMULBT r4, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r4 + SMLABB r4, r12, r11, r4 + PKHTB r4, r4, r10, ASR #16 +#else + SUB r11, r2, r4 + ADD r12, r2, r4 + BFC r4, #0, #16 + BFC r2, #0, #16 + SUB r10, r2, r4 + ADD r2, r2, r4 + BFI r10, r11, #0, #16 + BFI r2, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r4, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r4, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r4, r12, r11, r4 + BFI r4, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r3, r5 + SADD16 r3, r3, r5 + SMULBT r5, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r5 + SMLABB r5, r12, r11, r5 + PKHTB r5, r5, r10, ASR #16 +#else + SUB r11, r3, r5 + ADD r12, 
r3, r5 + BFC r5, #0, #16 + BFC r3, #0, #16 + SUB r10, r3, r5 + ADD r3, r3, r5 + BFI r10, r11, #0, #16 + BFI r3, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r5, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r5, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r5, r12, r11, r5 + BFI r5, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r6, r8 + SADD16 r6, r6, r8 + SMULTT r8, lr, r10 + SMULTB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r8 + SMLABB r8, r12, r11, r8 + PKHTB r8, r8, r10, ASR #16 +#else + SUB r11, r6, r8 + ADD r12, r6, r8 + BFC r8, #0, #16 + BFC r6, #0, #16 + SUB r10, r6, r8 + ADD r6, r6, r8 + BFI r10, r11, #0, #16 + BFI r6, r12, #0, #16 + SBFX r11, lr, #16, #16 + ASR r12, r10, #16 + MUL r8, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r8, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r8, r12, r11, r8 + BFI r8, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r7, r9 + SADD16 r7, r7, r9 + SMULTT r9, lr, r10 + SMULTB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r9, r12, r11, r9 + PKHTB r9, r9, r10, ASR #16 +#else + SUB r11, r7, r9 + ADD r12, r7, r9 + BFC r9, #0, #16 + BFC r7, #0, #16 + SUB r10, r7, r9 + ADD r7, r7, r9 + BFI r10, r11, #0, #16 + BFI r7, r12, #0, #16 + SBFX r11, lr, #16, #16 + ASR r12, r10, #16 + MUL r9, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r9, r12, r11, r9 + BFI r9, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [sp, #4] + ADD lr, r1, lr, LSR #3 + LDR lr, [lr, #192] +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r2, r6 + SADD16 r2, r2, r6 + SMULBT r6, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r6 + SMLABB r6, r12, r11, r6 + PKHTB r6, r6, r10, ASR #16 +#else + SUB r11, r2, r6 + ADD r12, r2, r6 + BFC r6, #0, #16 + BFC r2, #0, #16 + SUB r10, r2, r6 + ADD r2, r2, r6 + BFI r10, r11, #0, #16 + BFI r2, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r6, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r6, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r6, r12, r11, r6 + BFI r6, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r3, r7 + SADD16 r3, r3, r7 + SMULBT r7, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r7 + SMLABB r7, r12, r11, r7 + PKHTB r7, r7, r10, ASR #16 +#else + SUB r11, r3, r7 + ADD r12, r3, r7 + BFC r7, #0, #16 + BFC r3, #0, #16 + SUB r10, r3, r7 + ADD r3, r3, r7 + BFI r10, r11, #0, 
#16 + BFI r3, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r7, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r7, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r7, r12, r11, r7 + BFI r7, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r4, r8 + SADD16 r4, r4, r8 + SMULBT r8, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r8 + SMLABB r8, r12, r11, r8 + PKHTB r8, r8, r10, ASR #16 +#else + SUB r11, r4, r8 + ADD r12, r4, r8 + BFC r8, #0, #16 + BFC r4, #0, #16 + SUB r10, r4, r8 + ADD r4, r4, r8 + BFI r10, r11, #0, #16 + BFI r4, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r8, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r8, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r8, r12, r11, r8 + BFI r8, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r5, r9 + SADD16 r5, r5, r9 + SMULBT r9, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r9, r12, r11, r9 + PKHTB r9, r9, r10, ASR #16 +#else + SUB r11, r5, r9 + ADD r12, r5, r9 + BFC r9, #0, #16 + BFC r5, #0, #16 + SUB r10, r5, r9 + ADD r5, r5, r9 + BFI r10, r11, #0, #16 + BFI r5, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r9, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r9, r12, r11, r9 + BFI r9, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + MOV lr, #0xafc0 + MOVT lr, #0x13 +#else + MOV lr, #0x4ebf +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r2 + SMULWT r11, lr, r2 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r2, r2, r10 +#else + SBFX r10, r2, #0, #16 + SBFX r11, r2, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r2, r11, LSL #16 + SUB r2, r2, r10 + LSR r11, r11, #16 + BFI r2, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r3 + SMULWT r11, lr, r3 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r3, r3, r10 +#else + SBFX r10, r3, #0, #16 + SBFX r11, r3, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r3, r11, LSL #16 + SUB r3, r3, r10 + LSR r11, r11, #16 + BFI r3, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r4 + SMULWT r11, lr, r4 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r4, r4, r10 +#else + SBFX r10, r4, #0, #16 + SBFX r11, r4, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL 
r11, r12, r11 + SUB r11, r4, r11, LSL #16 + SUB r4, r4, r10 + LSR r11, r11, #16 + BFI r4, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r5 + SMULWT r11, lr, r5 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r5, r5, r10 +#else + SBFX r10, r5, #0, #16 + SBFX r11, r5, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r5, r11, LSL #16 + SUB r5, r5, r10 + LSR r11, r11, #16 + BFI r5, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + STR r2, [r0] + STR r3, [r0, #4] + STR r4, [r0, #8] + STR r5, [r0, #12] + STR r6, [r0, #16] + STR r7, [r0, #20] + STR r8, [r0, #24] + STR r9, [r0, #28] + LDR r3, [sp, #4] + ADD r3, r3, #0x10 + RSBS r10, r3, #0x100 + ADD r0, r0, #0x20 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_invntt_loop_765 +#else + BNE.N L_kyber_thumb2_invntt_loop_765 +#endif + SUB r0, r0, #0x200 + MOV r3, #0x0 +L_kyber_thumb2_invntt_loop_4_j: + STR r3, [sp, #4] + ADD lr, r1, r3, LSR #4 + MOV r2, #0x4 + LDR lr, [lr, #224] +L_kyber_thumb2_invntt_loop_4_i: + STR r2, [sp] + LDR r2, [r0] + LDR r3, [r0, #16] + LDR r4, [r0, #32] + LDR r5, [r0, #48] + LDR r6, [r0, #64] + LDR r7, [r0, #80] + LDR r8, [r0, #96] + LDR r9, [r0, #112] +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r2, r4 + SADD16 r2, r2, r4 + SMULBT r4, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r4 + SMLABB r4, r12, r11, r4 + PKHTB r4, r4, r10, ASR #16 +#else + SUB r11, r2, r4 + ADD r12, r2, r4 + BFC r4, #0, #16 + BFC r2, #0, #16 + SUB r10, r2, r4 + ADD r2, r2, r4 + BFI r10, r11, #0, #16 + BFI r2, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r4, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r4, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r4, r12, r11, r4 + BFI r4, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r3, r5 + SADD16 r3, r3, r5 + SMULBT r5, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r5 + SMLABB r5, r12, r11, r5 + PKHTB r5, r5, r10, ASR #16 +#else + SUB r11, r3, r5 + ADD r12, r3, r5 + BFC r5, #0, #16 + BFC r3, #0, #16 + SUB r10, r3, r5 + ADD r3, r3, r5 + BFI r10, r11, #0, #16 + BFI r3, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r5, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r5, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r5, r12, r11, r5 + BFI r5, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r6, r8 + SADD16 r6, r6, r8 + SMULTT r8, lr, r10 + SMULTB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r8 + SMLABB r8, r12, r11, r8 + PKHTB r8, r8, r10, ASR #16 +#else + SUB r11, r6, r8 + ADD r12, r6, r8 + BFC r8, #0, #16 + BFC r6, #0, #16 + SUB r10, r6, r8 + ADD r6, r6, r8 + BFI r10, r11, #0, #16 + BFI r6, r12, #0, #16 + SBFX r11, lr, #16, #16 + ASR r12, r10, #16 + MUL r8, r11, r12 + SBFX r10, r10, #0, #16 + MUL 
r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r8, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r8, r12, r11, r8 + BFI r8, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r7, r9 + SADD16 r7, r7, r9 + SMULTT r9, lr, r10 + SMULTB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r9, r12, r11, r9 + PKHTB r9, r9, r10, ASR #16 +#else + SUB r11, r7, r9 + ADD r12, r7, r9 + BFC r9, #0, #16 + BFC r7, #0, #16 + SUB r10, r7, r9 + ADD r7, r7, r9 + BFI r10, r11, #0, #16 + BFI r7, r12, #0, #16 + SBFX r11, lr, #16, #16 + ASR r12, r10, #16 + MUL r9, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r9, r12, r11, r9 + BFI r9, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + STR r2, [r0] + STR r3, [r0, #16] + STR r4, [r0, #32] + STR r5, [r0, #48] + STR r6, [r0, #64] + STR r7, [r0, #80] + STR r8, [r0, #96] + STR r9, [r0, #112] + LDRD r2, r3, [sp] + SUBS r2, r2, #0x1 + ADD r0, r0, #0x4 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_invntt_loop_4_i +#else + BNE.N L_kyber_thumb2_invntt_loop_4_i +#endif + ADD r3, r3, #0x40 + RSBS r10, r3, #0x100 + ADD r0, r0, #0x70 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_invntt_loop_4_j +#else + BNE.N L_kyber_thumb2_invntt_loop_4_j +#endif + SUB r0, r0, #0x200 + MOV r2, #0x10 +L_kyber_thumb2_invntt_loop_321: + STR r2, [sp] + LDRH lr, [r1, #2] + LDR r2, [r0] + LDR r3, [r0, #64] + LDR r4, [r0, #128] + LDR r5, [r0, #192] + LDR r6, [r0, #256] + LDR r7, [r0, #320] + LDR r8, [r0, #384] + LDR r9, [r0, #448] + LDR lr, [r1, #240] +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r2, r3 + SADD16 r2, r2, r3 + SMULBT r3, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r3 + SMLABB r3, r12, r11, r3 + PKHTB r3, r3, r10, ASR #16 +#else + SUB r11, r2, r3 + ADD r12, r2, r3 + BFC r3, #0, #16 + BFC r2, #0, #16 + SUB r10, r2, r3 + ADD r2, r2, r3 + BFI r10, r11, #0, #16 + BFI r2, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r3, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r3, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r3, r12, r11, r3 + BFI r3, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r4, r5 + SADD16 r4, r4, r5 + SMULTT r5, lr, r10 + SMULTB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r5 + SMLABB r5, r12, r11, r5 + PKHTB r5, r5, r10, ASR #16 +#else + SUB r11, r4, r5 + ADD r12, r4, r5 + BFC r5, #0, #16 + BFC r4, #0, #16 + SUB r10, r4, r5 + ADD r4, r4, r5 + BFI r10, r11, #0, #16 + BFI r4, r12, #0, #16 + SBFX r11, lr, #16, #16 + ASR r12, r10, #16 + MUL r5, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, 
r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r5, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r5, r12, r11, r5 + BFI r5, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [r1, #244] +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r6, r7 + SADD16 r6, r6, r7 + SMULBT r7, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r7 + SMLABB r7, r12, r11, r7 + PKHTB r7, r7, r10, ASR #16 +#else + SUB r11, r6, r7 + ADD r12, r6, r7 + BFC r7, #0, #16 + BFC r6, #0, #16 + SUB r10, r6, r7 + ADD r6, r6, r7 + BFI r10, r11, #0, #16 + BFI r6, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r7, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r7, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r7, r12, r11, r7 + BFI r7, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r8, r9 + SADD16 r8, r8, r9 + SMULTT r9, lr, r10 + SMULTB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r9, r12, r11, r9 + PKHTB r9, r9, r10, ASR #16 +#else + SUB r11, r8, r9 + ADD r12, r8, r9 + BFC r9, #0, #16 + BFC r8, #0, #16 + SUB r10, r8, r9 + ADD r8, r8, r9 + BFI r10, r11, #0, #16 + BFI r8, r12, #0, #16 + SBFX r11, lr, #16, #16 + ASR r12, r10, #16 + MUL r9, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r9, r12, r11, r9 + BFI r9, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [r1, #248] +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r2, r4 + SADD16 r2, r2, r4 + SMULBT r4, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r4 + SMLABB r4, r12, r11, r4 + PKHTB r4, r4, r10, ASR #16 +#else + SUB r11, r2, r4 + ADD r12, r2, r4 + BFC r4, #0, #16 + BFC r2, #0, #16 + SUB r10, r2, r4 + ADD r2, r2, r4 + BFI r10, r11, #0, #16 + BFI r2, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r4, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r4, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r4, r12, r11, r4 + BFI r4, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r3, r5 + SADD16 r3, r3, r5 + SMULBT r5, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r5 + SMLABB r5, r12, r11, r5 + PKHTB r5, r5, r10, ASR #16 +#else + SUB r11, r3, r5 + ADD r12, r3, r5 + BFC r5, #0, #16 + BFC r3, #0, #16 + SUB r10, r3, r5 + ADD r3, r3, r5 + BFI r10, r11, #0, #16 + BFI r3, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r5, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r5, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + 
SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r5, r12, r11, r5 + BFI r5, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r6, r8 + SADD16 r6, r6, r8 + SMULTT r8, lr, r10 + SMULTB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r8 + SMLABB r8, r12, r11, r8 + PKHTB r8, r8, r10, ASR #16 +#else + SUB r11, r6, r8 + ADD r12, r6, r8 + BFC r8, #0, #16 + BFC r6, #0, #16 + SUB r10, r6, r8 + ADD r6, r6, r8 + BFI r10, r11, #0, #16 + BFI r6, r12, #0, #16 + SBFX r11, lr, #16, #16 + ASR r12, r10, #16 + MUL r8, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r8, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r8, r12, r11, r8 + BFI r8, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r7, r9 + SADD16 r7, r7, r9 + SMULTT r9, lr, r10 + SMULTB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r9, r12, r11, r9 + PKHTB r9, r9, r10, ASR #16 +#else + SUB r11, r7, r9 + ADD r12, r7, r9 + BFC r9, #0, #16 + BFC r7, #0, #16 + SUB r10, r7, r9 + ADD r7, r7, r9 + BFI r10, r11, #0, #16 + BFI r7, r12, #0, #16 + SBFX r11, lr, #16, #16 + ASR r12, r10, #16 + MUL r9, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r9, r12, r11, r9 + BFI r9, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + MOV lr, #0xafc0 + MOVT lr, #0x13 +#else + MOV lr, #0x4ebf +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r2 + SMULWT r11, lr, r2 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r2, r2, r10 +#else + SBFX r10, r2, #0, #16 + SBFX r11, r2, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r2, r11, LSL #16 + SUB r2, r2, r10 + LSR r11, r11, #16 + BFI r2, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r3 + SMULWT r11, lr, r3 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r3, r3, r10 +#else + SBFX r10, r3, #0, #16 + SBFX r11, r3, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r3, r11, LSL #16 + SUB r3, r3, r10 + LSR r11, r11, #16 + BFI r3, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r4 + SMULWT r11, lr, r4 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r4, r4, r10 +#else + SBFX r10, r4, #0, #16 + SBFX r11, r4, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r4, r11, LSL #16 + SUB r4, r4, r10 + LSR r11, r11, #16 + BFI r4, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULWB r10, lr, r5 + SMULWT r11, lr, r5 + SMULBT r10, r12, r10 + SMULBT r11, r12, r11 + PKHBT r10, r10, r11, LSL #16 + SSUB16 r5, r5, r10 +#else + SBFX r10, r5, #0, #16 + SBFX 
r11, r5, #16, #16 + MUL r10, lr, r10 + MUL r11, lr, r11 + ASR r10, r10, #26 + ASR r11, r11, #26 + MUL r10, r12, r10 + MUL r11, r12, r11 + SUB r11, r5, r11, LSL #16 + SUB r5, r5, r10 + LSR r11, r11, #16 + BFI r5, r11, #16, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [r1, #252] +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r2, r6 + SADD16 r2, r2, r6 + SMULBT r6, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r6 + SMLABB r6, r12, r11, r6 + PKHTB r6, r6, r10, ASR #16 +#else + SUB r11, r2, r6 + ADD r12, r2, r6 + BFC r6, #0, #16 + BFC r2, #0, #16 + SUB r10, r2, r6 + ADD r2, r2, r6 + BFI r10, r11, #0, #16 + BFI r2, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r6, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r6, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r6, r12, r11, r6 + BFI r6, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r3, r7 + SADD16 r3, r3, r7 + SMULBT r7, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r7 + SMLABB r7, r12, r11, r7 + PKHTB r7, r7, r10, ASR #16 +#else + SUB r11, r3, r7 + ADD r12, r3, r7 + BFC r7, #0, #16 + BFC r3, #0, #16 + SUB r10, r3, r7 + ADD r3, r3, r7 + BFI r10, r11, #0, #16 + BFI r3, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r7, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r7, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r7, r12, r11, r7 + BFI r7, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r4, r8 + SADD16 r4, r4, r8 + SMULBT r8, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r8 + SMLABB r8, r12, r11, r8 + PKHTB r8, r8, r10, ASR #16 +#else + SUB r11, r4, r8 + ADD r12, r4, r8 + BFC r8, #0, #16 + BFC r4, #0, #16 + SUB r10, r4, r8 + ADD r4, r4, r8 + BFI r10, r11, #0, #16 + BFI r4, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r8, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r8, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r8, r12, r11, r8 + BFI r8, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r10, r5, r9 + SADD16 r5, r5, r9 + SMULBT r9, lr, r10 + SMULBB r10, lr, r10 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r9, r12, r11, r9 + PKHTB r9, r9, r10, ASR #16 +#else + SUB r11, r5, r9 + ADD r12, r5, r9 + BFC r9, #0, #16 + BFC r5, #0, #16 + SUB r10, r5, r9 + ADD r5, r5, r9 + BFI r10, r11, #0, #16 + BFI r5, r12, #0, #16 + SBFX r11, lr, #0, #16 + ASR r12, r10, #16 + MUL r9, r11, r12 + SBFX r10, r10, #0, #16 + MUL r10, r11, r10 + MOV r12, #0xcff + SBFX r11, r10, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX 
r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r9, r12, r11, r9 + BFI r9, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + LDR lr, [r1, #254] +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r2 + SMULBT r2, lr, r2 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r2 + SMLABB r2, r12, r11, r2 + PKHTB r2, r2, r10, ASR #16 +#else + SBFX r11, lr, #0, #16 + SBFX r10, r2, #0, #16 + MUL r10, r11, r10 + SBFX r2, r2, #16, #16 + MUL r2, r11, r2 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r2, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r2, r12, r11, r2 + BFI r2, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r3 + SMULBT r3, lr, r3 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r3 + SMLABB r3, r12, r11, r3 + PKHTB r3, r3, r10, ASR #16 +#else + SBFX r11, lr, #0, #16 + SBFX r10, r3, #0, #16 + MUL r10, r11, r10 + SBFX r3, r3, #16, #16 + MUL r3, r11, r3 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r3, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r3, r12, r11, r3 + BFI r3, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r4 + SMULBT r4, lr, r4 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r4 + SMLABB r4, r12, r11, r4 + PKHTB r4, r4, r10, ASR #16 +#else + SBFX r11, lr, #0, #16 + SBFX r10, r4, #0, #16 + MUL r10, r11, r10 + SBFX r4, r4, #16, #16 + MUL r4, r11, r4 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r4, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r4, r12, r11, r4 + BFI r4, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r5 + SMULBT r5, lr, r5 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r5 + SMLABB r5, r12, r11, r5 + PKHTB r5, r5, r10, ASR #16 +#else + SBFX r11, lr, #0, #16 + SBFX r10, r5, #0, #16 + MUL r10, r11, r10 + SBFX r5, r5, #16, #16 + MUL r5, r11, r5 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r5, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r5, r12, r11, r5 + BFI r5, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r6 + SMULBT r6, lr, r6 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r6 + SMLABB r6, r12, r11, r6 + PKHTB r6, r6, r10, ASR #16 +#else + SBFX r11, lr, #0, #16 + SBFX r10, r6, #0, #16 + MUL r10, r11, r10 + SBFX r6, r6, #16, #16 + MUL r6, r11, r6 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r6, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r6, r12, r11, r6 + BFI r6, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r7 + SMULBT r7, lr, r7 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r7 + SMLABB r7, r12, r11, r7 + PKHTB r7, r7, r10, ASR #16 +#else + SBFX r11, lr, #0, #16 + SBFX r10, r7, #0, #16 + MUL r10, r11, r10 + SBFX 
r7, r7, #16, #16 + MUL r7, r11, r7 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r7, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r7, r12, r11, r7 + BFI r7, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r8 + SMULBT r8, lr, r8 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r8 + SMLABB r8, r12, r11, r8 + PKHTB r8, r8, r10, ASR #16 +#else + SBFX r11, lr, #0, #16 + SBFX r10, r8, #0, #16 + MUL r10, r11, r10 + SBFX r8, r8, #16, #16 + MUL r8, r11, r8 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r8, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r8, r12, r11, r8 + BFI r8, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + SMULBB r10, lr, r9 + SMULBT r9, lr, r9 + SMULTB r11, r12, r10 + SMLABB r10, r12, r11, r10 + SMULTB r11, r12, r9 + SMLABB r9, r12, r11, r9 + PKHTB r9, r9, r10, ASR #16 +#else + SBFX r11, lr, #0, #16 + SBFX r10, r9, #0, #16 + MUL r10, r11, r10 + SBFX r9, r9, #16, #16 + MUL r9, r11, r9 + MOV r12, #0xcff + MUL r11, r12, r10 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + MLA r10, r12, r11, r10 + MOV r12, #0xcff + SBFX r11, r9, #0, #16 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r11, r11, #0, #16 + LSR r10, r10, #16 + MLA r9, r12, r11, r9 + BFI r9, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + STR r2, [r0] + STR r3, [r0, #64] + STR r4, [r0, #128] + STR r5, [r0, #192] + STR r6, [r0, #256] + STR r7, [r0, #320] + STR r8, [r0, #384] + STR r9, [r0, #448] + LDR r2, [sp] + SUBS r2, r2, #0x1 + ADD r0, r0, #0x4 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_invntt_loop_321 +#else + BNE.N L_kyber_thumb2_invntt_loop_321 +#endif + ADD sp, sp, #0x8 + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 1629 */ + .size kyber_thumb2_invntt,.-kyber_thumb2_invntt + .text + .type L_kyber_thumb2_basemul_mont_zetas, %object + .size L_kyber_thumb2_basemul_mont_zetas, 256 + .align 4 +L_kyber_thumb2_basemul_mont_zetas: + .short 0x8ed + .short 0xa0b + .short 0xb9a + .short 0x714 + .short 0x5d5 + .short 0x58e + .short 0x11f + .short 0xca + .short 0xc56 + .short 0x26e + .short 0x629 + .short 0xb6 + .short 0x3c2 + .short 0x84f + .short 0x73f + .short 0x5bc + .short 0x23d + .short 0x7d4 + .short 0x108 + .short 0x17f + .short 0x9c4 + .short 0x5b2 + .short 0x6bf + .short 0xc7f + .short 0xa58 + .short 0x3f9 + .short 0x2dc + .short 0x260 + .short 0x6fb + .short 0x19b + .short 0xc34 + .short 0x6de + .short 0x4c7 + .short 0x28c + .short 0xad9 + .short 0x3f7 + .short 0x7f4 + .short 0x5d3 + .short 0xbe7 + .short 0x6f9 + .short 0x204 + .short 0xcf9 + .short 0xbc1 + .short 0xa67 + .short 0x6af + .short 0x877 + .short 0x7e + .short 0x5bd + .short 0x9ac + .short 0xca7 + .short 0xbf2 + .short 0x33e + .short 0x6b + .short 0x774 + .short 0xc0a + .short 0x94a + .short 0xb73 + .short 0x3c1 + .short 0x71d + .short 0xa2c + .short 0x1c0 + .short 0x8d8 + .short 0x2a5 + .short 0x806 + .short 0x8b2 + .short 0x1ae + .short 0x22b + .short 0x34b + .short 0x81e + .short 0x367 + .short 0x60e + .short 0x69 + .short 0x1a6 + .short 0x24b + .short 0xb1 + .short 0xc16 + .short 0xbde + .short 0xb35 + .short 0x626 + .short 0x675 + .short 0xc0b + .short 0x30a + .short 0x487 + .short 0xc6e + .short 0x9f8 + 
.short 0x5cb + .short 0xaa7 + .short 0x45f + .short 0x6cb + .short 0x284 + .short 0x999 + .short 0x15d + .short 0x1a2 + .short 0x149 + .short 0xc65 + .short 0xcb6 + .short 0x331 + .short 0x449 + .short 0x25b + .short 0x262 + .short 0x52a + .short 0x7fc + .short 0x748 + .short 0x180 + .short 0x842 + .short 0xc79 + .short 0x4c2 + .short 0x7ca + .short 0x997 + .short 0xdc + .short 0x85e + .short 0x686 + .short 0x860 + .short 0x707 + .short 0x803 + .short 0x31a + .short 0x71b + .short 0x9ab + .short 0x99b + .short 0x1de + .short 0xc95 + .short 0xbcd + .short 0x3e4 + .short 0x3df + .short 0x3be + .short 0x74d + .short 0x5f2 + .short 0x65c + .text + .align 4 + .globl kyber_thumb2_basemul_mont + .type kyber_thumb2_basemul_mont, %function +kyber_thumb2_basemul_mont: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + ADR r3, L_kyber_thumb2_basemul_mont_zetas + ADD r3, r3, #0x80 +#ifndef WOLFSSL_ARM_ARCH_7M + MOV r12, #0xd01 + MOVT r12, #0xcff +#endif /* !WOLFSSL_ARM_ARCH_7M */ + MOV r8, #0x0 +L_kyber_thumb2_basemul_mont_loop: + LDM r1!, {r4, r5} + LDM r2!, {r6, r7} + LDR lr, [r3, r8] + ADD r8, r8, #0x2 + PUSH {r8} + CMP r8, #0x80 +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTT r8, r4, r6 + SMULTT r10, r5, r7 + SMULTB r9, r12, r8 + SMULTB r11, r12, r10 + SMLABB r8, r12, r9, r8 + SMLABB r10, r12, r11, r10 + RSB r11, lr, #0x0 + SMULBT r8, lr, r8 + SMULBT r10, r11, r10 + SMLABB r8, r4, r6, r8 + SMLABB r10, r5, r7, r10 + SMULTB r9, r12, r8 + SMULTB r11, r12, r10 + SMLABB r8, r12, r9, r8 + SMLABB r10, r12, r11, r10 + SMULBT r9, r4, r6 + SMULBT r11, r5, r7 + SMLATB r9, r4, r6, r9 + SMLATB r11, r5, r7, r11 + SMULTB r6, r12, r9 + SMULTB r7, r12, r11 + SMLABB r9, r12, r6, r9 + SMLABB r11, r12, r7, r11 + PKHTB r4, r9, r8, ASR #16 + PKHTB r5, r11, r10, ASR #16 +#else + ASR r8, r4, #16 + ASR r10, r5, #16 + ASR r9, r6, #16 + ASR r11, r7, #16 + MUL r8, r8, r9 + MUL r10, r10, r11 + MOV r12, #0xcff + SBFX r9, r8, #0, #16 + SBFX r11, r10, #0, #16 + MUL r9, r12, r8 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r9, r9, #0, #16 + SBFX r11, r11, #0, #16 + MLA r8, r12, r9, r8 + MLA r10, r12, r11, r10 + RSB r11, lr, #0x0 + SBFX r9, lr, #0, #16 + SBFX r11, r11, #0, #16 + ASR r8, r8, #16 + ASR r10, r10, #16 + MUL r8, r9, r8 + MUL r10, r11, r10 + SBFX r9, r4, #0, #16 + SBFX r11, r5, #0, #16 + SBFX r12, r6, #0, #16 + MLA r8, r9, r12, r8 + SBFX r12, r7, #0, #16 + MLA r10, r11, r12, r10 + MOV r12, #0xcff + SBFX r9, r8, #0, #16 + SBFX r11, r10, #0, #16 + MUL r9, r12, r9 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r9, r9, #0, #16 + SBFX r11, r11, #0, #16 + MLA r8, r12, r9, r8 + MLA r10, r12, r11, r10 + SBFX r9, r4, #0, #16 + SBFX r11, r5, #0, #16 + ASR r12, r6, #16 + MUL r9, r9, r12 + ASR r12, r7, #16 + MUL r11, r11, r12 + ASR r4, r4, #16 + ASR r5, r5, #16 + SBFX r12, r6, #0, #16 + MLA r9, r4, r12, r9 + SBFX r12, r7, #0, #16 + MLA r11, r5, r12, r11 + MOV r12, #0xcff + SBFX r6, r9, #0, #16 + SBFX r7, r11, #0, #16 + MUL r6, r12, r6 + MUL r7, r12, r7 + MOV r12, #0xd01 + SBFX r4, r6, #0, #16 + SBFX r5, r7, #0, #16 + MLA r9, r12, r4, r9 + MLA r11, r12, r5, r11 + BFC r9, #0, #16 + BFC r11, #0, #16 + ORR r4, r9, r8, LSR #16 + ORR r5, r11, r10, LSR #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + STM r0!, {r4, r5} + POP {r8} +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_basemul_mont_loop +#else + BNE.N L_kyber_thumb2_basemul_mont_loop +#endif + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 146 */ + .size kyber_thumb2_basemul_mont,.-kyber_thumb2_basemul_mont + .text + .align 4 + .globl 
kyber_thumb2_basemul_mont_add + .type kyber_thumb2_basemul_mont_add, %function +kyber_thumb2_basemul_mont_add: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + ADR r3, L_kyber_thumb2_basemul_mont_zetas + ADD r3, r3, #0x80 +#ifndef WOLFSSL_ARM_ARCH_7M + MOV r12, #0xd01 + MOVT r12, #0xcff +#endif /* !WOLFSSL_ARM_ARCH_7M */ + MOV r8, #0x0 +L_kyber_thumb2_basemul_mont_add_loop: + LDM r1!, {r4, r5} + LDM r2!, {r6, r7} + LDR lr, [r3, r8] + ADD r8, r8, #0x2 + PUSH {r8} + CMP r8, #0x80 +#ifndef WOLFSSL_ARM_ARCH_7M + SMULTT r8, r4, r6 + SMULTT r10, r5, r7 + SMULTB r9, r12, r8 + SMULTB r11, r12, r10 + SMLABB r8, r12, r9, r8 + SMLABB r10, r12, r11, r10 + RSB r11, lr, #0x0 + SMULBT r8, lr, r8 + SMULBT r10, r11, r10 + SMLABB r8, r4, r6, r8 + SMLABB r10, r5, r7, r10 + SMULTB r9, r12, r8 + SMULTB r11, r12, r10 + SMLABB r8, r12, r9, r8 + SMLABB r10, r12, r11, r10 + SMULBT r9, r4, r6 + SMULBT r11, r5, r7 + SMLATB r9, r4, r6, r9 + SMLATB r11, r5, r7, r11 + SMULTB r6, r12, r9 + SMULTB r7, r12, r11 + SMLABB r9, r12, r6, r9 + SMLABB r11, r12, r7, r11 + LDM r0, {r4, r5} + PKHTB r9, r9, r8, ASR #16 + PKHTB r11, r11, r10, ASR #16 + SADD16 r4, r4, r9 + SADD16 r5, r5, r11 +#else + ASR r8, r4, #16 + ASR r10, r5, #16 + ASR r9, r6, #16 + ASR r11, r7, #16 + MUL r8, r8, r9 + MUL r10, r10, r11 + MOV r12, #0xcff + SBFX r9, r8, #0, #16 + SBFX r11, r10, #0, #16 + MUL r9, r12, r8 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r9, r9, #0, #16 + SBFX r11, r11, #0, #16 + MLA r8, r12, r9, r8 + MLA r10, r12, r11, r10 + RSB r11, lr, #0x0 + SBFX r9, lr, #0, #16 + SBFX r11, r11, #0, #16 + ASR r8, r8, #16 + ASR r10, r10, #16 + MUL r8, r9, r8 + MUL r10, r11, r10 + SBFX r9, r4, #0, #16 + SBFX r11, r5, #0, #16 + SBFX r12, r6, #0, #16 + MLA r8, r9, r12, r8 + SBFX r12, r7, #0, #16 + MLA r10, r11, r12, r10 + MOV r12, #0xcff + SBFX r9, r8, #0, #16 + SBFX r11, r10, #0, #16 + MUL r9, r12, r9 + MUL r11, r12, r11 + MOV r12, #0xd01 + SBFX r9, r9, #0, #16 + SBFX r11, r11, #0, #16 + MLA r8, r12, r9, r8 + MLA r10, r12, r11, r10 + SBFX r9, r4, #0, #16 + SBFX r11, r5, #0, #16 + ASR r12, r6, #16 + MUL r9, r9, r12 + ASR r12, r7, #16 + MUL r11, r11, r12 + ASR r4, r4, #16 + ASR r5, r5, #16 + SBFX r12, r6, #0, #16 + MLA r9, r4, r12, r9 + SBFX r12, r7, #0, #16 + MLA r11, r5, r12, r11 + MOV r12, #0xcff + SBFX r6, r9, #0, #16 + SBFX r7, r11, #0, #16 + MUL r6, r12, r6 + MUL r7, r12, r7 + MOV r12, #0xd01 + SBFX r4, r6, #0, #16 + SBFX r5, r7, #0, #16 + MLA r9, r12, r4, r9 + MLA r11, r12, r5, r11 + LDM r0, {r4, r5} + BFC r9, #0, #16 + BFC r11, #0, #16 + ORR r9, r9, r8, LSR #16 + ORR r11, r11, r10, LSR #16 + ADD r8, r4, r9 + ADD r10, r5, r11 + BFC r9, #0, #16 + BFC r11, #0, #16 + ADD r4, r4, r9 + ADD r5, r5, r11 + BFI r4, r8, #0, #16 + BFI r5, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + STM r0!, {r4, r5} + POP {r8} +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_basemul_mont_add_loop +#else + BNE.N L_kyber_thumb2_basemul_mont_add_loop +#endif + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 162 */ + .size kyber_thumb2_basemul_mont_add,.-kyber_thumb2_basemul_mont_add + .text + .align 4 + .globl kyber_thumb2_csubq + .type kyber_thumb2_csubq, %function +kyber_thumb2_csubq: + PUSH {r4, r5, r6, r7, r8, r9, r10, r11, lr} + MOV r11, #0xd01 + MOV r12, #0xd01 +#ifndef WOLFSSL_ARM_ARCH_7M + MOVT r12, #0xd01 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + MOV lr, #0x8000 + MOVT lr, #0x8000 + MOV r1, #0x100 +L_kyber_thumb2_csubq_loop: + LDM r0, {r2, r3, r4, r5} +#ifndef WOLFSSL_ARM_ARCH_7M + SSUB16 r2, r2, r12 + 
SSUB16 r3, r3, r12 + SSUB16 r4, r4, r12 + SSUB16 r5, r5, r12 + AND r6, r2, lr + AND r7, r3, lr + AND r8, r4, lr + AND r9, r5, lr + LSR r6, r6, #15 + LSR r7, r7, #15 + LSR r8, r8, #15 + LSR r9, r9, #15 + MUL r6, r6, r11 + MUL r7, r7, r11 + MUL r8, r8, r11 + MUL r9, r9, r11 + SADD16 r2, r2, r6 + SADD16 r3, r3, r7 + SADD16 r4, r4, r8 + SADD16 r5, r5, r9 +#else + SUB r6, r2, r12 + SUB r2, r2, r12, LSL #16 + BFI r2, r6, #0, #16 + SUB r7, r3, r12 + SUB r3, r3, r12, LSL #16 + BFI r3, r7, #0, #16 + SUB r8, r4, r12 + SUB r4, r4, r12, LSL #16 + BFI r4, r8, #0, #16 + SUB r9, r5, r12 + SUB r5, r5, r12, LSL #16 + BFI r5, r9, #0, #16 + AND r6, r2, lr + AND r7, r3, lr + AND r8, r4, lr + AND r9, r5, lr + LSR r6, r6, #15 + LSR r7, r7, #15 + LSR r8, r8, #15 + LSR r9, r9, #15 + MUL r6, r6, r11 + MUL r7, r7, r11 + MUL r8, r8, r11 + MUL r9, r9, r11 + ADD r10, r2, r6 + BFC r6, #0, #16 + ADD r2, r2, r6 + BFI r2, r10, #0, #16 + ADD r10, r3, r7 + BFC r7, #0, #16 + ADD r3, r3, r7 + BFI r3, r10, #0, #16 + ADD r10, r4, r8 + BFC r8, #0, #16 + ADD r4, r4, r8 + BFI r4, r10, #0, #16 + ADD r10, r5, r9 + BFC r9, #0, #16 + ADD r5, r5, r9 + BFI r5, r10, #0, #16 +#endif /* !WOLFSSL_ARM_ARCH_7M */ + STM r0!, {r2, r3, r4, r5} + SUBS r1, r1, #0x8 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_csubq_loop +#else + BNE.N L_kyber_thumb2_csubq_loop +#endif + POP {r4, r5, r6, r7, r8, r9, r10, r11, pc} + /* Cycle Count = 101 */ + .size kyber_thumb2_csubq,.-kyber_thumb2_csubq + .text + .align 4 + .globl kyber_thumb2_rej_uniform + .type kyber_thumb2_rej_uniform, %function +kyber_thumb2_rej_uniform: + PUSH {r4, r5, r6, r7, r8, r9, r10, lr} + MOV r8, #0xd01 + MOV r9, #0x0 +L_kyber_thumb2_rej_uniform_loop_no_fail: + CMP r1, #0x8 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BLT L_kyber_thumb2_rej_uniform_done_no_fail +#else + BLT.N L_kyber_thumb2_rej_uniform_done_no_fail +#endif + LDM r2!, {r4, r5, r6} + UBFX r7, r4, #0, #12 + STRH r7, [r0, r9] + SUB r10, r7, r8 + LSR r10, r10, #31 + SUB r1, r1, r10 + ADD r9, r9, r10, LSL #1 + UBFX r7, r4, #12, #12 + STRH r7, [r0, r9] + SUB r10, r7, r8 + LSR r10, r10, #31 + SUB r1, r1, r10 + ADD r9, r9, r10, LSL #1 + UBFX r7, r4, #24, #8 + BFI r7, r5, #8, #4 + STRH r7, [r0, r9] + SUB r10, r7, r8 + LSR r10, r10, #31 + SUB r1, r1, r10 + ADD r9, r9, r10, LSL #1 + UBFX r7, r5, #4, #12 + STRH r7, [r0, r9] + SUB r10, r7, r8 + LSR r10, r10, #31 + SUB r1, r1, r10 + ADD r9, r9, r10, LSL #1 + UBFX r7, r5, #16, #12 + STRH r7, [r0, r9] + SUB r10, r7, r8 + LSR r10, r10, #31 + SUB r1, r1, r10 + ADD r9, r9, r10, LSL #1 + UBFX r7, r5, #28, #4 + BFI r7, r6, #4, #8 + STRH r7, [r0, r9] + SUB r10, r7, r8 + LSR r10, r10, #31 + SUB r1, r1, r10 + ADD r9, r9, r10, LSL #1 + UBFX r7, r6, #8, #12 + STRH r7, [r0, r9] + SUB r10, r7, r8 + LSR r10, r10, #31 + SUB r1, r1, r10 + ADD r9, r9, r10, LSL #1 + UBFX r7, r6, #20, #12 + STRH r7, [r0, r9] + SUB r10, r7, r8 + LSR r10, r10, #31 + SUB r1, r1, r10 + ADD r9, r9, r10, LSL #1 + SUBS r3, r3, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BNE L_kyber_thumb2_rej_uniform_loop_no_fail +#else + BNE.N L_kyber_thumb2_rej_uniform_loop_no_fail +#endif +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + B L_kyber_thumb2_rej_uniform_done +#else + B.N L_kyber_thumb2_rej_uniform_done +#endif +L_kyber_thumb2_rej_uniform_done_no_fail: + CMP r1, #0x0 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ 
L_kyber_thumb2_rej_uniform_done +#else + BEQ.N L_kyber_thumb2_rej_uniform_done +#endif +L_kyber_thumb2_rej_uniform_loop: + LDM r2!, {r4, r5, r6} + UBFX r7, r4, #0, #12 + CMP r7, r8 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BGE L_kyber_thumb2_rej_uniform_fail_0 +#else + BGE.N L_kyber_thumb2_rej_uniform_fail_0 +#endif + STRH r7, [r0, r9] + SUBS r1, r1, #0x1 + ADD r9, r9, #0x2 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_kyber_thumb2_rej_uniform_done +#else + BEQ.N L_kyber_thumb2_rej_uniform_done +#endif +L_kyber_thumb2_rej_uniform_fail_0: + UBFX r7, r4, #12, #12 + CMP r7, r8 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BGE L_kyber_thumb2_rej_uniform_fail_1 +#else + BGE.N L_kyber_thumb2_rej_uniform_fail_1 +#endif + STRH r7, [r0, r9] + SUBS r1, r1, #0x1 + ADD r9, r9, #0x2 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_kyber_thumb2_rej_uniform_done +#else + BEQ.N L_kyber_thumb2_rej_uniform_done +#endif +L_kyber_thumb2_rej_uniform_fail_1: + UBFX r7, r4, #24, #8 + BFI r7, r5, #8, #4 + CMP r7, r8 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BGE L_kyber_thumb2_rej_uniform_fail_2 +#else + BGE.N L_kyber_thumb2_rej_uniform_fail_2 +#endif + STRH r7, [r0, r9] + SUBS r1, r1, #0x1 + ADD r9, r9, #0x2 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_kyber_thumb2_rej_uniform_done +#else + BEQ.N L_kyber_thumb2_rej_uniform_done +#endif +L_kyber_thumb2_rej_uniform_fail_2: + UBFX r7, r5, #4, #12 + CMP r7, r8 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BGE L_kyber_thumb2_rej_uniform_fail_3 +#else + BGE.N L_kyber_thumb2_rej_uniform_fail_3 +#endif + STRH r7, [r0, r9] + SUBS r1, r1, #0x1 + ADD r9, r9, #0x2 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_kyber_thumb2_rej_uniform_done +#else + BEQ.N L_kyber_thumb2_rej_uniform_done +#endif +L_kyber_thumb2_rej_uniform_fail_3: + UBFX r7, r5, #16, #12 + CMP r7, r8 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BGE L_kyber_thumb2_rej_uniform_fail_4 +#else + BGE.N L_kyber_thumb2_rej_uniform_fail_4 +#endif + STRH r7, [r0, r9] + SUBS r1, r1, #0x1 + ADD r9, r9, #0x2 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_kyber_thumb2_rej_uniform_done +#else + BEQ.N L_kyber_thumb2_rej_uniform_done +#endif +L_kyber_thumb2_rej_uniform_fail_4: + UBFX r7, r5, #28, #4 + BFI r7, r6, #4, #8 + CMP r7, r8 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BGE L_kyber_thumb2_rej_uniform_fail_5 +#else + BGE.N L_kyber_thumb2_rej_uniform_fail_5 +#endif + STRH r7, [r0, r9] + SUBS r1, r1, #0x1 + ADD r9, r9, #0x2 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_kyber_thumb2_rej_uniform_done +#else + BEQ.N L_kyber_thumb2_rej_uniform_done +#endif +L_kyber_thumb2_rej_uniform_fail_5: + UBFX r7, r6, #8, #12 + CMP r7, r8 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BGE L_kyber_thumb2_rej_uniform_fail_6 +#else + BGE.N L_kyber_thumb2_rej_uniform_fail_6 +#endif + STRH r7, [r0, r9] + SUBS r1, r1, #0x1 + ADD r9, r9, #0x2 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_kyber_thumb2_rej_uniform_done +#else + BEQ.N L_kyber_thumb2_rej_uniform_done +#endif +L_kyber_thumb2_rej_uniform_fail_6: + UBFX r7, r6, #20, #12 + 
CMP r7, r8 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BGE L_kyber_thumb2_rej_uniform_fail_7 +#else + BGE.N L_kyber_thumb2_rej_uniform_fail_7 +#endif + STRH r7, [r0, r9] + SUBS r1, r1, #0x1 + ADD r9, r9, #0x2 +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BEQ L_kyber_thumb2_rej_uniform_done +#else + BEQ.N L_kyber_thumb2_rej_uniform_done +#endif +L_kyber_thumb2_rej_uniform_fail_7: + SUBS r3, r3, #0xc +#if defined(__GNUC__) || defined(__ICCARM__) || defined(__IAR_SYSTEMS_ICC__) + BGT L_kyber_thumb2_rej_uniform_loop +#else + BGT.N L_kyber_thumb2_rej_uniform_loop +#endif +L_kyber_thumb2_rej_uniform_done: + LSR r0, r9, #1 + POP {r4, r5, r6, r7, r8, r9, r10, pc} + /* Cycle Count = 225 */ + .size kyber_thumb2_rej_uniform,.-kyber_thumb2_rej_uniform +#endif /* WOLFSSL_WC_KYBER */ +#endif /* !__aarch64__ && __thumb__ */ +#endif /* WOLFSSL_ARMASM */ + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif +#endif /* !WOLFSSL_ARMASM_INLINE */ diff --git a/wolfcrypt/src/port/arm/thumb2-kyber-asm_c.c b/wolfcrypt/src/port/arm/thumb2-kyber-asm_c.c new file mode 100644 index 000000000..5c0895779 --- /dev/null +++ b/wolfcrypt/src/port/arm/thumb2-kyber-asm_c.c @@ -0,0 +1,3851 @@ +/* thumb2-kyber-asm + * + * Copyright (C) 2006-2024 wolfSSL Inc. + * + * This file is part of wolfSSL. + * + * wolfSSL is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * wolfSSL is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA + */ + +/* Generated using (from wolfssl): + * cd ../scripts + * ruby ./kyber/kyber.rb thumb2 ../wolfssl/wolfcrypt/src/port/arm/thumb2-kyber-asm.c + */ + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif /* HAVE_CONFIG_H */ +#include <wolfssl/wolfcrypt/settings.h> +#include <wolfssl/wolfcrypt/error-crypt.h> + +#ifdef WOLFSSL_ARMASM +#if !defined(__aarch64__) && defined(__thumb__) +#ifdef WOLFSSL_ARMASM_INLINE + +#ifdef __IAR_SYSTEMS_ICC__ +#define __asm__ asm +#define __volatile__ volatile +#define WOLFSSL_NO_VAR_ASSIGN_REG +#endif /* __IAR_SYSTEMS_ICC__ */ +#ifdef __KEIL__ +#define __asm__ __asm +#define __volatile__ volatile +#endif /* __KEIL__ */ +#include <wolfssl/wolfcrypt/wc_kyber.h> + +#ifdef WOLFSSL_WC_KYBER +XALIGNED(16) static const uint16_t L_kyber_thumb2_ntt_zetas[] = { + 0x08ed, 0x0a0b, 0x0b9a, 0x0714, 0x05d5, 0x058e, 0x011f, 0x00ca, + 0x0c56, 0x026e, 0x0629, 0x00b6, 0x03c2, 0x084f, 0x073f, 0x05bc, + 0x023d, 0x07d4, 0x0108, 0x017f, 0x09c4, 0x05b2, 0x06bf, 0x0c7f, + 0x0a58, 0x03f9, 0x02dc, 0x0260, 0x06fb, 0x019b, 0x0c34, 0x06de, + 0x04c7, 0x028c, 0x0ad9, 0x03f7, 0x07f4, 0x05d3, 0x0be7, 0x06f9, + 0x0204, 0x0cf9, 0x0bc1, 0x0a67, 0x06af, 0x0877, 0x007e, 0x05bd, + 0x09ac, 0x0ca7, 0x0bf2, 0x033e, 0x006b, 0x0774, 0x0c0a, 0x094a, + 0x0b73, 0x03c1, 0x071d, 0x0a2c, 0x01c0, 0x08d8, 0x02a5, 0x0806, + 0x08b2, 0x01ae, 0x022b, 0x034b, 0x081e, 0x0367, 0x060e, 0x0069, + 0x01a6, 0x024b, 0x00b1, 0x0c16, 0x0bde, 0x0b35, 0x0626, 0x0675, + 0x0c0b, 0x030a, 0x0487, 0x0c6e, 0x09f8, 0x05cb, 0x0aa7, 0x045f, + 0x06cb, 0x0284, 0x0999, 0x015d, 0x01a2, 0x0149, 0x0c65, 0x0cb6, + 0x0331, 0x0449, 0x025b, 0x0262, 0x052a, 0x07fc, 0x0748, 0x0180, + 0x0842, 0x0c79, 0x04c2, 0x07ca, 0x0997, 0x00dc, 0x085e, 0x0686, + 0x0860, 0x0707, 0x0803, 0x031a, 0x071b, 0x09ab, 0x099b, 0x01de, + 0x0c95, 0x0bcd, 0x03e4, 0x03df, 0x03be, 0x074d, 0x05f2, 0x065c, +}; + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void kyber_thumb2_ntt(sword16* r_p) +#else +void kyber_thumb2_ntt(sword16* r) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register sword16* r __asm__ ("r0") = (sword16*)r_p; + register uint16_t* L_kyber_thumb2_ntt_zetas_c __asm__ ("r1") = (uint16_t*)&L_kyber_thumb2_ntt_zetas; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "SUB sp, sp, #0x8\n\t" + "MOV r1, %[L_kyber_thumb2_ntt_zetas]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "MOV r12, #0xd01\n\t" + "MOVT r12, #0xcff\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "MOV r2, #0x10\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_ntt_loop_123:\n\t" +#else + "L_kyber_thumb2_ntt_loop_123_%=:\n\t" +#endif + "STR r2, [sp]\n\t" + "LDRH lr, [r1, #2]\n\t" + "LDR r2, [%[r]]\n\t" + "LDR r3, [%[r], #64]\n\t" + "LDR r4, [%[r], #128]\n\t" + "LDR r5, [%[r], #192]\n\t" + "LDR r6, [%[r], #256]\n\t" + "LDR r7, [%[r], #320]\n\t" + "LDR r8, [%[r], #384]\n\t" + "LDR r9, [%[r], #448]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r6\n\t" + "SMULBT r6, lr, r6\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r6\n\t" + "SMLABB r11, r12, r11, r6\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r6, r2, r10\n\t" + "SADD16 r2, r2, r10\n\t" +#else + "SBFX r10, r6, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r6, r6, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r6, r11, r6\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, 
#16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r6, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r6\n\t" + "SUB r6, r2, r11\n\t" + "ADD r2, r2, r11\n\t" + "SUB r11, r2, r10, LSR #16\n\t" + "ADD r10, r2, r10, LSR #16\n\t" + "BFI r6, r11, #0, #16\n\t" + "BFI r2, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r7\n\t" + "SMULBT r7, lr, r7\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r7\n\t" + "SMLABB r11, r12, r11, r7\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r7, r3, r10\n\t" + "SADD16 r3, r3, r10\n\t" +#else + "SBFX r10, r7, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r7, r7, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r7, r11, r7\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r7, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r7\n\t" + "SUB r7, r3, r11\n\t" + "ADD r3, r3, r11\n\t" + "SUB r11, r3, r10, LSR #16\n\t" + "ADD r10, r3, r10, LSR #16\n\t" + "BFI r7, r11, #0, #16\n\t" + "BFI r3, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r8\n\t" + "SMULBT r8, lr, r8\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r8\n\t" + "SMLABB r11, r12, r11, r8\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r8, r4, r10\n\t" + "SADD16 r4, r4, r10\n\t" +#else + "SBFX r10, r8, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r8, r8, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r8, r11, r8\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r8, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r8\n\t" + "SUB r8, r4, r11\n\t" + "ADD r4, r4, r11\n\t" + "SUB r11, r4, r10, LSR #16\n\t" + "ADD r10, r4, r10, LSR #16\n\t" + "BFI r8, r11, #0, #16\n\t" + "BFI r4, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r9\n\t" + "SMULBT r9, lr, r9\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r11, r12, r11, r9\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r9, r5, r10\n\t" + "SADD16 r5, r5, r10\n\t" +#else + "SBFX r10, r9, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r9, r9, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r9, r11, r9\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r9\n\t" + "SUB r9, r5, r11\n\t" + "ADD r5, r5, r11\n\t" + "SUB r11, r5, r10, LSR #16\n\t" + "ADD r10, r5, r10, LSR #16\n\t" + "BFI r9, r11, #0, #16\n\t" + "BFI r5, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [r1, #4]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r4\n\t" + "SMULBT r4, lr, r4\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r4\n\t" + "SMLABB r11, r12, r11, r4\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r4, r2, r10\n\t" + 
"SADD16 r2, r2, r10\n\t" +#else + "SBFX r10, r4, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r4, r4, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r4, r11, r4\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r4, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r4\n\t" + "SUB r4, r2, r11\n\t" + "ADD r2, r2, r11\n\t" + "SUB r11, r2, r10, LSR #16\n\t" + "ADD r10, r2, r10, LSR #16\n\t" + "BFI r4, r11, #0, #16\n\t" + "BFI r2, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r5\n\t" + "SMULBT r5, lr, r5\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r5\n\t" + "SMLABB r11, r12, r11, r5\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r5, r3, r10\n\t" + "SADD16 r3, r3, r10\n\t" +#else + "SBFX r10, r5, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r5, r5, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r5, r11, r5\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r5, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r5\n\t" + "SUB r5, r3, r11\n\t" + "ADD r3, r3, r11\n\t" + "SUB r11, r3, r10, LSR #16\n\t" + "ADD r10, r3, r10, LSR #16\n\t" + "BFI r5, r11, #0, #16\n\t" + "BFI r3, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTB r10, lr, r8\n\t" + "SMULTT r8, lr, r8\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r8\n\t" + "SMLABB r11, r12, r11, r8\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r8, r6, r10\n\t" + "SADD16 r6, r6, r10\n\t" +#else + "SBFX r10, r8, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r8, r8, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r8, r11, r8\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r8, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r8\n\t" + "SUB r8, r6, r11\n\t" + "ADD r6, r6, r11\n\t" + "SUB r11, r6, r10, LSR #16\n\t" + "ADD r10, r6, r10, LSR #16\n\t" + "BFI r8, r11, #0, #16\n\t" + "BFI r6, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTB r10, lr, r9\n\t" + "SMULTT r9, lr, r9\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r11, r12, r11, r9\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r9, r7, r10\n\t" + "SADD16 r7, r7, r10\n\t" +#else + "SBFX r10, r9, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r9, r9, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r9, r11, r9\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r9\n\t" + "SUB r9, r7, r11\n\t" + "ADD r7, r7, r11\n\t" + "SUB r11, r7, r10, LSR #16\n\t" + "ADD r10, r7, r10, LSR #16\n\t" + "BFI r9, r11, #0, #16\n\t" + "BFI r7, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [r1, #8]\n\t" 
+#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r3\n\t" + "SMULBT r3, lr, r3\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r3\n\t" + "SMLABB r11, r12, r11, r3\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r3, r2, r10\n\t" + "SADD16 r2, r2, r10\n\t" +#else + "SBFX r10, r3, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r3, r3, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r3, r11, r3\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r3, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r3\n\t" + "SUB r3, r2, r11\n\t" + "ADD r2, r2, r11\n\t" + "SUB r11, r2, r10, LSR #16\n\t" + "ADD r10, r2, r10, LSR #16\n\t" + "BFI r3, r11, #0, #16\n\t" + "BFI r2, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTB r10, lr, r5\n\t" + "SMULTT r5, lr, r5\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r5\n\t" + "SMLABB r11, r12, r11, r5\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r5, r4, r10\n\t" + "SADD16 r4, r4, r10\n\t" +#else + "SBFX r10, r5, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r5, r5, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r5, r11, r5\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r5, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r5\n\t" + "SUB r5, r4, r11\n\t" + "ADD r4, r4, r11\n\t" + "SUB r11, r4, r10, LSR #16\n\t" + "ADD r10, r4, r10, LSR #16\n\t" + "BFI r5, r11, #0, #16\n\t" + "BFI r4, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [r1, #12]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r7\n\t" + "SMULBT r7, lr, r7\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r7\n\t" + "SMLABB r11, r12, r11, r7\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r7, r6, r10\n\t" + "SADD16 r6, r6, r10\n\t" +#else + "SBFX r10, r7, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r7, r7, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r7, r11, r7\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r7, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r7\n\t" + "SUB r7, r6, r11\n\t" + "ADD r6, r6, r11\n\t" + "SUB r11, r6, r10, LSR #16\n\t" + "ADD r10, r6, r10, LSR #16\n\t" + "BFI r7, r11, #0, #16\n\t" + "BFI r6, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTB r10, lr, r9\n\t" + "SMULTT r9, lr, r9\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r11, r12, r11, r9\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r9, r8, r10\n\t" + "SADD16 r8, r8, r10\n\t" +#else + "SBFX r10, r9, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r9, r9, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r9, r11, r9\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + 
"SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r9\n\t" + "SUB r9, r8, r11\n\t" + "ADD r8, r8, r11\n\t" + "SUB r11, r8, r10, LSR #16\n\t" + "ADD r10, r8, r10, LSR #16\n\t" + "BFI r9, r11, #0, #16\n\t" + "BFI r8, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "STR r2, [%[r]]\n\t" + "STR r3, [%[r], #64]\n\t" + "STR r4, [%[r], #128]\n\t" + "STR r5, [%[r], #192]\n\t" + "STR r6, [%[r], #256]\n\t" + "STR r7, [%[r], #320]\n\t" + "STR r8, [%[r], #384]\n\t" + "STR r9, [%[r], #448]\n\t" + "LDR r2, [sp]\n\t" + "SUBS r2, r2, #0x1\n\t" + "ADD %[r], %[r], #0x4\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_ntt_loop_123_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_ntt_loop_123\n\t" +#else + "BNE.N L_kyber_thumb2_ntt_loop_123_%=\n\t" +#endif + "SUB %[r], %[r], #0x40\n\t" + "MOV r3, #0x0\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_ntt_loop_4_j:\n\t" +#else + "L_kyber_thumb2_ntt_loop_4_j_%=:\n\t" +#endif + "STR r3, [sp, #4]\n\t" + "ADD lr, r1, r3, LSR #4\n\t" + "MOV r2, #0x4\n\t" + "LDR lr, [lr, #16]\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_ntt_loop_4_i:\n\t" +#else + "L_kyber_thumb2_ntt_loop_4_i_%=:\n\t" +#endif + "STR r2, [sp]\n\t" + "LDR r2, [%[r]]\n\t" + "LDR r3, [%[r], #16]\n\t" + "LDR r4, [%[r], #32]\n\t" + "LDR r5, [%[r], #48]\n\t" + "LDR r6, [%[r], #64]\n\t" + "LDR r7, [%[r], #80]\n\t" + "LDR r8, [%[r], #96]\n\t" + "LDR r9, [%[r], #112]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r4\n\t" + "SMULBT r4, lr, r4\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r4\n\t" + "SMLABB r11, r12, r11, r4\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r4, r2, r10\n\t" + "SADD16 r2, r2, r10\n\t" +#else + "SBFX r10, r4, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r4, r4, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r4, r11, r4\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r4, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r4\n\t" + "SUB r4, r2, r11\n\t" + "ADD r2, r2, r11\n\t" + "SUB r11, r2, r10, LSR #16\n\t" + "ADD r10, r2, r10, LSR #16\n\t" + "BFI r4, r11, #0, #16\n\t" + "BFI r2, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r5\n\t" + "SMULBT r5, lr, r5\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r5\n\t" + "SMLABB r11, r12, r11, r5\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r5, r3, r10\n\t" + "SADD16 r3, r3, r10\n\t" +#else + "SBFX r10, r5, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r5, r5, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r5, r11, r5\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r5, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r5\n\t" + "SUB r5, r3, r11\n\t" + "ADD r3, r3, r11\n\t" + "SUB r11, r3, r10, LSR #16\n\t" + "ADD r10, r3, r10, LSR #16\n\t" + "BFI r5, r11, #0, #16\n\t" + "BFI r3, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTB r10, lr, r8\n\t" + "SMULTT r8, lr, r8\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB 
r11, r12, r8\n\t" + "SMLABB r11, r12, r11, r8\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r8, r6, r10\n\t" + "SADD16 r6, r6, r10\n\t" +#else + "SBFX r10, r8, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r8, r8, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r8, r11, r8\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r8, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r8\n\t" + "SUB r8, r6, r11\n\t" + "ADD r6, r6, r11\n\t" + "SUB r11, r6, r10, LSR #16\n\t" + "ADD r10, r6, r10, LSR #16\n\t" + "BFI r8, r11, #0, #16\n\t" + "BFI r6, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTB r10, lr, r9\n\t" + "SMULTT r9, lr, r9\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r11, r12, r11, r9\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r9, r7, r10\n\t" + "SADD16 r7, r7, r10\n\t" +#else + "SBFX r10, r9, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r9, r9, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r9, r11, r9\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r9\n\t" + "SUB r9, r7, r11\n\t" + "ADD r7, r7, r11\n\t" + "SUB r11, r7, r10, LSR #16\n\t" + "ADD r10, r7, r10, LSR #16\n\t" + "BFI r9, r11, #0, #16\n\t" + "BFI r7, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "STR r2, [%[r]]\n\t" + "STR r3, [%[r], #16]\n\t" + "STR r4, [%[r], #32]\n\t" + "STR r5, [%[r], #48]\n\t" + "STR r6, [%[r], #64]\n\t" + "STR r7, [%[r], #80]\n\t" + "STR r8, [%[r], #96]\n\t" + "STR r9, [%[r], #112]\n\t" + "LDRD r2, r3, [sp]\n\t" + "SUBS r2, r2, #0x1\n\t" + "ADD %[r], %[r], #0x4\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_ntt_loop_4_i_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_ntt_loop_4_i\n\t" +#else + "BNE.N L_kyber_thumb2_ntt_loop_4_i_%=\n\t" +#endif + "ADD r3, r3, #0x40\n\t" + "RSBS r10, r3, #0x100\n\t" + "ADD %[r], %[r], #0x70\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_ntt_loop_4_j_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_ntt_loop_4_j\n\t" +#else + "BNE.N L_kyber_thumb2_ntt_loop_4_j_%=\n\t" +#endif + "SUB %[r], %[r], #0x200\n\t" + "MOV r3, #0x0\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_ntt_loop_567:\n\t" +#else + "L_kyber_thumb2_ntt_loop_567_%=:\n\t" +#endif + "ADD lr, r1, r3, LSR #3\n\t" + "STR r3, [sp, #4]\n\t" + "LDRH lr, [lr, #32]\n\t" + "LDR r2, [%[r]]\n\t" + "LDR r3, [%[r], #4]\n\t" + "LDR r4, [%[r], #8]\n\t" + "LDR r5, [%[r], #12]\n\t" + "LDR r6, [%[r], #16]\n\t" + "LDR r7, [%[r], #20]\n\t" + "LDR r8, [%[r], #24]\n\t" + "LDR r9, [%[r], #28]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r6\n\t" + "SMULBT r6, lr, r6\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r6\n\t" + "SMLABB r11, r12, r11, r6\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r6, r2, r10\n\t" + "SADD16 r2, r2, r10\n\t" +#else + "SBFX r10, r6, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r6, r6, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r6, r11, r6\n\t" + "MOV r12, #0xcff\n\t" + 
"MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r6, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r6\n\t" + "SUB r6, r2, r11\n\t" + "ADD r2, r2, r11\n\t" + "SUB r11, r2, r10, LSR #16\n\t" + "ADD r10, r2, r10, LSR #16\n\t" + "BFI r6, r11, #0, #16\n\t" + "BFI r2, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r7\n\t" + "SMULBT r7, lr, r7\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r7\n\t" + "SMLABB r11, r12, r11, r7\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r7, r3, r10\n\t" + "SADD16 r3, r3, r10\n\t" +#else + "SBFX r10, r7, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r7, r7, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r7, r11, r7\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r7, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r7\n\t" + "SUB r7, r3, r11\n\t" + "ADD r3, r3, r11\n\t" + "SUB r11, r3, r10, LSR #16\n\t" + "ADD r10, r3, r10, LSR #16\n\t" + "BFI r7, r11, #0, #16\n\t" + "BFI r3, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r8\n\t" + "SMULBT r8, lr, r8\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r8\n\t" + "SMLABB r11, r12, r11, r8\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r8, r4, r10\n\t" + "SADD16 r4, r4, r10\n\t" +#else + "SBFX r10, r8, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r8, r8, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r8, r11, r8\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r8, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r8\n\t" + "SUB r8, r4, r11\n\t" + "ADD r4, r4, r11\n\t" + "SUB r11, r4, r10, LSR #16\n\t" + "ADD r10, r4, r10, LSR #16\n\t" + "BFI r8, r11, #0, #16\n\t" + "BFI r4, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r9\n\t" + "SMULBT r9, lr, r9\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r11, r12, r11, r9\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r9, r5, r10\n\t" + "SADD16 r5, r5, r10\n\t" +#else + "SBFX r10, r9, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r9, r9, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r9, r11, r9\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r9\n\t" + "SUB r9, r5, r11\n\t" + "ADD r5, r5, r11\n\t" + "SUB r11, r5, r10, LSR #16\n\t" + "ADD r10, r5, r10, LSR #16\n\t" + "BFI r9, r11, #0, #16\n\t" + "BFI r5, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [sp, #4]\n\t" + "ADD lr, r1, lr, LSR #2\n\t" + "LDR lr, [lr, #64]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r4\n\t" + "SMULBT r4, lr, r4\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" 
+ "SMULTB r11, r12, r4\n\t" + "SMLABB r11, r12, r11, r4\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r4, r2, r10\n\t" + "SADD16 r2, r2, r10\n\t" +#else + "SBFX r10, r4, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r4, r4, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r4, r11, r4\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r4, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r4\n\t" + "SUB r4, r2, r11\n\t" + "ADD r2, r2, r11\n\t" + "SUB r11, r2, r10, LSR #16\n\t" + "ADD r10, r2, r10, LSR #16\n\t" + "BFI r4, r11, #0, #16\n\t" + "BFI r2, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r5\n\t" + "SMULBT r5, lr, r5\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r5\n\t" + "SMLABB r11, r12, r11, r5\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r5, r3, r10\n\t" + "SADD16 r3, r3, r10\n\t" +#else + "SBFX r10, r5, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r5, r5, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r5, r11, r5\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r5, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r5\n\t" + "SUB r5, r3, r11\n\t" + "ADD r3, r3, r11\n\t" + "SUB r11, r3, r10, LSR #16\n\t" + "ADD r10, r3, r10, LSR #16\n\t" + "BFI r5, r11, #0, #16\n\t" + "BFI r3, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTB r10, lr, r8\n\t" + "SMULTT r8, lr, r8\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r8\n\t" + "SMLABB r11, r12, r11, r8\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r8, r6, r10\n\t" + "SADD16 r6, r6, r10\n\t" +#else + "SBFX r10, r8, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r8, r8, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r8, r11, r8\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r8, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r8\n\t" + "SUB r8, r6, r11\n\t" + "ADD r6, r6, r11\n\t" + "SUB r11, r6, r10, LSR #16\n\t" + "ADD r10, r6, r10, LSR #16\n\t" + "BFI r8, r11, #0, #16\n\t" + "BFI r6, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTB r10, lr, r9\n\t" + "SMULTT r9, lr, r9\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r11, r12, r11, r9\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r9, r7, r10\n\t" + "SADD16 r7, r7, r10\n\t" +#else + "SBFX r10, r9, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r9, r9, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r9, r11, r9\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r9\n\t" + "SUB r9, r7, r11\n\t" + "ADD r7, r7, r11\n\t" + "SUB r11, r7, r10, LSR #16\n\t" + "ADD r10, r7, r10, LSR 
#16\n\t" + "BFI r9, r11, #0, #16\n\t" + "BFI r7, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [sp, #4]\n\t" + "ADD lr, r1, lr, LSR #1\n\t" + "LDR lr, [lr, #128]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r3\n\t" + "SMULBT r3, lr, r3\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r3\n\t" + "SMLABB r11, r12, r11, r3\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r3, r2, r10\n\t" + "SADD16 r2, r2, r10\n\t" +#else + "SBFX r10, r3, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r3, r3, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r3, r11, r3\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r3, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r3\n\t" + "SUB r3, r2, r11\n\t" + "ADD r2, r2, r11\n\t" + "SUB r11, r2, r10, LSR #16\n\t" + "ADD r10, r2, r10, LSR #16\n\t" + "BFI r3, r11, #0, #16\n\t" + "BFI r2, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTB r10, lr, r5\n\t" + "SMULTT r5, lr, r5\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r5\n\t" + "SMLABB r11, r12, r11, r5\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r5, r4, r10\n\t" + "SADD16 r4, r4, r10\n\t" +#else + "SBFX r10, r5, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r5, r5, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r5, r11, r5\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r5, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r5\n\t" + "SUB r5, r4, r11\n\t" + "ADD r4, r4, r11\n\t" + "SUB r11, r4, r10, LSR #16\n\t" + "ADD r10, r4, r10, LSR #16\n\t" + "BFI r5, r11, #0, #16\n\t" + "BFI r4, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [sp, #4]\n\t" + "ADD lr, r1, lr, LSR #1\n\t" + "LDR lr, [lr, #132]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r7\n\t" + "SMULBT r7, lr, r7\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r7\n\t" + "SMLABB r11, r12, r11, r7\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r7, r6, r10\n\t" + "SADD16 r6, r6, r10\n\t" +#else + "SBFX r10, r7, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r7, r7, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r7, r11, r7\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r7, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r7\n\t" + "SUB r7, r6, r11\n\t" + "ADD r6, r6, r11\n\t" + "SUB r11, r6, r10, LSR #16\n\t" + "ADD r10, r6, r10, LSR #16\n\t" + "BFI r7, r11, #0, #16\n\t" + "BFI r6, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTB r10, lr, r9\n\t" + "SMULTT r9, lr, r9\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r11, r12, r11, r9\n\t" + "PKHTB r10, r11, r10, ASR #16\n\t" + "SSUB16 r9, r8, r10\n\t" + "SADD16 r8, r8, r10\n\t" +#else + "SBFX r10, r9, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r9, r9, #16\n\t" + "MUL r10, r11, r10\n\t" + "MUL r9, r11, r9\n\t" + 
"MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r11, r12, r11, r9\n\t" + "SUB r9, r8, r11\n\t" + "ADD r8, r8, r11\n\t" + "SUB r11, r8, r10, LSR #16\n\t" + "ADD r10, r8, r10, LSR #16\n\t" + "BFI r9, r11, #0, #16\n\t" + "BFI r8, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "MOV lr, #0xafc0\n\t" + "MOVT lr, #0x13\n\t" +#else + "MOV lr, #0x4ebf\n\t" + "MOV r12, #0xd01\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r2\n\t" + "SMULWT r11, lr, r2\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r2, r2, r10\n\t" +#else + "SBFX r10, r2, #0, #16\n\t" + "SBFX r11, r2, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r2, r11, LSL #16\n\t" + "SUB r2, r2, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r2, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r3\n\t" + "SMULWT r11, lr, r3\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r3, r3, r10\n\t" +#else + "SBFX r10, r3, #0, #16\n\t" + "SBFX r11, r3, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r3, r11, LSL #16\n\t" + "SUB r3, r3, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r3, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r4\n\t" + "SMULWT r11, lr, r4\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r4, r4, r10\n\t" +#else + "SBFX r10, r4, #0, #16\n\t" + "SBFX r11, r4, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r4, r11, LSL #16\n\t" + "SUB r4, r4, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r4, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r5\n\t" + "SMULWT r11, lr, r5\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r5, r5, r10\n\t" +#else + "SBFX r10, r5, #0, #16\n\t" + "SBFX r11, r5, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r5, r11, LSL #16\n\t" + "SUB r5, r5, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r5, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r6\n\t" + "SMULWT r11, lr, r6\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r6, r6, r10\n\t" +#else + "SBFX r10, r6, #0, #16\n\t" + "SBFX r11, r6, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r6, r11, LSL #16\n\t" + "SUB r6, r6, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r6, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ 
+#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r7\n\t" + "SMULWT r11, lr, r7\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r7, r7, r10\n\t" +#else + "SBFX r10, r7, #0, #16\n\t" + "SBFX r11, r7, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r7, r11, LSL #16\n\t" + "SUB r7, r7, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r7, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r8\n\t" + "SMULWT r11, lr, r8\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r8, r8, r10\n\t" +#else + "SBFX r10, r8, #0, #16\n\t" + "SBFX r11, r8, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r8, r11, LSL #16\n\t" + "SUB r8, r8, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r8, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r9\n\t" + "SMULWT r11, lr, r9\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r9, r9, r10\n\t" +#else + "SBFX r10, r9, #0, #16\n\t" + "SBFX r11, r9, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r9, r11, LSL #16\n\t" + "SUB r9, r9, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r9, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "MOV r12, #0xd01\n\t" + "MOVT r12, #0xcff\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "STR r2, [%[r]]\n\t" + "STR r3, [%[r], #4]\n\t" + "STR r4, [%[r], #8]\n\t" + "STR r5, [%[r], #12]\n\t" + "STR r6, [%[r], #16]\n\t" + "STR r7, [%[r], #20]\n\t" + "STR r8, [%[r], #24]\n\t" + "STR r9, [%[r], #28]\n\t" + "LDR r3, [sp, #4]\n\t" + "ADD r3, r3, #0x10\n\t" + "RSBS r10, r3, #0x100\n\t" + "ADD %[r], %[r], #0x20\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_ntt_loop_567_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_ntt_loop_567\n\t" +#else + "BNE.N L_kyber_thumb2_ntt_loop_567_%=\n\t" +#endif + "ADD sp, sp, #0x8\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [r] "+r" (r), + [L_kyber_thumb2_ntt_zetas] "+r" (L_kyber_thumb2_ntt_zetas_c) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" +#else + : [r] "+r" (r) + : [L_kyber_thumb2_ntt_zetas] "r" (L_kyber_thumb2_ntt_zetas) + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +XALIGNED(16) static const uint16_t L_kyber_thumb2_invntt_zetas_inv[] = { + 0x06a5, 0x070f, 0x05b4, 0x0943, 0x0922, 0x091d, 0x0134, 0x006c, + 0x0b23, 0x0366, 0x0356, 0x05e6, 0x09e7, 0x04fe, 0x05fa, 0x04a1, + 0x067b, 0x04a3, 0x0c25, 0x036a, 0x0537, 0x083f, 0x0088, 0x04bf, + 0x0b81, 0x05b9, 0x0505, 0x07d7, 0x0a9f, 0x0aa6, 0x08b8, 0x09d0, + 0x004b, 0x009c, 0x0bb8, 0x0b5f, 0x0ba4, 0x0368, 0x0a7d, 0x0636, + 0x08a2, 0x025a, 0x0736, 0x0309, 0x0093, 0x087a, 0x09f7, 0x00f6, + 0x068c, 0x06db, 0x01cc, 0x0123, 0x00eb, 0x0c50, 0x0ab6, 0x0b5b, + 0x0c98, 0x06f3, 0x099a, 0x04e3, 0x09b6, 0x0ad6, 0x0b53, 0x044f, + 0x04fb, 0x0a5c, 0x0429, 0x0b41, 0x02d5, 0x05e4, 0x0940, 0x018e, + 0x03b7, 0x00f7, 
0x058d, 0x0c96, 0x09c3, 0x010f, 0x005a, 0x0355, + 0x0744, 0x0c83, 0x048a, 0x0652, 0x029a, 0x0140, 0x0008, 0x0afd, + 0x0608, 0x011a, 0x072e, 0x050d, 0x090a, 0x0228, 0x0a75, 0x083a, + 0x0623, 0x00cd, 0x0b66, 0x0606, 0x0aa1, 0x0a25, 0x0908, 0x02a9, + 0x0082, 0x0642, 0x074f, 0x033d, 0x0b82, 0x0bf9, 0x052d, 0x0ac4, + 0x0745, 0x05c2, 0x04b2, 0x093f, 0x0c4b, 0x06d8, 0x0a93, 0x00ab, + 0x0c37, 0x0be2, 0x0773, 0x072c, 0x05ed, 0x0167, 0x02f6, 0x05a1, +}; + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void kyber_thumb2_invntt(sword16* r_p) +#else +void kyber_thumb2_invntt(sword16* r) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register sword16* r __asm__ ("r0") = (sword16*)r_p; + register uint16_t* L_kyber_thumb2_invntt_zetas_inv_c __asm__ ("r1") = (uint16_t*)&L_kyber_thumb2_invntt_zetas_inv; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "SUB sp, sp, #0x8\n\t" + "MOV r1, %[L_kyber_thumb2_invntt_zetas_inv]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "MOV r12, #0xd01\n\t" + "MOVT r12, #0xcff\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "MOV r3, #0x0\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_invntt_loop_765:\n\t" +#else + "L_kyber_thumb2_invntt_loop_765_%=:\n\t" +#endif + "ADD lr, r1, r3, LSR #1\n\t" + "STR r3, [sp, #4]\n\t" + "LDR r2, [%[r]]\n\t" + "LDR r3, [%[r], #4]\n\t" + "LDR r4, [%[r], #8]\n\t" + "LDR r5, [%[r], #12]\n\t" + "LDR r6, [%[r], #16]\n\t" + "LDR r7, [%[r], #20]\n\t" + "LDR r8, [%[r], #24]\n\t" + "LDR r9, [%[r], #28]\n\t" + "LDR lr, [lr]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r2, r3\n\t" + "SADD16 r2, r2, r3\n\t" + "SMULBT r3, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r3\n\t" + "SMLABB r3, r12, r11, r3\n\t" + "PKHTB r3, r3, r10, ASR #16\n\t" +#else + "SUB r11, r2, r3\n\t" + "ADD r12, r2, r3\n\t" + "BFC r3, #0, #16\n\t" + "BFC r2, #0, #16\n\t" + "SUB r10, r2, r3\n\t" + "ADD r2, r2, r3\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r2, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r3, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r3, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r3, r12, r11, r3\n\t" + "BFI r3, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r4, r5\n\t" + "SADD16 r4, r4, r5\n\t" + "SMULTT r5, lr, r10\n\t" + "SMULTB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r5\n\t" + "SMLABB r5, r12, r11, r5\n\t" + "PKHTB r5, r5, r10, ASR #16\n\t" +#else + "SUB r11, r4, r5\n\t" + "ADD r12, r4, r5\n\t" + "BFC r5, #0, #16\n\t" + "BFC r4, #0, #16\n\t" + "SUB r10, r4, r5\n\t" + "ADD r4, r4, r5\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r4, r12, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r5, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r5, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, 
#16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r5, r12, r11, r5\n\t" + "BFI r5, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [sp, #4]\n\t" + "ADD lr, r1, lr, LSR #1\n\t" + "LDR lr, [lr, #4]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r6, r7\n\t" + "SADD16 r6, r6, r7\n\t" + "SMULBT r7, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r7\n\t" + "SMLABB r7, r12, r11, r7\n\t" + "PKHTB r7, r7, r10, ASR #16\n\t" +#else + "SUB r11, r6, r7\n\t" + "ADD r12, r6, r7\n\t" + "BFC r7, #0, #16\n\t" + "BFC r6, #0, #16\n\t" + "SUB r10, r6, r7\n\t" + "ADD r6, r6, r7\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r6, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r7, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r7, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r7, r12, r11, r7\n\t" + "BFI r7, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r8, r9\n\t" + "SADD16 r8, r8, r9\n\t" + "SMULTT r9, lr, r10\n\t" + "SMULTB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r9, r12, r11, r9\n\t" + "PKHTB r9, r9, r10, ASR #16\n\t" +#else + "SUB r11, r8, r9\n\t" + "ADD r12, r8, r9\n\t" + "BFC r9, #0, #16\n\t" + "BFC r8, #0, #16\n\t" + "SUB r10, r8, r9\n\t" + "ADD r8, r8, r9\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r8, r12, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r9, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r9, r12, r11, r9\n\t" + "BFI r9, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [sp, #4]\n\t" + "ADD lr, r1, lr, LSR #2\n\t" + "LDR lr, [lr, #128]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r2, r4\n\t" + "SADD16 r2, r2, r4\n\t" + "SMULBT r4, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r4\n\t" + "SMLABB r4, r12, r11, r4\n\t" + "PKHTB r4, r4, r10, ASR #16\n\t" +#else + "SUB r11, r2, r4\n\t" + "ADD r12, r2, r4\n\t" + "BFC r4, #0, #16\n\t" + "BFC r2, #0, #16\n\t" + "SUB r10, r2, r4\n\t" + "ADD r2, r2, r4\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r2, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r4, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r4, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r4, r12, r11, r4\n\t" + "BFI r4, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r3, r5\n\t" + "SADD16 r3, 
r3, r5\n\t" + "SMULBT r5, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r5\n\t" + "SMLABB r5, r12, r11, r5\n\t" + "PKHTB r5, r5, r10, ASR #16\n\t" +#else + "SUB r11, r3, r5\n\t" + "ADD r12, r3, r5\n\t" + "BFC r5, #0, #16\n\t" + "BFC r3, #0, #16\n\t" + "SUB r10, r3, r5\n\t" + "ADD r3, r3, r5\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r3, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r5, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r5, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r5, r12, r11, r5\n\t" + "BFI r5, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r6, r8\n\t" + "SADD16 r6, r6, r8\n\t" + "SMULTT r8, lr, r10\n\t" + "SMULTB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r8\n\t" + "SMLABB r8, r12, r11, r8\n\t" + "PKHTB r8, r8, r10, ASR #16\n\t" +#else + "SUB r11, r6, r8\n\t" + "ADD r12, r6, r8\n\t" + "BFC r8, #0, #16\n\t" + "BFC r6, #0, #16\n\t" + "SUB r10, r6, r8\n\t" + "ADD r6, r6, r8\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r6, r12, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r8, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r8, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r8, r12, r11, r8\n\t" + "BFI r8, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r7, r9\n\t" + "SADD16 r7, r7, r9\n\t" + "SMULTT r9, lr, r10\n\t" + "SMULTB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r9, r12, r11, r9\n\t" + "PKHTB r9, r9, r10, ASR #16\n\t" +#else + "SUB r11, r7, r9\n\t" + "ADD r12, r7, r9\n\t" + "BFC r9, #0, #16\n\t" + "BFC r7, #0, #16\n\t" + "SUB r10, r7, r9\n\t" + "ADD r7, r7, r9\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r7, r12, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r9, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r9, r12, r11, r9\n\t" + "BFI r9, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [sp, #4]\n\t" + "ADD lr, r1, lr, LSR #3\n\t" + "LDR lr, [lr, #192]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r2, r6\n\t" + "SADD16 r2, r2, r6\n\t" + "SMULBT r6, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r6\n\t" + "SMLABB r6, r12, r11, r6\n\t" + "PKHTB r6, r6, r10, ASR #16\n\t" +#else + "SUB r11, r2, r6\n\t" + "ADD r12, r2, r6\n\t" 
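+        /* Note: ARMv7-M lacks SSUB16/SADD16, so the packed 16-bit
+         * Gentleman-Sande butterfly is emulated: the SUB/ADD above capture
+         * the correct bottom-half difference and sum in r11/r12, the
+         * BFC/SUB/ADD sequence below recomputes the top halves with the
+         * bottom bits cleared so no borrow or carry crosses lanes, and BFI
+         * merges the two halves back together. */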
+ "BFC r6, #0, #16\n\t" + "BFC r2, #0, #16\n\t" + "SUB r10, r2, r6\n\t" + "ADD r2, r2, r6\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r2, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r6, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r6, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r6, r12, r11, r6\n\t" + "BFI r6, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r3, r7\n\t" + "SADD16 r3, r3, r7\n\t" + "SMULBT r7, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r7\n\t" + "SMLABB r7, r12, r11, r7\n\t" + "PKHTB r7, r7, r10, ASR #16\n\t" +#else + "SUB r11, r3, r7\n\t" + "ADD r12, r3, r7\n\t" + "BFC r7, #0, #16\n\t" + "BFC r3, #0, #16\n\t" + "SUB r10, r3, r7\n\t" + "ADD r3, r3, r7\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r3, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r7, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r7, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r7, r12, r11, r7\n\t" + "BFI r7, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r4, r8\n\t" + "SADD16 r4, r4, r8\n\t" + "SMULBT r8, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r8\n\t" + "SMLABB r8, r12, r11, r8\n\t" + "PKHTB r8, r8, r10, ASR #16\n\t" +#else + "SUB r11, r4, r8\n\t" + "ADD r12, r4, r8\n\t" + "BFC r8, #0, #16\n\t" + "BFC r4, #0, #16\n\t" + "SUB r10, r4, r8\n\t" + "ADD r4, r4, r8\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r4, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r8, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r8, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r8, r12, r11, r8\n\t" + "BFI r8, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r5, r9\n\t" + "SADD16 r5, r5, r9\n\t" + "SMULBT r9, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r9, r12, r11, r9\n\t" + "PKHTB r9, r9, r10, ASR #16\n\t" +#else + "SUB r11, r5, r9\n\t" + "ADD r12, r5, r9\n\t" + "BFC r9, #0, #16\n\t" + "BFC r5, #0, #16\n\t" + "SUB r10, r5, r9\n\t" + "ADD r5, r5, r9\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r5, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r9, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, 
r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r9, r12, r11, r9\n\t" + "BFI r9, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "MOV lr, #0xafc0\n\t" + "MOVT lr, #0x13\n\t" +#else + "MOV lr, #0x4ebf\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r2\n\t" + "SMULWT r11, lr, r2\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r2, r2, r10\n\t" +#else + "SBFX r10, r2, #0, #16\n\t" + "SBFX r11, r2, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r2, r11, LSL #16\n\t" + "SUB r2, r2, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r2, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r3\n\t" + "SMULWT r11, lr, r3\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r3, r3, r10\n\t" +#else + "SBFX r10, r3, #0, #16\n\t" + "SBFX r11, r3, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r3, r11, LSL #16\n\t" + "SUB r3, r3, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r3, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r4\n\t" + "SMULWT r11, lr, r4\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r4, r4, r10\n\t" +#else + "SBFX r10, r4, #0, #16\n\t" + "SBFX r11, r4, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r4, r11, LSL #16\n\t" + "SUB r4, r4, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r4, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r5\n\t" + "SMULWT r11, lr, r5\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r5, r5, r10\n\t" +#else + "SBFX r10, r5, #0, #16\n\t" + "SBFX r11, r5, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r5, r11, LSL #16\n\t" + "SUB r5, r5, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r5, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "STR r2, [%[r]]\n\t" + "STR r3, [%[r], #4]\n\t" + "STR r4, [%[r], #8]\n\t" + "STR r5, [%[r], #12]\n\t" + "STR r6, [%[r], #16]\n\t" + "STR r7, [%[r], #20]\n\t" + "STR r8, [%[r], #24]\n\t" + "STR r9, [%[r], #28]\n\t" + "LDR r3, [sp, #4]\n\t" + "ADD r3, r3, #0x10\n\t" + "RSBS r10, r3, #0x100\n\t" + "ADD %[r], %[r], #0x20\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_invntt_loop_765_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_invntt_loop_765\n\t" +#else + "BNE.N L_kyber_thumb2_invntt_loop_765_%=\n\t" +#endif + "SUB %[r], %[r], #0x200\n\t" + "MOV r3, #0x0\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_invntt_loop_4_j:\n\t" +#else + 
"L_kyber_thumb2_invntt_loop_4_j_%=:\n\t" +#endif + "STR r3, [sp, #4]\n\t" + "ADD lr, r1, r3, LSR #4\n\t" + "MOV r2, #0x4\n\t" + "LDR lr, [lr, #224]\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_invntt_loop_4_i:\n\t" +#else + "L_kyber_thumb2_invntt_loop_4_i_%=:\n\t" +#endif + "STR r2, [sp]\n\t" + "LDR r2, [%[r]]\n\t" + "LDR r3, [%[r], #16]\n\t" + "LDR r4, [%[r], #32]\n\t" + "LDR r5, [%[r], #48]\n\t" + "LDR r6, [%[r], #64]\n\t" + "LDR r7, [%[r], #80]\n\t" + "LDR r8, [%[r], #96]\n\t" + "LDR r9, [%[r], #112]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r2, r4\n\t" + "SADD16 r2, r2, r4\n\t" + "SMULBT r4, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r4\n\t" + "SMLABB r4, r12, r11, r4\n\t" + "PKHTB r4, r4, r10, ASR #16\n\t" +#else + "SUB r11, r2, r4\n\t" + "ADD r12, r2, r4\n\t" + "BFC r4, #0, #16\n\t" + "BFC r2, #0, #16\n\t" + "SUB r10, r2, r4\n\t" + "ADD r2, r2, r4\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r2, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r4, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r4, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r4, r12, r11, r4\n\t" + "BFI r4, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r3, r5\n\t" + "SADD16 r3, r3, r5\n\t" + "SMULBT r5, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r5\n\t" + "SMLABB r5, r12, r11, r5\n\t" + "PKHTB r5, r5, r10, ASR #16\n\t" +#else + "SUB r11, r3, r5\n\t" + "ADD r12, r3, r5\n\t" + "BFC r5, #0, #16\n\t" + "BFC r3, #0, #16\n\t" + "SUB r10, r3, r5\n\t" + "ADD r3, r3, r5\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r3, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r5, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r5, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r5, r12, r11, r5\n\t" + "BFI r5, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r6, r8\n\t" + "SADD16 r6, r6, r8\n\t" + "SMULTT r8, lr, r10\n\t" + "SMULTB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r8\n\t" + "SMLABB r8, r12, r11, r8\n\t" + "PKHTB r8, r8, r10, ASR #16\n\t" +#else + "SUB r11, r6, r8\n\t" + "ADD r12, r6, r8\n\t" + "BFC r8, #0, #16\n\t" + "BFC r6, #0, #16\n\t" + "SUB r10, r6, r8\n\t" + "ADD r6, r6, r8\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r6, r12, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r8, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r8, #0, #16\n\t" + "MUL 
r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r8, r12, r11, r8\n\t" + "BFI r8, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r7, r9\n\t" + "SADD16 r7, r7, r9\n\t" + "SMULTT r9, lr, r10\n\t" + "SMULTB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r9, r12, r11, r9\n\t" + "PKHTB r9, r9, r10, ASR #16\n\t" +#else + "SUB r11, r7, r9\n\t" + "ADD r12, r7, r9\n\t" + "BFC r9, #0, #16\n\t" + "BFC r7, #0, #16\n\t" + "SUB r10, r7, r9\n\t" + "ADD r7, r7, r9\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r7, r12, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r9, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r9, r12, r11, r9\n\t" + "BFI r9, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "STR r2, [%[r]]\n\t" + "STR r3, [%[r], #16]\n\t" + "STR r4, [%[r], #32]\n\t" + "STR r5, [%[r], #48]\n\t" + "STR r6, [%[r], #64]\n\t" + "STR r7, [%[r], #80]\n\t" + "STR r8, [%[r], #96]\n\t" + "STR r9, [%[r], #112]\n\t" + "LDRD r2, r3, [sp]\n\t" + "SUBS r2, r2, #0x1\n\t" + "ADD %[r], %[r], #0x4\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_invntt_loop_4_i_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_invntt_loop_4_i\n\t" +#else + "BNE.N L_kyber_thumb2_invntt_loop_4_i_%=\n\t" +#endif + "ADD r3, r3, #0x40\n\t" + "RSBS r10, r3, #0x100\n\t" + "ADD %[r], %[r], #0x70\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_invntt_loop_4_j_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_invntt_loop_4_j\n\t" +#else + "BNE.N L_kyber_thumb2_invntt_loop_4_j_%=\n\t" +#endif + "SUB %[r], %[r], #0x200\n\t" + "MOV r2, #0x10\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_invntt_loop_321:\n\t" +#else + "L_kyber_thumb2_invntt_loop_321_%=:\n\t" +#endif + "STR r2, [sp]\n\t" + "LDRH lr, [r1, #2]\n\t" + "LDR r2, [%[r]]\n\t" + "LDR r3, [%[r], #64]\n\t" + "LDR r4, [%[r], #128]\n\t" + "LDR r5, [%[r], #192]\n\t" + "LDR r6, [%[r], #256]\n\t" + "LDR r7, [%[r], #320]\n\t" + "LDR r8, [%[r], #384]\n\t" + "LDR r9, [%[r], #448]\n\t" + "LDR lr, [r1, #240]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r2, r3\n\t" + "SADD16 r2, r2, r3\n\t" + "SMULBT r3, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r3\n\t" + "SMLABB r3, r12, r11, r3\n\t" + "PKHTB r3, r3, r10, ASR #16\n\t" +#else + "SUB r11, r2, r3\n\t" + "ADD r12, r2, r3\n\t" + "BFC r3, #0, #16\n\t" + "BFC r2, #0, #16\n\t" + "SUB r10, r2, r3\n\t" + "ADD r2, r2, r3\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r2, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r3, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r3, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, 
r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r3, r12, r11, r3\n\t" + "BFI r3, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r4, r5\n\t" + "SADD16 r4, r4, r5\n\t" + "SMULTT r5, lr, r10\n\t" + "SMULTB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r5\n\t" + "SMLABB r5, r12, r11, r5\n\t" + "PKHTB r5, r5, r10, ASR #16\n\t" +#else + "SUB r11, r4, r5\n\t" + "ADD r12, r4, r5\n\t" + "BFC r5, #0, #16\n\t" + "BFC r4, #0, #16\n\t" + "SUB r10, r4, r5\n\t" + "ADD r4, r4, r5\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r4, r12, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r5, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r5, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r5, r12, r11, r5\n\t" + "BFI r5, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [r1, #244]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r6, r7\n\t" + "SADD16 r6, r6, r7\n\t" + "SMULBT r7, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r7\n\t" + "SMLABB r7, r12, r11, r7\n\t" + "PKHTB r7, r7, r10, ASR #16\n\t" +#else + "SUB r11, r6, r7\n\t" + "ADD r12, r6, r7\n\t" + "BFC r7, #0, #16\n\t" + "BFC r6, #0, #16\n\t" + "SUB r10, r6, r7\n\t" + "ADD r6, r6, r7\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r6, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r7, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r7, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r7, r12, r11, r7\n\t" + "BFI r7, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r8, r9\n\t" + "SADD16 r8, r8, r9\n\t" + "SMULTT r9, lr, r10\n\t" + "SMULTB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r9, r12, r11, r9\n\t" + "PKHTB r9, r9, r10, ASR #16\n\t" +#else + "SUB r11, r8, r9\n\t" + "ADD r12, r8, r9\n\t" + "BFC r9, #0, #16\n\t" + "BFC r8, #0, #16\n\t" + "SUB r10, r8, r9\n\t" + "ADD r8, r8, r9\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r8, r12, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r9, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r9, r12, r11, r9\n\t" + "BFI r9, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [r1, #248]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r2, r4\n\t" + "SADD16 r2, r2, r4\n\t" + "SMULBT r4, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + 
"SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r4\n\t" + "SMLABB r4, r12, r11, r4\n\t" + "PKHTB r4, r4, r10, ASR #16\n\t" +#else + "SUB r11, r2, r4\n\t" + "ADD r12, r2, r4\n\t" + "BFC r4, #0, #16\n\t" + "BFC r2, #0, #16\n\t" + "SUB r10, r2, r4\n\t" + "ADD r2, r2, r4\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r2, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r4, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r4, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r4, r12, r11, r4\n\t" + "BFI r4, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r3, r5\n\t" + "SADD16 r3, r3, r5\n\t" + "SMULBT r5, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r5\n\t" + "SMLABB r5, r12, r11, r5\n\t" + "PKHTB r5, r5, r10, ASR #16\n\t" +#else + "SUB r11, r3, r5\n\t" + "ADD r12, r3, r5\n\t" + "BFC r5, #0, #16\n\t" + "BFC r3, #0, #16\n\t" + "SUB r10, r3, r5\n\t" + "ADD r3, r3, r5\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r3, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r5, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r5, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r5, r12, r11, r5\n\t" + "BFI r5, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r6, r8\n\t" + "SADD16 r6, r6, r8\n\t" + "SMULTT r8, lr, r10\n\t" + "SMULTB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r8\n\t" + "SMLABB r8, r12, r11, r8\n\t" + "PKHTB r8, r8, r10, ASR #16\n\t" +#else + "SUB r11, r6, r8\n\t" + "ADD r12, r6, r8\n\t" + "BFC r8, #0, #16\n\t" + "BFC r6, #0, #16\n\t" + "SUB r10, r6, r8\n\t" + "ADD r6, r6, r8\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r6, r12, #0, #16\n\t" + "SBFX r11, lr, #16, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r8, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r8, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r8, r12, r11, r8\n\t" + "BFI r8, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r7, r9\n\t" + "SADD16 r7, r7, r9\n\t" + "SMULTT r9, lr, r10\n\t" + "SMULTB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r9, r12, r11, r9\n\t" + "PKHTB r9, r9, r10, ASR #16\n\t" +#else + "SUB r11, r7, r9\n\t" + "ADD r12, r7, r9\n\t" + "BFC r9, #0, #16\n\t" + "BFC r7, #0, #16\n\t" + "SUB r10, r7, r9\n\t" + "ADD r7, r7, r9\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r7, r12, #0, #16\n\t" + "SBFX r11, lr, #16, 
#16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r9, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r9, r12, r11, r9\n\t" + "BFI r9, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "MOV lr, #0xafc0\n\t" + "MOVT lr, #0x13\n\t" +#else + "MOV lr, #0x4ebf\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r2\n\t" + "SMULWT r11, lr, r2\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r2, r2, r10\n\t" +#else + "SBFX r10, r2, #0, #16\n\t" + "SBFX r11, r2, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r2, r11, LSL #16\n\t" + "SUB r2, r2, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r2, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r3\n\t" + "SMULWT r11, lr, r3\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r3, r3, r10\n\t" +#else + "SBFX r10, r3, #0, #16\n\t" + "SBFX r11, r3, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r3, r11, LSL #16\n\t" + "SUB r3, r3, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r3, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r4\n\t" + "SMULWT r11, lr, r4\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r4, r4, r10\n\t" +#else + "SBFX r10, r4, #0, #16\n\t" + "SBFX r11, r4, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r4, r11, LSL #16\n\t" + "SUB r4, r4, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r4, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULWB r10, lr, r5\n\t" + "SMULWT r11, lr, r5\n\t" + "SMULBT r10, r12, r10\n\t" + "SMULBT r11, r12, r11\n\t" + "PKHBT r10, r10, r11, LSL #16\n\t" + "SSUB16 r5, r5, r10\n\t" +#else + "SBFX r10, r5, #0, #16\n\t" + "SBFX r11, r5, #16, #16\n\t" + "MUL r10, lr, r10\n\t" + "MUL r11, lr, r11\n\t" + "ASR r10, r10, #26\n\t" + "ASR r11, r11, #26\n\t" + "MUL r10, r12, r10\n\t" + "MUL r11, r12, r11\n\t" + "SUB r11, r5, r11, LSL #16\n\t" + "SUB r5, r5, r10\n\t" + "LSR r11, r11, #16\n\t" + "BFI r5, r11, #16, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [r1, #252]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r2, r6\n\t" + "SADD16 r2, r2, r6\n\t" + "SMULBT r6, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r6\n\t" + "SMLABB r6, r12, r11, r6\n\t" + "PKHTB r6, r6, r10, ASR #16\n\t" +#else + "SUB r11, r2, r6\n\t" + "ADD r12, r2, r6\n\t" + "BFC r6, #0, #16\n\t" + "BFC r2, #0, #16\n\t" + "SUB r10, r2, r6\n\t" + "ADD r2, r2, r6\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r2, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR 
r12, r10, #16\n\t" + "MUL r6, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r6, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r6, r12, r11, r6\n\t" + "BFI r6, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r3, r7\n\t" + "SADD16 r3, r3, r7\n\t" + "SMULBT r7, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r7\n\t" + "SMLABB r7, r12, r11, r7\n\t" + "PKHTB r7, r7, r10, ASR #16\n\t" +#else + "SUB r11, r3, r7\n\t" + "ADD r12, r3, r7\n\t" + "BFC r7, #0, #16\n\t" + "BFC r3, #0, #16\n\t" + "SUB r10, r3, r7\n\t" + "ADD r3, r3, r7\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r3, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r7, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r7, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r7, r12, r11, r7\n\t" + "BFI r7, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r4, r8\n\t" + "SADD16 r4, r4, r8\n\t" + "SMULBT r8, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r8\n\t" + "SMLABB r8, r12, r11, r8\n\t" + "PKHTB r8, r8, r10, ASR #16\n\t" +#else + "SUB r11, r4, r8\n\t" + "ADD r12, r4, r8\n\t" + "BFC r8, #0, #16\n\t" + "BFC r4, #0, #16\n\t" + "SUB r10, r4, r8\n\t" + "ADD r4, r4, r8\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r4, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r8, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r8, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r8, r12, r11, r8\n\t" + "BFI r8, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r10, r5, r9\n\t" + "SADD16 r5, r5, r9\n\t" + "SMULBT r9, lr, r10\n\t" + "SMULBB r10, lr, r10\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r9, r12, r11, r9\n\t" + "PKHTB r9, r9, r10, ASR #16\n\t" +#else + "SUB r11, r5, r9\n\t" + "ADD r12, r5, r9\n\t" + "BFC r9, #0, #16\n\t" + "BFC r5, #0, #16\n\t" + "SUB r10, r5, r9\n\t" + "ADD r5, r5, r9\n\t" + "BFI r10, r11, #0, #16\n\t" + "BFI r5, r12, #0, #16\n\t" + "SBFX r11, lr, #0, #16\n\t" + "ASR r12, r10, #16\n\t" + "MUL r9, r11, r12\n\t" + "SBFX r10, r10, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, 
#0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r9, r12, r11, r9\n\t" + "BFI r9, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "LDR lr, [r1, #254]\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r2\n\t" + "SMULBT r2, lr, r2\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r2\n\t" + "SMLABB r2, r12, r11, r2\n\t" + "PKHTB r2, r2, r10, ASR #16\n\t" +#else + "SBFX r11, lr, #0, #16\n\t" + "SBFX r10, r2, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "SBFX r2, r2, #16, #16\n\t" + "MUL r2, r11, r2\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r2, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r2, r12, r11, r2\n\t" + "BFI r2, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r3\n\t" + "SMULBT r3, lr, r3\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r3\n\t" + "SMLABB r3, r12, r11, r3\n\t" + "PKHTB r3, r3, r10, ASR #16\n\t" +#else + "SBFX r11, lr, #0, #16\n\t" + "SBFX r10, r3, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "SBFX r3, r3, #16, #16\n\t" + "MUL r3, r11, r3\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r3, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r3, r12, r11, r3\n\t" + "BFI r3, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r4\n\t" + "SMULBT r4, lr, r4\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r4\n\t" + "SMLABB r4, r12, r11, r4\n\t" + "PKHTB r4, r4, r10, ASR #16\n\t" +#else + "SBFX r11, lr, #0, #16\n\t" + "SBFX r10, r4, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "SBFX r4, r4, #16, #16\n\t" + "MUL r4, r11, r4\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r4, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r4, r12, r11, r4\n\t" + "BFI r4, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r5\n\t" + "SMULBT r5, lr, r5\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r5\n\t" + "SMLABB r5, r12, r11, r5\n\t" + "PKHTB r5, r5, r10, ASR #16\n\t" +#else + "SBFX r11, lr, #0, #16\n\t" + "SBFX r10, r5, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "SBFX r5, r5, #16, #16\n\t" + "MUL r5, r11, r5\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r5, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r5, r12, r11, r5\n\t" + "BFI r5, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r6\n\t" + "SMULBT r6, lr, r6\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r6\n\t" + "SMLABB r6, r12, r11, r6\n\t" + "PKHTB r6, r6, r10, ASR #16\n\t" +#else + "SBFX 
r11, lr, #0, #16\n\t" + "SBFX r10, r6, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "SBFX r6, r6, #16, #16\n\t" + "MUL r6, r11, r6\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r6, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r6, r12, r11, r6\n\t" + "BFI r6, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r7\n\t" + "SMULBT r7, lr, r7\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r7\n\t" + "SMLABB r7, r12, r11, r7\n\t" + "PKHTB r7, r7, r10, ASR #16\n\t" +#else + "SBFX r11, lr, #0, #16\n\t" + "SBFX r10, r7, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "SBFX r7, r7, #16, #16\n\t" + "MUL r7, r11, r7\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r7, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r7, r12, r11, r7\n\t" + "BFI r7, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r8\n\t" + "SMULBT r8, lr, r8\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r8\n\t" + "SMLABB r8, r12, r11, r8\n\t" + "PKHTB r8, r8, r10, ASR #16\n\t" +#else + "SBFX r11, lr, #0, #16\n\t" + "SBFX r10, r8, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "SBFX r8, r8, #16, #16\n\t" + "MUL r8, r11, r8\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r8, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r8, r12, r11, r8\n\t" + "BFI r8, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULBB r10, lr, r9\n\t" + "SMULBT r9, lr, r9\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULTB r11, r12, r9\n\t" + "SMLABB r9, r12, r11, r9\n\t" + "PKHTB r9, r9, r10, ASR #16\n\t" +#else + "SBFX r11, lr, #0, #16\n\t" + "SBFX r10, r9, #0, #16\n\t" + "MUL r10, r11, r10\n\t" + "SBFX r9, r9, #16, #16\n\t" + "MUL r9, r11, r9\n\t" + "MOV r12, #0xcff\n\t" + "MUL r11, r12, r10\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r10, r12, r11, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r11, r9, #0, #16\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r11, r11, #0, #16\n\t" + "LSR r10, r10, #16\n\t" + "MLA r9, r12, r11, r9\n\t" + "BFI r9, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "STR r2, [%[r]]\n\t" + "STR r3, [%[r], #64]\n\t" + "STR r4, [%[r], #128]\n\t" + "STR r5, [%[r], #192]\n\t" + "STR r6, [%[r], #256]\n\t" + "STR r7, [%[r], #320]\n\t" + "STR r8, [%[r], #384]\n\t" + "STR r9, [%[r], #448]\n\t" + "LDR r2, [sp]\n\t" + "SUBS r2, r2, #0x1\n\t" + "ADD %[r], %[r], #0x4\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_invntt_loop_321_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_invntt_loop_321\n\t" +#else + "BNE.N L_kyber_thumb2_invntt_loop_321_%=\n\t" +#endif + "ADD sp, sp, #0x8\n\t" +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [r] "+r" (r), + [L_kyber_thumb2_invntt_zetas_inv] "+r" (L_kyber_thumb2_invntt_zetas_inv_c) + : + : 
"memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" +#else + : [r] "+r" (r) + : [L_kyber_thumb2_invntt_zetas_inv] "r" (L_kyber_thumb2_invntt_zetas_inv) + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +XALIGNED(16) static const uint16_t L_kyber_thumb2_basemul_mont_zetas[] = { + 0x08ed, 0x0a0b, 0x0b9a, 0x0714, 0x05d5, 0x058e, 0x011f, 0x00ca, + 0x0c56, 0x026e, 0x0629, 0x00b6, 0x03c2, 0x084f, 0x073f, 0x05bc, + 0x023d, 0x07d4, 0x0108, 0x017f, 0x09c4, 0x05b2, 0x06bf, 0x0c7f, + 0x0a58, 0x03f9, 0x02dc, 0x0260, 0x06fb, 0x019b, 0x0c34, 0x06de, + 0x04c7, 0x028c, 0x0ad9, 0x03f7, 0x07f4, 0x05d3, 0x0be7, 0x06f9, + 0x0204, 0x0cf9, 0x0bc1, 0x0a67, 0x06af, 0x0877, 0x007e, 0x05bd, + 0x09ac, 0x0ca7, 0x0bf2, 0x033e, 0x006b, 0x0774, 0x0c0a, 0x094a, + 0x0b73, 0x03c1, 0x071d, 0x0a2c, 0x01c0, 0x08d8, 0x02a5, 0x0806, + 0x08b2, 0x01ae, 0x022b, 0x034b, 0x081e, 0x0367, 0x060e, 0x0069, + 0x01a6, 0x024b, 0x00b1, 0x0c16, 0x0bde, 0x0b35, 0x0626, 0x0675, + 0x0c0b, 0x030a, 0x0487, 0x0c6e, 0x09f8, 0x05cb, 0x0aa7, 0x045f, + 0x06cb, 0x0284, 0x0999, 0x015d, 0x01a2, 0x0149, 0x0c65, 0x0cb6, + 0x0331, 0x0449, 0x025b, 0x0262, 0x052a, 0x07fc, 0x0748, 0x0180, + 0x0842, 0x0c79, 0x04c2, 0x07ca, 0x0997, 0x00dc, 0x085e, 0x0686, + 0x0860, 0x0707, 0x0803, 0x031a, 0x071b, 0x09ab, 0x099b, 0x01de, + 0x0c95, 0x0bcd, 0x03e4, 0x03df, 0x03be, 0x074d, 0x05f2, 0x065c, +}; + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void kyber_thumb2_basemul_mont(sword16* r_p, const sword16* a_p, const sword16* b_p) +#else +void kyber_thumb2_basemul_mont(sword16* r, const sword16* a, const sword16* b) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register sword16* r __asm__ ("r0") = (sword16*)r_p; + register const sword16* a __asm__ ("r1") = (const sword16*)a_p; + register const sword16* b __asm__ ("r2") = (const sword16*)b_p; + register uint16_t* L_kyber_thumb2_basemul_mont_zetas_c __asm__ ("r3") = (uint16_t*)&L_kyber_thumb2_basemul_mont_zetas; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "MOV r3, %[L_kyber_thumb2_basemul_mont_zetas]\n\t" + "ADD r3, r3, #0x80\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "MOV r12, #0xd01\n\t" + "MOVT r12, #0xcff\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "MOV r8, #0x0\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_basemul_mont_loop:\n\t" +#else + "L_kyber_thumb2_basemul_mont_loop_%=:\n\t" +#endif + "LDM %[a]!, {r4, r5}\n\t" + "LDM %[b]!, {r6, r7}\n\t" + "LDR lr, [r3, r8]\n\t" + "ADD r8, r8, #0x2\n\t" + "PUSH {r8}\n\t" + "CMP r8, #0x80\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTT r8, r4, r6\n\t" + "SMULTT r10, r5, r7\n\t" + "SMULTB r9, r12, r8\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r8, r12, r9, r8\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "RSB r11, lr, #0x0\n\t" + "SMULBT r8, lr, r8\n\t" + "SMULBT r10, r11, r10\n\t" + "SMLABB r8, r4, r6, r8\n\t" + "SMLABB r10, r5, r7, r10\n\t" + "SMULTB r9, r12, r8\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r8, r12, r9, r8\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULBT r9, r4, r6\n\t" + "SMULBT r11, r5, r7\n\t" + "SMLATB r9, r4, r6, r9\n\t" + "SMLATB r11, r5, r7, r11\n\t" + "SMULTB r6, r12, r9\n\t" + "SMULTB r7, r12, r11\n\t" + "SMLABB r9, r12, r6, r9\n\t" + "SMLABB r11, r12, r7, r11\n\t" + "PKHTB r4, r9, r8, ASR #16\n\t" + "PKHTB r5, r11, r10, ASR #16\n\t" +#else + "ASR r8, r4, #16\n\t" + "ASR r10, r5, #16\n\t" + "ASR r9, r6, #16\n\t" + "ASR r11, r7, #16\n\t" + "MUL r8, r8, 
r9\n\t" + "MUL r10, r10, r11\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r9, r8, #0, #16\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r9, r12, r8\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r9, r9, #0, #16\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r8, r12, r9, r8\n\t" + "MLA r10, r12, r11, r10\n\t" + "RSB r11, lr, #0x0\n\t" + "SBFX r9, lr, #0, #16\n\t" + "SBFX r11, r11, #0, #16\n\t" + "ASR r8, r8, #16\n\t" + "ASR r10, r10, #16\n\t" + "MUL r8, r9, r8\n\t" + "MUL r10, r11, r10\n\t" + "SBFX r9, r4, #0, #16\n\t" + "SBFX r11, r5, #0, #16\n\t" + "SBFX r12, r6, #0, #16\n\t" + "MLA r8, r9, r12, r8\n\t" + "SBFX r12, r7, #0, #16\n\t" + "MLA r10, r11, r12, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r9, r8, #0, #16\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r9, r12, r9\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r9, r9, #0, #16\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r8, r12, r9, r8\n\t" + "MLA r10, r12, r11, r10\n\t" + "SBFX r9, r4, #0, #16\n\t" + "SBFX r11, r5, #0, #16\n\t" + "ASR r12, r6, #16\n\t" + "MUL r9, r9, r12\n\t" + "ASR r12, r7, #16\n\t" + "MUL r11, r11, r12\n\t" + "ASR r4, r4, #16\n\t" + "ASR r5, r5, #16\n\t" + "SBFX r12, r6, #0, #16\n\t" + "MLA r9, r4, r12, r9\n\t" + "SBFX r12, r7, #0, #16\n\t" + "MLA r11, r5, r12, r11\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r6, r9, #0, #16\n\t" + "SBFX r7, r11, #0, #16\n\t" + "MUL r6, r12, r6\n\t" + "MUL r7, r12, r7\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r4, r6, #0, #16\n\t" + "SBFX r5, r7, #0, #16\n\t" + "MLA r9, r12, r4, r9\n\t" + "MLA r11, r12, r5, r11\n\t" + "BFC r9, #0, #16\n\t" + "BFC r11, #0, #16\n\t" + "ORR r4, r9, r8, LSR #16\n\t" + "ORR r5, r11, r10, LSR #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "STM %[r]!, {r4, r5}\n\t" + "POP {r8}\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_basemul_mont_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_basemul_mont_loop\n\t" +#else + "BNE.N L_kyber_thumb2_basemul_mont_loop_%=\n\t" +#endif +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), + [L_kyber_thumb2_basemul_mont_zetas] "+r" (L_kyber_thumb2_basemul_mont_zetas_c) + : + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" +#else + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : [L_kyber_thumb2_basemul_mont_zetas] "r" (L_kyber_thumb2_basemul_mont_zetas) + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void kyber_thumb2_basemul_mont_add(sword16* r_p, const sword16* a_p, const sword16* b_p) +#else +void kyber_thumb2_basemul_mont_add(sword16* r, const sword16* a, const sword16* b) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register sword16* r __asm__ ("r0") = (sword16*)r_p; + register const sword16* a __asm__ ("r1") = (const sword16*)a_p; + register const sword16* b __asm__ ("r2") = (const sword16*)b_p; + register uint16_t* L_kyber_thumb2_basemul_mont_zetas_c __asm__ ("r3") = (uint16_t*)&L_kyber_thumb2_basemul_mont_zetas; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "MOV r3, %[L_kyber_thumb2_basemul_mont_zetas]\n\t" + "ADD r3, r3, #0x80\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "MOV r12, #0xd01\n\t" + "MOVT r12, #0xcff\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "MOV r8, #0x0\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_basemul_mont_add_loop:\n\t" +#else + "L_kyber_thumb2_basemul_mont_add_loop_%=:\n\t" +#endif + 
"LDM %[a]!, {r4, r5}\n\t" + "LDM %[b]!, {r6, r7}\n\t" + "LDR lr, [r3, r8]\n\t" + "ADD r8, r8, #0x2\n\t" + "PUSH {r8}\n\t" + "CMP r8, #0x80\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SMULTT r8, r4, r6\n\t" + "SMULTT r10, r5, r7\n\t" + "SMULTB r9, r12, r8\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r8, r12, r9, r8\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "RSB r11, lr, #0x0\n\t" + "SMULBT r8, lr, r8\n\t" + "SMULBT r10, r11, r10\n\t" + "SMLABB r8, r4, r6, r8\n\t" + "SMLABB r10, r5, r7, r10\n\t" + "SMULTB r9, r12, r8\n\t" + "SMULTB r11, r12, r10\n\t" + "SMLABB r8, r12, r9, r8\n\t" + "SMLABB r10, r12, r11, r10\n\t" + "SMULBT r9, r4, r6\n\t" + "SMULBT r11, r5, r7\n\t" + "SMLATB r9, r4, r6, r9\n\t" + "SMLATB r11, r5, r7, r11\n\t" + "SMULTB r6, r12, r9\n\t" + "SMULTB r7, r12, r11\n\t" + "SMLABB r9, r12, r6, r9\n\t" + "SMLABB r11, r12, r7, r11\n\t" + "LDM %[r], {r4, r5}\n\t" + "PKHTB r9, r9, r8, ASR #16\n\t" + "PKHTB r11, r11, r10, ASR #16\n\t" + "SADD16 r4, r4, r9\n\t" + "SADD16 r5, r5, r11\n\t" +#else + "ASR r8, r4, #16\n\t" + "ASR r10, r5, #16\n\t" + "ASR r9, r6, #16\n\t" + "ASR r11, r7, #16\n\t" + "MUL r8, r8, r9\n\t" + "MUL r10, r10, r11\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r9, r8, #0, #16\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r9, r12, r8\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r9, r9, #0, #16\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r8, r12, r9, r8\n\t" + "MLA r10, r12, r11, r10\n\t" + "RSB r11, lr, #0x0\n\t" + "SBFX r9, lr, #0, #16\n\t" + "SBFX r11, r11, #0, #16\n\t" + "ASR r8, r8, #16\n\t" + "ASR r10, r10, #16\n\t" + "MUL r8, r9, r8\n\t" + "MUL r10, r11, r10\n\t" + "SBFX r9, r4, #0, #16\n\t" + "SBFX r11, r5, #0, #16\n\t" + "SBFX r12, r6, #0, #16\n\t" + "MLA r8, r9, r12, r8\n\t" + "SBFX r12, r7, #0, #16\n\t" + "MLA r10, r11, r12, r10\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r9, r8, #0, #16\n\t" + "SBFX r11, r10, #0, #16\n\t" + "MUL r9, r12, r9\n\t" + "MUL r11, r12, r11\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r9, r9, #0, #16\n\t" + "SBFX r11, r11, #0, #16\n\t" + "MLA r8, r12, r9, r8\n\t" + "MLA r10, r12, r11, r10\n\t" + "SBFX r9, r4, #0, #16\n\t" + "SBFX r11, r5, #0, #16\n\t" + "ASR r12, r6, #16\n\t" + "MUL r9, r9, r12\n\t" + "ASR r12, r7, #16\n\t" + "MUL r11, r11, r12\n\t" + "ASR r4, r4, #16\n\t" + "ASR r5, r5, #16\n\t" + "SBFX r12, r6, #0, #16\n\t" + "MLA r9, r4, r12, r9\n\t" + "SBFX r12, r7, #0, #16\n\t" + "MLA r11, r5, r12, r11\n\t" + "MOV r12, #0xcff\n\t" + "SBFX r6, r9, #0, #16\n\t" + "SBFX r7, r11, #0, #16\n\t" + "MUL r6, r12, r6\n\t" + "MUL r7, r12, r7\n\t" + "MOV r12, #0xd01\n\t" + "SBFX r4, r6, #0, #16\n\t" + "SBFX r5, r7, #0, #16\n\t" + "MLA r9, r12, r4, r9\n\t" + "MLA r11, r12, r5, r11\n\t" + "LDM %[r], {r4, r5}\n\t" + "BFC r9, #0, #16\n\t" + "BFC r11, #0, #16\n\t" + "ORR r9, r9, r8, LSR #16\n\t" + "ORR r11, r11, r10, LSR #16\n\t" + "ADD r8, r4, r9\n\t" + "ADD r10, r5, r11\n\t" + "BFC r9, #0, #16\n\t" + "BFC r11, #0, #16\n\t" + "ADD r4, r4, r9\n\t" + "ADD r5, r5, r11\n\t" + "BFI r4, r8, #0, #16\n\t" + "BFI r5, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "STM %[r]!, {r4, r5}\n\t" + "POP {r8}\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_basemul_mont_add_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_basemul_mont_add_loop\n\t" +#else + "BNE.N L_kyber_thumb2_basemul_mont_add_loop_%=\n\t" +#endif +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b), + [L_kyber_thumb2_basemul_mont_zetas] "+r" (L_kyber_thumb2_basemul_mont_zetas_c) + : + : "memory", "r4", "r5", "r6", "r7", "r8", 
"r9", "r10", "r11", "r12", "lr", "cc" +#else + : [r] "+r" (r), [a] "+r" (a), [b] "+r" (b) + : [L_kyber_thumb2_basemul_mont_zetas] "r" (L_kyber_thumb2_basemul_mont_zetas) + : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +void kyber_thumb2_csubq(sword16* p_p) +#else +void kyber_thumb2_csubq(sword16* p) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register sword16* p __asm__ ("r0") = (sword16*)p_p; + register uint16_t* L_kyber_thumb2_basemul_mont_zetas_c __asm__ ("r1") = (uint16_t*)&L_kyber_thumb2_basemul_mont_zetas; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "MOV r11, #0xd01\n\t" + "MOV r12, #0xd01\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "MOVT r12, #0xd01\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "MOV lr, #0x8000\n\t" + "MOVT lr, #0x8000\n\t" + "MOV r1, #0x100\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_csubq_loop:\n\t" +#else + "L_kyber_thumb2_csubq_loop_%=:\n\t" +#endif + "LDM %[p], {r2, r3, r4, r5}\n\t" +#ifndef WOLFSSL_ARM_ARCH_7M + "SSUB16 r2, r2, r12\n\t" + "SSUB16 r3, r3, r12\n\t" + "SSUB16 r4, r4, r12\n\t" + "SSUB16 r5, r5, r12\n\t" + "AND r6, r2, lr\n\t" + "AND r7, r3, lr\n\t" + "AND r8, r4, lr\n\t" + "AND r9, r5, lr\n\t" + "LSR r6, r6, #15\n\t" + "LSR r7, r7, #15\n\t" + "LSR r8, r8, #15\n\t" + "LSR r9, r9, #15\n\t" + "MUL r6, r6, r11\n\t" + "MUL r7, r7, r11\n\t" + "MUL r8, r8, r11\n\t" + "MUL r9, r9, r11\n\t" + "SADD16 r2, r2, r6\n\t" + "SADD16 r3, r3, r7\n\t" + "SADD16 r4, r4, r8\n\t" + "SADD16 r5, r5, r9\n\t" +#else + "SUB r6, r2, r12\n\t" + "SUB r2, r2, r12, LSL #16\n\t" + "BFI r2, r6, #0, #16\n\t" + "SUB r7, r3, r12\n\t" + "SUB r3, r3, r12, LSL #16\n\t" + "BFI r3, r7, #0, #16\n\t" + "SUB r8, r4, r12\n\t" + "SUB r4, r4, r12, LSL #16\n\t" + "BFI r4, r8, #0, #16\n\t" + "SUB r9, r5, r12\n\t" + "SUB r5, r5, r12, LSL #16\n\t" + "BFI r5, r9, #0, #16\n\t" + "AND r6, r2, lr\n\t" + "AND r7, r3, lr\n\t" + "AND r8, r4, lr\n\t" + "AND r9, r5, lr\n\t" + "LSR r6, r6, #15\n\t" + "LSR r7, r7, #15\n\t" + "LSR r8, r8, #15\n\t" + "LSR r9, r9, #15\n\t" + "MUL r6, r6, r11\n\t" + "MUL r7, r7, r11\n\t" + "MUL r8, r8, r11\n\t" + "MUL r9, r9, r11\n\t" + "ADD r10, r2, r6\n\t" + "BFC r6, #0, #16\n\t" + "ADD r2, r2, r6\n\t" + "BFI r2, r10, #0, #16\n\t" + "ADD r10, r3, r7\n\t" + "BFC r7, #0, #16\n\t" + "ADD r3, r3, r7\n\t" + "BFI r3, r10, #0, #16\n\t" + "ADD r10, r4, r8\n\t" + "BFC r8, #0, #16\n\t" + "ADD r4, r4, r8\n\t" + "BFI r4, r10, #0, #16\n\t" + "ADD r10, r5, r9\n\t" + "BFC r9, #0, #16\n\t" + "ADD r5, r5, r9\n\t" + "BFI r5, r10, #0, #16\n\t" +#endif /* !WOLFSSL_ARM_ARCH_7M */ + "STM %[p]!, {r2, r3, r4, r5}\n\t" + "SUBS r1, r1, #0x8\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_csubq_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_csubq_loop\n\t" +#else + "BNE.N L_kyber_thumb2_csubq_loop_%=\n\t" +#endif +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + : [p] "+r" (p), + [L_kyber_thumb2_basemul_mont_zetas] "+r" (L_kyber_thumb2_basemul_mont_zetas_c) + : + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" +#else + : [p] "+r" (p) + : [L_kyber_thumb2_basemul_mont_zetas] "r" (L_kyber_thumb2_basemul_mont_zetas) + : "memory", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "lr", "cc" +#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */ + ); +} + +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG +unsigned int 
kyber_thumb2_rej_uniform(sword16* p_p, unsigned int len_p, const byte* r_p, unsigned int rLen_p) +#else +unsigned int kyber_thumb2_rej_uniform(sword16* p, unsigned int len, const byte* r, unsigned int rLen) +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ +{ +#ifndef WOLFSSL_NO_VAR_ASSIGN_REG + register sword16* p __asm__ ("r0") = (sword16*)p_p; + register unsigned int len __asm__ ("r1") = (unsigned int)len_p; + register const byte* r __asm__ ("r2") = (const byte*)r_p; + register unsigned int rLen __asm__ ("r3") = (unsigned int)rLen_p; + register uint16_t* L_kyber_thumb2_basemul_mont_zetas_c __asm__ ("r4") = (uint16_t*)&L_kyber_thumb2_basemul_mont_zetas; +#endif /* !WOLFSSL_NO_VAR_ASSIGN_REG */ + + __asm__ __volatile__ ( + "MOV r8, #0xd01\n\t" + "MOV r9, #0x0\n\t" + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_loop_no_fail:\n\t" +#else + "L_kyber_thumb2_rej_uniform_loop_no_fail_%=:\n\t" +#endif + "CMP %[len], #0x8\n\t" +#if defined(__GNUC__) + "BLT L_kyber_thumb2_rej_uniform_done_no_fail_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BLT.N L_kyber_thumb2_rej_uniform_done_no_fail\n\t" +#else + "BLT.N L_kyber_thumb2_rej_uniform_done_no_fail_%=\n\t" +#endif + "LDM %[r]!, {r4, r5, r6}\n\t" + "UBFX r7, r4, #0, #12\n\t" + "STRH r7, [%[p], r9]\n\t" + "SUB r10, r7, r8\n\t" + "LSR r10, r10, #31\n\t" + "SUB %[len], %[len], r10\n\t" + "ADD r9, r9, r10, LSL #1\n\t" + "UBFX r7, r4, #12, #12\n\t" + "STRH r7, [%[p], r9]\n\t" + "SUB r10, r7, r8\n\t" + "LSR r10, r10, #31\n\t" + "SUB %[len], %[len], r10\n\t" + "ADD r9, r9, r10, LSL #1\n\t" + "UBFX r7, r4, #24, #8\n\t" + "BFI r7, r5, #8, #4\n\t" + "STRH r7, [%[p], r9]\n\t" + "SUB r10, r7, r8\n\t" + "LSR r10, r10, #31\n\t" + "SUB %[len], %[len], r10\n\t" + "ADD r9, r9, r10, LSL #1\n\t" + "UBFX r7, r5, #4, #12\n\t" + "STRH r7, [%[p], r9]\n\t" + "SUB r10, r7, r8\n\t" + "LSR r10, r10, #31\n\t" + "SUB %[len], %[len], r10\n\t" + "ADD r9, r9, r10, LSL #1\n\t" + "UBFX r7, r5, #16, #12\n\t" + "STRH r7, [%[p], r9]\n\t" + "SUB r10, r7, r8\n\t" + "LSR r10, r10, #31\n\t" + "SUB %[len], %[len], r10\n\t" + "ADD r9, r9, r10, LSL #1\n\t" + "UBFX r7, r5, #28, #4\n\t" + "BFI r7, r6, #4, #8\n\t" + "STRH r7, [%[p], r9]\n\t" + "SUB r10, r7, r8\n\t" + "LSR r10, r10, #31\n\t" + "SUB %[len], %[len], r10\n\t" + "ADD r9, r9, r10, LSL #1\n\t" + "UBFX r7, r6, #8, #12\n\t" + "STRH r7, [%[p], r9]\n\t" + "SUB r10, r7, r8\n\t" + "LSR r10, r10, #31\n\t" + "SUB %[len], %[len], r10\n\t" + "ADD r9, r9, r10, LSL #1\n\t" + "UBFX r7, r6, #20, #12\n\t" + "STRH r7, [%[p], r9]\n\t" + "SUB r10, r7, r8\n\t" + "LSR r10, r10, #31\n\t" + "SUB %[len], %[len], r10\n\t" + "ADD r9, r9, r10, LSL #1\n\t" + "SUBS %[rLen], %[rLen], #0xc\n\t" +#if defined(__GNUC__) + "BNE L_kyber_thumb2_rej_uniform_loop_no_fail_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BNE.N L_kyber_thumb2_rej_uniform_loop_no_fail\n\t" +#else + "BNE.N L_kyber_thumb2_rej_uniform_loop_no_fail_%=\n\t" +#endif +#if defined(__GNUC__) + "B L_kyber_thumb2_rej_uniform_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "B.N L_kyber_thumb2_rej_uniform_done\n\t" +#else + "B.N L_kyber_thumb2_rej_uniform_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_done_no_fail:\n\t" +#else + "L_kyber_thumb2_rej_uniform_done_no_fail_%=:\n\t" +#endif + "CMP %[len], #0x0\n\t" +#if defined(__GNUC__) + "BEQ L_kyber_thumb2_rej_uniform_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 
9000000) + "BEQ.N L_kyber_thumb2_rej_uniform_done\n\t" +#else + "BEQ.N L_kyber_thumb2_rej_uniform_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_loop:\n\t" +#else + "L_kyber_thumb2_rej_uniform_loop_%=:\n\t" +#endif + "LDM %[r]!, {r4, r5, r6}\n\t" + "UBFX r7, r4, #0, #12\n\t" + "CMP r7, r8\n\t" +#if defined(__GNUC__) + "BGE L_kyber_thumb2_rej_uniform_fail_0_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BGE.N L_kyber_thumb2_rej_uniform_fail_0\n\t" +#else + "BGE.N L_kyber_thumb2_rej_uniform_fail_0_%=\n\t" +#endif + "STRH r7, [%[p], r9]\n\t" + "SUBS %[len], %[len], #0x1\n\t" + "ADD r9, r9, #0x2\n\t" +#if defined(__GNUC__) + "BEQ L_kyber_thumb2_rej_uniform_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_kyber_thumb2_rej_uniform_done\n\t" +#else + "BEQ.N L_kyber_thumb2_rej_uniform_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_fail_0:\n\t" +#else + "L_kyber_thumb2_rej_uniform_fail_0_%=:\n\t" +#endif + "UBFX r7, r4, #12, #12\n\t" + "CMP r7, r8\n\t" +#if defined(__GNUC__) + "BGE L_kyber_thumb2_rej_uniform_fail_1_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BGE.N L_kyber_thumb2_rej_uniform_fail_1\n\t" +#else + "BGE.N L_kyber_thumb2_rej_uniform_fail_1_%=\n\t" +#endif + "STRH r7, [%[p], r9]\n\t" + "SUBS %[len], %[len], #0x1\n\t" + "ADD r9, r9, #0x2\n\t" +#if defined(__GNUC__) + "BEQ L_kyber_thumb2_rej_uniform_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_kyber_thumb2_rej_uniform_done\n\t" +#else + "BEQ.N L_kyber_thumb2_rej_uniform_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_fail_1:\n\t" +#else + "L_kyber_thumb2_rej_uniform_fail_1_%=:\n\t" +#endif + "UBFX r7, r4, #24, #8\n\t" + "BFI r7, r5, #8, #4\n\t" + "CMP r7, r8\n\t" +#if defined(__GNUC__) + "BGE L_kyber_thumb2_rej_uniform_fail_2_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BGE.N L_kyber_thumb2_rej_uniform_fail_2\n\t" +#else + "BGE.N L_kyber_thumb2_rej_uniform_fail_2_%=\n\t" +#endif + "STRH r7, [%[p], r9]\n\t" + "SUBS %[len], %[len], #0x1\n\t" + "ADD r9, r9, #0x2\n\t" +#if defined(__GNUC__) + "BEQ L_kyber_thumb2_rej_uniform_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_kyber_thumb2_rej_uniform_done\n\t" +#else + "BEQ.N L_kyber_thumb2_rej_uniform_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_fail_2:\n\t" +#else + "L_kyber_thumb2_rej_uniform_fail_2_%=:\n\t" +#endif + "UBFX r7, r5, #4, #12\n\t" + "CMP r7, r8\n\t" +#if defined(__GNUC__) + "BGE L_kyber_thumb2_rej_uniform_fail_3_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BGE.N L_kyber_thumb2_rej_uniform_fail_3\n\t" +#else + "BGE.N L_kyber_thumb2_rej_uniform_fail_3_%=\n\t" +#endif + "STRH r7, [%[p], r9]\n\t" + "SUBS %[len], %[len], #0x1\n\t" + "ADD r9, r9, #0x2\n\t" +#if defined(__GNUC__) + "BEQ L_kyber_thumb2_rej_uniform_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_kyber_thumb2_rej_uniform_done\n\t" +#else + "BEQ.N L_kyber_thumb2_rej_uniform_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_fail_3:\n\t" +#else + "L_kyber_thumb2_rej_uniform_fail_3_%=:\n\t" +#endif + "UBFX r7, r5, #16, #12\n\t" + "CMP r7, r8\n\t" +#if 
defined(__GNUC__) + "BGE L_kyber_thumb2_rej_uniform_fail_4_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BGE.N L_kyber_thumb2_rej_uniform_fail_4\n\t" +#else + "BGE.N L_kyber_thumb2_rej_uniform_fail_4_%=\n\t" +#endif + "STRH r7, [%[p], r9]\n\t" + "SUBS %[len], %[len], #0x1\n\t" + "ADD r9, r9, #0x2\n\t" +#if defined(__GNUC__) + "BEQ L_kyber_thumb2_rej_uniform_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_kyber_thumb2_rej_uniform_done\n\t" +#else + "BEQ.N L_kyber_thumb2_rej_uniform_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_fail_4:\n\t" +#else + "L_kyber_thumb2_rej_uniform_fail_4_%=:\n\t" +#endif + "UBFX r7, r5, #28, #4\n\t" + "BFI r7, r6, #4, #8\n\t" + "CMP r7, r8\n\t" +#if defined(__GNUC__) + "BGE L_kyber_thumb2_rej_uniform_fail_5_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BGE.N L_kyber_thumb2_rej_uniform_fail_5\n\t" +#else + "BGE.N L_kyber_thumb2_rej_uniform_fail_5_%=\n\t" +#endif + "STRH r7, [%[p], r9]\n\t" + "SUBS %[len], %[len], #0x1\n\t" + "ADD r9, r9, #0x2\n\t" +#if defined(__GNUC__) + "BEQ L_kyber_thumb2_rej_uniform_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_kyber_thumb2_rej_uniform_done\n\t" +#else + "BEQ.N L_kyber_thumb2_rej_uniform_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_fail_5:\n\t" +#else + "L_kyber_thumb2_rej_uniform_fail_5_%=:\n\t" +#endif + "UBFX r7, r6, #8, #12\n\t" + "CMP r7, r8\n\t" +#if defined(__GNUC__) + "BGE L_kyber_thumb2_rej_uniform_fail_6_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BGE.N L_kyber_thumb2_rej_uniform_fail_6\n\t" +#else + "BGE.N L_kyber_thumb2_rej_uniform_fail_6_%=\n\t" +#endif + "STRH r7, [%[p], r9]\n\t" + "SUBS %[len], %[len], #0x1\n\t" + "ADD r9, r9, #0x2\n\t" +#if defined(__GNUC__) + "BEQ L_kyber_thumb2_rej_uniform_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_kyber_thumb2_rej_uniform_done\n\t" +#else + "BEQ.N L_kyber_thumb2_rej_uniform_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_fail_6:\n\t" +#else + "L_kyber_thumb2_rej_uniform_fail_6_%=:\n\t" +#endif + "UBFX r7, r6, #20, #12\n\t" + "CMP r7, r8\n\t" +#if defined(__GNUC__) + "BGE L_kyber_thumb2_rej_uniform_fail_7_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BGE.N L_kyber_thumb2_rej_uniform_fail_7\n\t" +#else + "BGE.N L_kyber_thumb2_rej_uniform_fail_7_%=\n\t" +#endif + "STRH r7, [%[p], r9]\n\t" + "SUBS %[len], %[len], #0x1\n\t" + "ADD r9, r9, #0x2\n\t" +#if defined(__GNUC__) + "BEQ L_kyber_thumb2_rej_uniform_done_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BEQ.N L_kyber_thumb2_rej_uniform_done\n\t" +#else + "BEQ.N L_kyber_thumb2_rej_uniform_done_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_fail_7:\n\t" +#else + "L_kyber_thumb2_rej_uniform_fail_7_%=:\n\t" +#endif + "SUBS %[rLen], %[rLen], #0xc\n\t" +#if defined(__GNUC__) + "BGT L_kyber_thumb2_rej_uniform_loop_%=\n\t" +#elif defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "BGT.N L_kyber_thumb2_rej_uniform_loop\n\t" +#else + "BGT.N L_kyber_thumb2_rej_uniform_loop_%=\n\t" +#endif + "\n" +#if defined(__IAR_SYSTEMS_ICC__) && (__VER__ < 9000000) + "L_kyber_thumb2_rej_uniform_done:\n\t" +#else + "L_kyber_thumb2_rej_uniform_done_%=:\n\t" 
+#endif
+    "LSR r0, r9, #1\n\t"
+#ifndef WOLFSSL_NO_VAR_ASSIGN_REG
+        : [p] "+r" (p), [len] "+r" (len), [r] "+r" (r), [rLen] "+r" (rLen),
+          [L_kyber_thumb2_basemul_mont_zetas] "+r" (L_kyber_thumb2_basemul_mont_zetas_c)
+        :
+        : "memory", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+#else
+        : [p] "+r" (p), [len] "+r" (len), [r] "+r" (r), [rLen] "+r" (rLen)
+        : [L_kyber_thumb2_basemul_mont_zetas] "r" (L_kyber_thumb2_basemul_mont_zetas)
+        : "memory", "r5", "r6", "r7", "r8", "r9", "r10", "cc"
+#endif /* WOLFSSL_NO_VAR_ASSIGN_REG */
+    );
+    return (uint32_t)(size_t)p;
+}
+
+#endif /* WOLFSSL_WC_KYBER */
+#endif /* !__aarch64__ && __thumb__ */
+#endif /* WOLFSSL_ARMASM */
+#endif /* WOLFSSL_ARMASM_INLINE */
diff --git a/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S b/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S
index b727e8164..c1aec82bf 100644
--- a/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S
+++ b/wolfcrypt/src/port/arm/thumb2-poly1305-asm.S
@@ -67,17 +67,17 @@ L_poly1305_thumb2_16_loop:
 	ADCS	r7, r7, r10
 	ADD	r1, r1, #0x10
 	ADC	r8, r8, r11
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
 	STM	lr, {r4, r5, r6, r7, r8}
 #else
 	/* h[0]-h[2] in r4-r6 for multiplication. */
 	STR	r7, [lr, #12]
 	STR	r8, [lr, #16]
-#endif /* WOLFSSL_SP_NO_UMAAL */
+#endif /* WOLFSSL_ARM_ARCH_7M */
 	STR	r1, [sp, #16]
 	LDR	r1, [sp, #12]
 	/* Multiply h by r */
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
 	/* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */
 	LDR	r3, [r1]
 	EOR	r0, r0, r0
@@ -218,7 +218,7 @@ L_poly1305_thumb2_16_loop:
 	UMAAL	r11, r12, r3, r5
 	/* DONE */
 	LDM	sp, {r4, r5, r6}
-#endif /* WOLFSSL_SP_NO_UMAAL */
+#endif /* WOLFSSL_ARM_ARCH_7M */
 	/* r12 will be zero because r is masked. */
 	/* Load length */
 	LDR	r2, [sp, #20]
diff --git a/wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c b/wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c
index 437141ab0..acf82c4a8 100644
--- a/wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c
+++ b/wolfcrypt/src/port/arm/thumb2-poly1305-asm_c.c
@@ -93,17 +93,17 @@ void poly1305_blocks_thumb2_16(Poly1305* ctx, const byte* m, word32 len, int not
         "ADCS	r7, r7, r10\n\t"
         "ADD	%[m], %[m], #0x10\n\t"
         "ADC	r8, r8, r11\n\t"
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
         "STM	lr, {r4, r5, r6, r7, r8}\n\t"
 #else
         /* h[0]-h[2] in r4-r6 for multiplication. */
         "STR	r7, [lr, #12]\n\t"
         "STR	r8, [lr, #16]\n\t"
-#endif /* WOLFSSL_SP_NO_UMAAL */
+#endif /* WOLFSSL_ARM_ARCH_7M */
         "STR	%[m], [sp, #16]\n\t"
         "LDR	%[m], [sp, #12]\n\t"
         /* Multiply h by r */
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
         /* r0 = #0, r1 = r, lr = h, r2 = h[j], r3 = r[i] */
         "LDR	%[notLast], [%[m]]\n\t"
         "EOR	%[ctx], %[ctx], %[ctx]\n\t"
@@ -244,7 +244,7 @@ void poly1305_blocks_thumb2_16(Poly1305* ctx, const byte* m, word32 len, int not
         "UMAAL	r11, r12, %[notLast], r5\n\t"
         /* DONE */
         "LDM	sp, {r4, r5, r6}\n\t"
-#endif /* WOLFSSL_SP_NO_UMAAL */
+#endif /* WOLFSSL_ARM_ARCH_7M */
         /* r12 will be zero because r is masked. */
         /* Load length */
         "LDR	%[len], [sp, #20]\n\t"
diff --git a/wolfcrypt/src/sp_cortexm.c b/wolfcrypt/src/sp_cortexm.c
index 343f69d69..25404a718 100644
--- a/wolfcrypt/src/sp_cortexm.c
+++ b/wolfcrypt/src/sp_cortexm.c
@@ -240,7 +240,7 @@ static void sp_2048_to_bin_64(sp_digit* r, byte* a)
 #define sp_2048_norm_64(a)
 
 #ifndef WOLFSSL_SP_SMALL
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
 /* Multiply a and b into r. (r = a * b)
 *
 * r A single precision integer.
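The WOLFSSL_ARM_ARCH_7M paths above exist because plain ARMv7-M cores lack the DSP extension's UMAAL instruction that the ARMv7E-M code relies on. As a rough C model of what a single UMAAL provides (a sketch only; umaal_ref is an illustrative name, not a wolfSSL function):

    #include <stdint.h>

    /* UMAAL rdlo, rdhi, rn, rm computes {rdhi:rdlo} = rn*rm + rdlo + rdhi.
     * The result always fits in 64 bits, since
     * (2^32-1)*(2^32-1) + 2*(2^32-1) == 2^64 - 1. */
    static inline uint64_t umaal_ref(uint32_t rdlo, uint32_t rdhi,
                                     uint32_t rn, uint32_t rm)
    {
        return (uint64_t)rn * rm + rdlo + rdhi;
    }

Because the accumulation can never overflow, UMAAL needs no carry handling; the ARMv7-M fallbacks recreate the same multiply-accumulate with UMULL/ADDS/ADC sequences, which is why the guard gates whole alternative implementations rather than single instructions.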
@@ -736,7 +736,7 @@ SP_NOINLINE static void sp_2048_mul_8(sp_digit* r, const sp_digit* a, const sp_d
     );
 }
 
-#endif /* WOLFSSL_SP_NO_UMAAL */
+#endif /* WOLFSSL_ARM_ARCH_7M */
 /* Add b to a into r. (r = a + b)
 *
 * r A single precision integer.
@@ -1533,7 +1533,7 @@ SP_NOINLINE static void sp_2048_mul_64(sp_digit* r, const sp_digit* a,
     (void)sp_2048_add_32(r + 96, r + 96, a1);
 }
 
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
 /* Square a and put result in r. (r = a * a)
 *
 * r A single precision integer.
@@ -1899,7 +1899,7 @@ SP_NOINLINE static void sp_2048_sqr_8(sp_digit* r, const sp_digit* a)
     );
 }
 
-#endif /* WOLFSSL_SP_NO_UMAAL */
+#endif /* WOLFSSL_ARM_ARCH_7M */
 /* Sub b from a into r. (r = a - b)
 *
 * r A single precision integer.
@@ -31605,7 +31605,7 @@ static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 }
 
 #else
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
 /* Multiply a and b into r. (r = a * b)
 *
 * r A single precision integer.
@@ -32101,7 +32101,7 @@ SP_NOINLINE static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_di
     );
 }
 
-#endif /* WOLFSSL_SP_NO_UMAAL */
+#endif /* WOLFSSL_ARM_ARCH_7M */
 #endif /* WOLFSSL_SP_SMALL */
 #ifdef WOLFSSL_SP_SMALL
 /* Square a and put result in r. (r = a * a)
@@ -32222,7 +32222,7 @@ static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
 }
 
 #else
-#ifdef WOLFSSL_SP_NO_UMAAL
+#ifdef WOLFSSL_ARM_ARCH_7M
 /* Square a and put result in r. (r = a * a)
 *
 * r A single precision integer.
@@ -32588,7 +32588,7 @@ SP_NOINLINE static void sp_256_sqr_8(sp_digit* r, const sp_digit* a)
     );
 }
 
-#endif /* WOLFSSL_SP_NO_UMAAL */
+#endif /* WOLFSSL_ARM_ARCH_7M */
 #endif /* WOLFSSL_SP_SMALL */
 #ifdef WOLFSSL_SP_SMALL
 /* Add b to a into r. (r = a + b)
diff --git a/wolfcrypt/src/wc_kyber_poly.c b/wolfcrypt/src/wc_kyber_poly.c
index 4514ad317..8c8c97dd8 100644
--- a/wolfcrypt/src/wc_kyber_poly.c
+++ b/wolfcrypt/src/wc_kyber_poly.c
@@ -173,8 +173,16 @@ const sword16 zetas_inv[KYBER_N / 2] = {
     3127, 3042, 1907, 1836, 1517,  359,  758, 1441
 };
 
+#define KYBER_BARRETT(a) \
+        "SMULWB r10, r14, " #a "\n\t" \
+        "SMULWT r11, r14, " #a "\n\t" \
+        "SMULBT r10, r12, r10\n\t" \
+        "SMULBT r11, r12, r11\n\t" \
+        "PKHBT r10, r10, r11, LSL #16\n\t" \
+        "SSUB16 " #a ", " #a ", r10\n\t"
 
-#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM))
+
+#if !((defined(__thumb__) || defined(__aarch64__)) && defined(WOLFSSL_ARMASM))
 /* Number-Theoretic Transform.
 *
 * @param [in, out] r Polynomial to transform.
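The KYBER_BARRETT macro added above emits a Barrett reduction on both 16-bit halves of a register in one shot; the Barrett constant and q are staged in r14 and r12 by the functions that use the macro (the register roles are assumed here from the instruction pattern). Per coefficient, the computation corresponds to the reference-style C below (a sketch, not wolfSSL code; 20159 is round(2^26 / q) for q = 3329):

    #include <stdint.h>

    #define KYBER_Q 3329

    /* Barrett reduction of one coefficient: returns a - q*round(a/q),
     * i.e. the centered representative of a mod q. The assembly macro
     * performs this on two 16-bit lanes of a 32-bit register at once. */
    static int16_t kyber_barrett_sketch(int16_t a)
    {
        int16_t t = (int16_t)(((int32_t)20159 * a + (1 << 25)) >> 26);
        return (int16_t)(a - t * KYBER_Q);
    }

Doing two lanes per register is what lets the NTT butterflies above keep pairs of coefficients packed in a single word between reductions.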
@@ -939,15 +947,16 @@ static void kyber_basemul(sword16* r, const sword16* a, const sword16* b,
 */
 static void kyber_basemul_mont(sword16* r, const sword16* a, const sword16* b)
 {
-    unsigned int i;
     const sword16* zeta = zetas + 64;
 
-#ifdef WOLFSSL_KYBER_SMALL
+#if defined(WOLFSSL_KYBER_SMALL)
+    unsigned int i;
     for (i = 0; i < KYBER_N; i += 4, zeta++) {
         kyber_basemul(r + i + 0, a + i + 0, b + i + 0, zeta[0]);
         kyber_basemul(r + i + 2, a + i + 2, b + i + 2, -zeta[0]);
     }
 #elif defined(WOLFSSL_KYBER_NO_LARGE_CODE)
+    unsigned int i;
     for (i = 0; i < KYBER_N; i += 8, zeta += 2) {
         kyber_basemul(r + i + 0, a + i + 0, b + i + 0, zeta[0]);
         kyber_basemul(r + i + 2, a + i + 2, b + i + 2, -zeta[0]);
@@ -955,6 +964,7 @@ static void kyber_basemul_mont(sword16* r, const sword16* a, const sword16* b)
         kyber_basemul(r + i + 6, a + i + 6, b + i + 6, -zeta[1]);
     }
 #else
+    unsigned int i;
     for (i = 0; i < KYBER_N; i += 16, zeta += 4) {
         kyber_basemul(r + i + 0, a + i + 0, b + i + 0, zeta[0]);
         kyber_basemul(r + i + 2, a + i + 2, b + i + 2, -zeta[0]);
@@ -977,10 +987,10 @@ static void kyber_basemul_mont(sword16* r, const sword16* a, const sword16* b)
 static void kyber_basemul_mont_add(sword16* r, const sword16* a,
     const sword16* b)
 {
-    unsigned int i;
     const sword16* zeta = zetas + 64;
 
-#ifdef WOLFSSL_KYBER_SMALL
+#if defined(WOLFSSL_KYBER_SMALL)
+    unsigned int i;
     for (i = 0; i < KYBER_N; i += 4, zeta++) {
         sword16 t0[2];
         sword16 t2[2];
@@ -994,6 +1004,7 @@ static void kyber_basemul_mont_add(sword16* r, const sword16* a,
         r[i + 3] += t2[1];
     }
 #elif defined(WOLFSSL_KYBER_NO_LARGE_CODE)
+    unsigned int i;
     for (i = 0; i < KYBER_N; i += 8, zeta += 2) {
         sword16 t0[2];
         sword16 t2[2];
@@ -1015,6 +1026,7 @@ static void kyber_basemul_mont_add(sword16* r, const sword16* a,
         r[i + 7] += t6[1];
     }
 #else
+    unsigned int i;
     for (i = 0; i < KYBER_N; i += 16, zeta += 4) {
         sword16 t0[2];
         sword16 t2[2];
@@ -2142,7 +2154,7 @@ int kyber_kdf(byte* seed, int seedLen, byte* out, int outLen)
 }
 #endif
 
-#if !(defined(WOLFSSL_ARMASM) && defined(__aarch64__))
+#if !(defined(WOLFSSL_ARMASM) && (defined(__aarch64__) || defined(__thumb__)))
 /* Rejection sampling on uniform random bytes to generate uniform random
 * integers mod q.
 *
@@ -3338,7 +3350,7 @@ int kyber_cmp(const byte* a, const byte* b, int sz)
 
 /******************************************************************************/
 
-#if !(defined(__aarch64__) && defined(WOLFSSL_ARMASM))
+#if !((defined(__thumb__) || defined(__aarch64__)) && defined(WOLFSSL_ARMASM))
 /* Conditional subtraction of q to each coefficient of a polynomial.
 *
@@ -3355,10 +3367,14 @@ static KYBER_NOINLINE void kyber_csubq_c(sword16* p)
     }
 }
 
-#else
+#elif defined(__aarch64__)
 
 #define kyber_csubq_c kyber_csubq_neon
 
+#else
+
+#define kyber_csubq_c kyber_thumb2_csubq
+
 #endif
 
 /******************************************************************************/
diff --git a/wolfssl/wolfcrypt/wc_kyber.h b/wolfssl/wolfcrypt/wc_kyber.h
index 2b8ac8da2..354fc8b01 100644
--- a/wolfssl/wolfcrypt/wc_kyber.h
+++ b/wolfssl/wolfcrypt/wc_kyber.h
@@ -310,6 +310,22 @@ WOLFSSL_LOCAL int kyber_cmp_neon(const byte* a, const byte* b, int sz);
 WOLFSSL_LOCAL void kyber_csubq_neon(sword16* p);
 WOLFSSL_LOCAL void kyber_from_msg_neon(sword16* p, const byte* msg);
 WOLFSSL_LOCAL void kyber_to_msg_neon(byte* msg, sword16* p);
+#elif defined(__thumb__) && defined(WOLFSSL_ARMASM)
+#define kyber_ntt kyber_thumb2_ntt
+#define kyber_invntt kyber_thumb2_invntt
+#define kyber_basemul_mont kyber_thumb2_basemul_mont
+#define kyber_basemul_mont_add kyber_thumb2_basemul_mont_add
+#define kyber_rej_uniform_c kyber_thumb2_rej_uniform
+
+WOLFSSL_LOCAL void kyber_thumb2_ntt(sword16* r);
+WOLFSSL_LOCAL void kyber_thumb2_invntt(sword16* r);
+WOLFSSL_LOCAL void kyber_thumb2_basemul_mont(sword16* r, const sword16* a,
+    const sword16* b);
+WOLFSSL_LOCAL void kyber_thumb2_basemul_mont_add(sword16* r, const sword16* a,
+    const sword16* b);
+WOLFSSL_LOCAL void kyber_thumb2_csubq(sword16* p);
+WOLFSSL_LOCAL unsigned int kyber_thumb2_rej_uniform(sword16* p,
+    unsigned int len, const byte* r, unsigned int rLen);
 #endif
 
 #ifdef __cplusplus
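For reviewers cross-checking the new assembly against C: each loop iteration of kyber_thumb2_basemul_mont multiplies two degree-1 polynomials modulo (X^2 - zeta) using Montgomery-reduced 16-bit arithmetic, which is the standard Kyber base multiplication. A reference-style sketch of the per-pair step (standard Kyber definitions; fqmul/montgomery_reduce are the usual reference-implementation names, not wolfSSL APIs):

    #include <stdint.h>

    #define KYBER_Q    3329
    #define KYBER_QINV -3327  /* q^-1 mod 2^16, as a signed 16-bit value */

    /* Montgomery reduction: for |a| < 2^15 * q, returns t with
     * t == a * 2^-16 (mod q) and |t| < q. The assembly uses the same
     * idea with the constants 0xd01 (q) and 0xcff (-QINV mod 2^16). */
    static int16_t montgomery_reduce(int32_t a)
    {
        int16_t t = (int16_t)a * KYBER_QINV;
        return (int16_t)((a - (int32_t)t * KYBER_Q) >> 16);
    }

    static int16_t fqmul(int16_t a, int16_t b)
    {
        return montgomery_reduce((int32_t)a * b);
    }

    /* (a0 + a1*X) * (b0 + b1*X) mod (X^2 - zeta) */
    static void basemul(int16_t r[2], const int16_t a[2], const int16_t b[2],
                        int16_t zeta)
    {
        r[0] = (int16_t)(fqmul(fqmul(a[1], b[1]), zeta) + fqmul(a[0], b[0]));
        r[1] = (int16_t)(fqmul(a[0], b[1]) + fqmul(a[1], b[0]));
    }

The assembly processes two such pairs per loop iteration (r4/r5 holding four coefficients of a, r6/r7 of b, with zeta and -zeta), and the _add variant loads the destination and accumulates with a halfword-wise addition instead of storing the product directly.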