diff --git a/wolfcrypt/src/port/arm/armv8-aes.c b/wolfcrypt/src/port/arm/armv8-aes.c index ecb0d1605..989e65bd2 100644 --- a/wolfcrypt/src/port/arm/armv8-aes.c +++ b/wolfcrypt/src/port/arm/armv8-aes.c @@ -116,6 +116,7 @@ static const byte rcon[] = { #ifdef HAVE_AESGCM +#if !defined(__aarch64__) || defined(WOLFSSL_AESGCM_STREAM) static WC_INLINE void IncrementGcmCounter(byte* inOutCtr) { int i; @@ -144,6 +145,7 @@ static WC_INLINE void FlattenSzInBits(byte* buf, word32 sz) buf[6] = (sz >> 8) & 0xff; buf[7] = sz & 0xff; } +#endif #endif /* HAVE_AESGCM */ @@ -1467,1114 +1469,5714 @@ int wc_AesCtrSetKey(Aes* aes, const byte* key, word32 len, * Based from GCM implementation in wolfcrypt/src/aes.c */ +/* START script replace AES-GCM Aarch64 with hardware crypto. */ + /* PMULL and RBIT only with AArch64 */ /* Use ARM hardware for polynomial multiply */ void GMULT(byte* X, byte* Y) { __asm__ volatile ( - "LD1 {v0.16b}, [%[inX]] \n" - "LD1 {v1.16b}, [%[inY]] \n" /* v1 already reflected from set key */ + "LD1 {v0.16b}, [%[X]] \n" + "LD1 {v1.16b}, [%[Y]] \n" /* v1 already reflected from set key */ + "MOVI v2.16b, #0x87 \n" "RBIT v0.16b, v0.16b \n" + "USHR v2.2d, v2.2d, #56 \n" + "PMULL v3.1q, v0.1d, v1.1d \n" + "PMULL2 v4.1q, v0.2d, v1.2d \n" + "EXT v5.16b, v1.16b, v1.16b, #8 \n" + "PMULL v6.1q, v0.1d, v5.1d \n" + "PMULL2 v5.1q, v0.2d, v5.2d \n" + "EOR v5.16b, v5.16b, v6.16b \n" + "EXT v6.16b, v3.16b, v4.16b, #8 \n" + "EOR v6.16b, v6.16b, v5.16b \n" + "# Reduce \n" + "PMULL2 v5.1q, v4.2d, v2.2d \n" + "EOR v6.16b, v6.16b, v5.16b \n" + "PMULL2 v5.1q, v6.2d, v2.2d \n" + "MOV v3.D[1], v6.D[0] \n" + "EOR v0.16b, v3.16b, v5.16b \n" - /* Algorithm 1 from Intel GCM white paper. - "Carry-Less Multiplication and Its Usage for Computing the GCM Mode" - */ - "PMULL v3.1q, v0.1d, v1.1d \n" /* a0 * b0 = C */ - "PMULL2 v4.1q, v0.2d, v1.2d \n" /* a1 * b1 = D */ - "EXT v5.16b, v1.16b, v1.16b, #8 \n" /* b0b1 -> b1b0 */ - "PMULL v6.1q, v0.1d, v5.1d \n" /* a0 * b1 = E */ - "PMULL2 v5.1q, v0.2d, v5.2d \n" /* a1 * b0 = F */ - - "#Set a register to all 0s using EOR \n" - "EOR v7.16b, v7.16b, v7.16b \n" - "EOR v5.16b, v5.16b, v6.16b \n" /* F ^ E */ - "EXT v6.16b, v7.16b, v5.16b, #8 \n" /* get (F^E)[0] */ - "EOR v3.16b, v3.16b, v6.16b \n" /* low 128 bits in v3 */ - "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* get (F^E)[1] */ - "EOR v4.16b, v4.16b, v6.16b \n" /* high 128 bits in v4 */ - - - /* Based from White Paper "Implementing GCM on ARMv8" - by Conrado P.L. 
Gouvea and Julio Lopez - reduction on 256bit value using Algorithm 5 */ - "MOVI v8.16b, #0x87 \n" - "USHR v8.2d, v8.2d, #56 \n" - /* v8 is now 0x00000000000000870000000000000087 reflected 0xe1....*/ - "PMULL2 v5.1q, v4.2d, v8.2d \n" - "EXT v6.16b, v5.16b, v7.16b, #8 \n" /* v7 is all 0's */ - "EOR v4.16b, v4.16b, v6.16b \n" - "EXT v6.16b, v7.16b, v5.16b, #8 \n" - "EOR v3.16b, v3.16b, v6.16b \n" - "PMULL v5.1q, v4.1d, v8.1d \n" - "EOR v4.16b, v3.16b, v5.16b \n" - - "RBIT v4.16b, v4.16b \n" - "STR q4, [%[out]] \n" - : [out] "=r" (X), "=r" (Y) - : [inX] "0" (X), [inY] "1" (Y) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8" + "RBIT v0.16b, v0.16b \n" + "STR q0, [%[X]] \n" + : + : [X] "r" (X), [Y] "r" (Y) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } - void GHASH(Gcm* gcm, const byte* a, word32 aSz, const byte* c, word32 cSz, byte* s, word32 sSz) { - byte x[AES_BLOCK_SIZE]; byte scratch[AES_BLOCK_SIZE]; - word32 blocks, partial; - byte* h; - if (gcm == NULL) { - return; - } + __asm__ __volatile__ ( + "LD1 {v3.16b}, %[h] \n" + "MOVI v7.16b, #0x87 \n" + "EOR v0.16b, v0.16b, v0.16b \n" + "USHR v7.2d, v7.2d, #56 \n" - h = gcm->H; - XMEMSET(x, 0, AES_BLOCK_SIZE); + "# AAD \n" + "CBZ %w[aSz], 20f \n" + "MOV w12, %w[aSz] \n" - /* Hash in A, the Additional Authentication Data */ - if (aSz != 0 && a != NULL) { - blocks = aSz / AES_BLOCK_SIZE; - partial = aSz % AES_BLOCK_SIZE; - /* do as many blocks as possible */ - while (blocks--) { - xorbuf(x, a, AES_BLOCK_SIZE); - GMULT(x, h); - a += AES_BLOCK_SIZE; - } - if (partial != 0) { - XMEMSET(scratch, 0, AES_BLOCK_SIZE); - XMEMCPY(scratch, a, partial); - xorbuf(x, scratch, AES_BLOCK_SIZE); - GMULT(x, h); - } - } + "CMP x12, #64 \n" + "BLT 15f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v11.1q, v3.2d, v3.2d \n" + "PMULL v10.1q, v3.1d, v3.1d \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v11.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v4.16b, v10.16b, v11.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v10.1q, v4.1d, v3.1d \n" + "PMULL2 v11.1q, v4.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v4.1d, v12.1d \n" + "PMULL2 v12.1q, v4.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v5.16b, v10.16b, v12.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v11.1q, v4.2d, v4.2d \n" + "PMULL v10.1q, v4.1d, v4.1d \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v11.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v6.16b, v10.16b, v11.16b \n" + "14: \n" + "LD1 {v10.2d-v13.2d}, [%[a]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v10.16b, v10.16b \n" + "RBIT v11.16b, v11.16b \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "EOR v10.16b, v10.16b, v0.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v0.1q, v13.1d, v3.1d \n" + "PMULL2 v1.1q, v13.2d, v3.2d \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v2.1q, v13.1d, v3.1d \n" + "PMULL2 v9.1q, v13.2d, v3.2d \n" + "EOR v2.16b, v2.16b, v9.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v8.1q, v12.1d, v4.1d \n" + "PMULL2 v9.1q, v12.2d, v4.2d \n" + "EOR v0.16b, v0.16b, 
v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v9.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v2.16b, v2.16b, v12.16b, v9.16b \n" +#else + "EOR v12.16b, v12.16b, v9.16b \n" + "EOR v2.16b, v2.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v8.1q, v11.1d, v5.1d \n" + "PMULL2 v9.1q, v11.2d, v5.2d \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v11.16b, v11.16b, v11.16b, #8 \n" + "PMULL v9.1q, v11.1d, v5.1d \n" + "PMULL2 v11.1q, v11.2d, v5.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v2.16b, v2.16b, v11.16b, v9.16b \n" +#else + "EOR v11.16b, v11.16b, v9.16b \n" + "EOR v2.16b, v2.16b, v11.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v8.1q, v10.1d, v6.1d \n" + "PMULL2 v9.1q, v10.2d, v6.2d \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v10.16b, v10.16b, v10.16b, #8 \n" + "PMULL v9.1q, v10.1d, v6.1d \n" + "PMULL2 v10.1q, v10.2d, v6.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v2.16b, v2.16b, v10.16b, v9.16b \n" +#else + "EOR v10.16b, v10.16b, v9.16b \n" + "EOR v2.16b, v2.16b, v10.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v9.16b, v0.16b, v1.16b, #8 \n" + "PMULL2 v8.1q, v1.2d, v7.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v9.16b, v9.16b, v2.16b, v8.16b \n" +#else + "EOR v9.16b, v9.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v9.16b, v9.16b, v8.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v8.1q, v9.2d, v7.2d \n" + "MOV v0.D[1], v9.D[0] \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "CMP x12, #64 \n" + "BGE 14b \n" + "CBZ x12, 20f \n" + "15: \n" + "CMP x12, #16 \n" + "BLT 12f \n" + "11: \n" + "LD1 {v14.2d}, [%[a]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v14.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "PMULL v10.1q, v0.1d, v3.1d \n" + "PMULL2 v11.1q, v0.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v0.1d, v12.1d \n" + "PMULL2 v12.1q, v0.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v0.16b, v10.16b, v12.16b \n" + "CMP x12, #16 \n" + "BGE 11b \n" + "CBZ x12, 120f \n" + "12: \n" + "# Partial AAD \n" + "EOR v14.16b, v14.16b, v14.16b \n" + "MOV x14, x12 \n" + "ST1 {v14.2d}, [%[scratch]] \n" + "13: \n" + "LDRB w13, [%[a]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 13b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v14.2d}, [%[scratch]] \n" + "RBIT v14.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "PMULL v10.1q, v0.1d, v3.1d \n" + "PMULL2 v11.1q, v0.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v0.1d, v12.1d \n" + "PMULL2 v12.1q, v0.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v0.16b, v10.16b, v12.16b \n" - /* Hash in C, the Ciphertext */ - if (cSz != 0 && c != NULL) { - blocks = cSz / AES_BLOCK_SIZE; - partial = cSz % AES_BLOCK_SIZE; - while (blocks--) 
{ - xorbuf(x, c, AES_BLOCK_SIZE); - GMULT(x, h); - c += AES_BLOCK_SIZE; - } - if (partial != 0) { - XMEMSET(scratch, 0, AES_BLOCK_SIZE); - XMEMCPY(scratch, c, partial); - xorbuf(x, scratch, AES_BLOCK_SIZE); - GMULT(x, h); - } - } + "20: \n" + "# Cipher Text \n" + "CBZ %w[cSz], 120f \n" + "MOV w12, %w[cSz] \n" - /* Hash in the lengths of A and C in bits */ - FlattenSzInBits(&scratch[0], aSz); - FlattenSzInBits(&scratch[8], cSz); - xorbuf(x, scratch, AES_BLOCK_SIZE); + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v11.1q, v3.2d, v3.2d \n" + "PMULL v10.1q, v3.1d, v3.1d \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v11.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v4.16b, v10.16b, v11.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v10.1q, v4.1d, v3.1d \n" + "PMULL2 v11.1q, v4.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v4.1d, v12.1d \n" + "PMULL2 v12.1q, v4.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v5.16b, v10.16b, v12.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v11.1q, v4.2d, v4.2d \n" + "PMULL v10.1q, v4.1d, v4.1d \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v11.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v6.16b, v10.16b, v11.16b \n" + "114: \n" + "LD1 {v10.2d-v13.2d}, [%[c]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v10.16b, v10.16b \n" + "RBIT v11.16b, v11.16b \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "EOR v10.16b, v10.16b, v0.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v0.1q, v13.1d, v3.1d \n" + "PMULL2 v1.1q, v13.2d, v3.2d \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v2.1q, v13.1d, v3.1d \n" + "PMULL2 v9.1q, v13.2d, v3.2d \n" + "EOR v2.16b, v2.16b, v9.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v8.1q, v12.1d, v4.1d \n" + "PMULL2 v9.1q, v12.2d, v4.2d \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v9.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v2.16b, v2.16b, v12.16b, v9.16b \n" +#else + "EOR v12.16b, v12.16b, v9.16b \n" + "EOR v2.16b, v2.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v8.1q, v11.1d, v5.1d \n" + "PMULL2 v9.1q, v11.2d, v5.2d \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v11.16b, v11.16b, v11.16b, #8 \n" + "PMULL v9.1q, v11.1d, v5.1d \n" + "PMULL2 v11.1q, v11.2d, v5.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v2.16b, v2.16b, v11.16b, v9.16b \n" +#else + "EOR v11.16b, v11.16b, v9.16b \n" + "EOR v2.16b, v2.16b, v11.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v8.1q, v10.1d, v6.1d \n" + "PMULL2 v9.1q, v10.2d, v6.2d \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "EOR v1.16b, v1.16b, v9.16b \n" + "EXT v10.16b, v10.16b, v10.16b, #8 \n" + "PMULL v9.1q, v10.1d, v6.1d \n" + "PMULL2 v10.1q, v10.2d, v6.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v2.16b, v2.16b, v10.16b, v9.16b \n" +#else + "EOR v10.16b, v10.16b, v9.16b \n" + "EOR v2.16b, v2.16b, 
v10.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v9.16b, v0.16b, v1.16b, #8 \n" + "PMULL2 v8.1q, v1.2d, v7.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v9.16b, v9.16b, v2.16b, v8.16b \n" +#else + "EOR v9.16b, v9.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v9.16b, v9.16b, v8.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v8.1q, v9.2d, v7.2d \n" + "MOV v0.D[1], v9.D[0] \n" + "EOR v0.16b, v0.16b, v8.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v14.2d}, [%[c]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v14.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "PMULL v10.1q, v0.1d, v3.1d \n" + "PMULL2 v11.1q, v0.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v0.1d, v12.1d \n" + "PMULL2 v12.1q, v0.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v0.16b, v10.16b, v12.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial cipher text \n" + "EOR v14.16b, v14.16b, v14.16b \n" + "MOV x14, x12 \n" + "ST1 {v14.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[c]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v14.2d}, [%[scratch]] \n" + "RBIT v14.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v14.16b \n" + "PMULL v10.1q, v0.1d, v3.1d \n" + "PMULL2 v11.1q, v0.2d, v3.2d \n" + "EXT v12.16b, v3.16b, v3.16b, #8 \n" + "PMULL v13.1q, v0.1d, v12.1d \n" + "PMULL2 v12.1q, v0.2d, v12.2d \n" + "EOR v12.16b, v12.16b, v13.16b \n" + "EXT v13.16b, v10.16b, v11.16b, #8 \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "# Reduce \n" + "PMULL2 v12.1q, v11.2d, v7.2d \n" + "EOR v13.16b, v13.16b, v12.16b \n" + "PMULL2 v12.1q, v13.2d, v7.2d \n" + "MOV v10.D[1], v13.D[0] \n" + "EOR v0.16b, v10.16b, v12.16b \n" + "120: \n" + "RBIT v0.16b, v0.16b \n" + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[cSz], %x[cSz], #3 \n" + "MOV v10.D[0], %x[aSz] \n" + "MOV v10.D[1], %x[cSz] \n" + "REV64 v10.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v10.16b \n" + "ST1 {v0.16b}, [%[scratch]] \n" + : [cSz] "+r" (cSz), [c] "+r" (c), [aSz] "+r" (aSz), [a] "+r" (a) + : [scratch] "r" (scratch), [h] "m" (gcm->H) + : "cc", "memory", "w12", "w13", "x14", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14" + ); - /* Copy the result (minus last GMULT) into s. */ - XMEMCPY(s, x, sSz); + XMEMCPY(s, scratch, sSz); } - #ifdef WOLFSSL_AES_128 /* internal function : see wc_AesGcmEncrypt */ static int Aes128GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) + const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) { - word32 blocks; - word32 partial; byte counter[AES_BLOCK_SIZE]; - byte initialCounter[AES_BLOCK_SIZE]; - byte x[AES_BLOCK_SIZE]; byte scratch[AES_BLOCK_SIZE]; - /* Noticed different optimization levels treated head of array different. - Some cases was stack pointer plus offset others was a regester containing - address. 
To make uniform for passing in to inline assembly code am using - pointers to the head of each local array. + * Some cases was stack pointer plus offset others was a regester containing + * address. To make uniform for passing in to inline assembly code am using + * pointers to the head of each local array. */ byte* ctr = counter; - byte* iCtr = initialCounter; - byte* xPt = x; - byte* sPt = scratch; - byte* keyPt; /* pointer to handle pointer advencment */ + byte* keyPt = (byte*)aes->key; - XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + XMEMSET(counter, 0, AES_BLOCK_SIZE); if (ivSz == GCM_NONCE_MID_SZ) { - XMEMCPY(initialCounter, iv, ivSz); - initialCounter[AES_BLOCK_SIZE - 1] = 1; + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); - GMULT(initialCounter, aes->gcm.H); - } - XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE); - - - /* Hash in the Additional Authentication Data */ - XMEMSET(x, 0, AES_BLOCK_SIZE); - if (authInSz != 0 && authIn != NULL) { - blocks = authInSz / AES_BLOCK_SIZE; - partial = authInSz % AES_BLOCK_SIZE; - /* do as many blocks as possible */ - while (blocks--) { - xorbuf(x, authIn, AES_BLOCK_SIZE); - GMULT(x, aes->gcm.H); - authIn += AES_BLOCK_SIZE; - } - if (partial != 0) { - XMEMSET(scratch, 0, AES_BLOCK_SIZE); - XMEMCPY(scratch, authIn, partial); - xorbuf(x, scratch, AES_BLOCK_SIZE); - GMULT(x, aes->gcm.H); - } + GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT(counter, aes->gcm.H); } - /* do as many blocks as possible */ - blocks = sz / AES_BLOCK_SIZE; - partial = sz % AES_BLOCK_SIZE; - if (blocks > 0) { - keyPt = (byte*)aes->key; - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v13.2d}, [%[ctr]] \n" - - "#Create vector with the value 1 \n" - "MOVI v14.16b, #1 \n" - "USHR v14.2d, v14.2d, #56 \n" - "EOR v22.16b, v22.16b, v22.16b \n" - "EXT v14.16b, v14.16b, v22.16b, #8\n" - - - /*************************************************** - Get first out block for GHASH using AES encrypt - ***************************************************/ - "REV64 v13.16b, v13.16b \n" /* network order */ - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "ADD v13.4s, v13.4s, v14.4s \n" /* add 1 to counter */ - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "REV64 v13.16b, v13.16b \n" /* revert from network order */ - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "MOV v0.16b, v13.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v16.2d}, %[inY] \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "SUB w11, w11, #1 \n" - "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "MOVI v23.16b, #0x87 \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */ - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "USHR v23.2d, v23.2d, #56 \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v12.2d}, [%[input]], #16 \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "EOR v0.16b, v0.16b, v11.16b \n" - - "EOR v0.16b, v0.16b, v12.16b \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "MOV v15.16b, v0.16b \n" - - "CBZ w11, 1f \n" /* only one block jump to final GHASH */ - - "LD1 {v12.2d}, [%[input]], #16 \n" - - /*************************************************** - 
Interweave GHASH and encrypt if more then 1 block - ***************************************************/ - "2: \n" - "REV64 v13.16b, v13.16b \n" /* network order */ - "EOR v15.16b, v17.16b, v15.16b \n" - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "ADD v13.4s, v13.4s, v14.4s \n" /* add 1 to counter */ - "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */ - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "REV64 v13.16b, v13.16b \n" /* revert from network order */ - "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ - "MOV v0.16b, v13.16b \n" - "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ - "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ - "SUB w11, w11, #1 \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" - "AESE v0.16b, v10.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" - "EOR v0.16b, v0.16b, v11.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" - - "EOR v0.16b, v0.16b, v12.16b \n" - "PMULL v20.1q, v19.1d, v23.1d \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "EOR v19.16b, v18.16b, v20.16b \n" - "MOV v15.16b, v0.16b \n" - "RBIT v17.16b, v19.16b \n" - - "CBZ w11, 1f \n" - "LD1 {v12.2d}, [%[input]], #16 \n" - "B 2b \n" - - /*************************************************** - GHASH on last block - ***************************************************/ - "1: \n" - "EOR v15.16b, v17.16b, v15.16b \n" - "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */ - - "#store current AES counter value \n" - "ST1 {v13.2d}, [%[ctrOut]] \n" - "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ - "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ - "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ - "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ - "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ - "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ - "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ - "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ - "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ - - "#Reduce product from multiplication \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ - "EOR v19.16b, v19.16b, v21.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" - "EOR v18.16b, v18.16b, v21.16b \n" - "PMULL v20.1q, v19.1d, v23.1d \n" - "EOR v19.16b, v18.16b, v20.16b \n" - "RBIT v17.16b, v19.16b \n" - "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */ - - :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in) - ,[xOut] "=r" (xPt),"=m" 
(aes->gcm.H) - :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), - [input] "3" (in) - ,[inX] "4" (xPt), [inY] "m" (aes->gcm.H) - : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" - ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24" - ); - } - - /* take care of partial block sizes leftover */ - if (partial != 0) { - IncrementGcmCounter(counter); - wc_AesEncrypt(aes, counter, scratch); - xorbuf(scratch, in, partial); - XMEMCPY(out, scratch, partial); - - XMEMSET(scratch, 0, AES_BLOCK_SIZE); - XMEMCPY(scratch, out, partial); - xorbuf(x, scratch, AES_BLOCK_SIZE); - GMULT(x, aes->gcm.H); - } - - /* Hash in the lengths of A and C in bits */ - XMEMSET(scratch, 0, AES_BLOCK_SIZE); - FlattenSzInBits(&scratch[0], authInSz); - FlattenSzInBits(&scratch[8], sz); - xorbuf(x, scratch, AES_BLOCK_SIZE); - XMEMCPY(scratch, x, AES_BLOCK_SIZE); - - keyPt = (byte*)aes->key; __asm__ __volatile__ ( - - "LD1 {v16.16b}, [%[tag]] \n" - "LD1 {v17.16b}, %[h] \n" - "RBIT v16.16b, v16.16b \n" - - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */ - "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */ - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */ - "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" - "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */ - "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */ - "LD1 {v0.2d}, [%[ctr]] \n" - - "#Set a register to all 0s using EOR \n" - "EOR v22.16b, v22.16b, v22.16b \n" - "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... 
\n" "MOVI v23.16b, #0x87 \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v17.16b, v17.16b \n" "USHR v23.2d, v23.2d, #56 \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v20.16b, v22.16b, #8 \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "PMULL v20.1q, v19.1d, v23.1d \n" - "EOR v19.16b, v18.16b, v20.16b \n" - "AESE v0.16b, v10.16b \n" - "RBIT v19.16b, v19.16b \n" - "EOR v0.16b, v0.16b, v11.16b \n" - "EOR v19.16b, v19.16b, v0.16b \n" - "STR q19, [%[out]] \n" + "CBZ %w[aSz], 120f \n" - :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr) - :[tag] "0" (sPt), [Key] "1" (keyPt), - [ctr] "2" (iCtr) , [h] "m" (aes->gcm.H) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14", - "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24" + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, 
v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + "PMULL2 v14.1q, v30.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Encrypt plaintext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 
v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# Should we do 8 blocks at a time? \n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v10.1d, v16.1d \n" + "PMULL2 v19.1q, v10.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v10.1d, v20.1d \n" + "PMULL2 v20.1q, v10.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v19.16b \n" + + "# First encrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, 
w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + 
"AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], 
w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef 
WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" +#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE 
v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + "BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, 
v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" +#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "# Can we do 4 blocks at a time? 
\n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First encrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "EOR v27.16b, v27.16b, v11.16b \n" + "AESE v28.16b, v10.16b \n" + "EOR v28.16b, v28.16b, v11.16b \n" + "AESE v29.16b, v10.16b \n" + "EOR v29.16b, v29.16b, v11.16b \n" + "AESE v30.16b, v10.16b \n" + "EOR v30.16b, v30.16b, v11.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + 
"RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef 
WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "EOR v27.16b, v27.16b, v11.16b \n" + "AESE v28.16b, v10.16b \n" + "EOR v28.16b, v28.16b, v11.16b \n" + "AESE v29.16b, v10.16b \n" + "EOR v29.16b, v29.16b, v11.16b \n" + "AESE v30.16b, v10.16b \n" + "EOR v30.16b, v30.16b, v11.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, 
v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + + "10: \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "# Encrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v31.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + + "# When only one full block to encrypt go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + + "# Interweave GHASH and encrypt if more then 1 block \n" + "2: \n" + "RBIT v15.16b, v15.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + "B 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, 
v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "MOV x15, #16 \n" + "EOR w14, w14, w14 \n" + "SUB x15, x15, x11 \n" + "25: \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 25b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "30: \n" + "# store current counter value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.2d}, [%[ctr]] \n" + "ST1 {v22.2d}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v15.d[0], %x[aSz] \n" + "MOV v15.d[1], %x[sz] \n" + "REV64 v15.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, 
v0.16b, v11.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "ST1 {v0.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, %x[tagSz] \n" + "44: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[tag]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 44b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "41: \n" + + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "w11", "w12", "w13", "w14", "w15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); - if (authTagSz > AES_BLOCK_SIZE) { - XMEMCPY(authTag, scratch, AES_BLOCK_SIZE); - } - else { - /* authTagSz can be smaller than AES_BLOCK_SIZE */ - XMEMCPY(authTag, scratch, authTagSz); - } return 0; } #endif /* WOLFSSL_AES_128 */ - #ifdef WOLFSSL_AES_192 /* internal function : see wc_AesGcmEncrypt */ static int Aes192GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) + const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) { - word32 blocks; - word32 partial; byte counter[AES_BLOCK_SIZE]; - byte initialCounter[AES_BLOCK_SIZE]; - byte x[AES_BLOCK_SIZE]; byte scratch[AES_BLOCK_SIZE]; - /* Noticed different optimization levels treated head of array different. - Some cases was stack pointer plus offset others was a regester containing - address. To make uniform for passing in to inline assembly code am using - pointers to the head of each local array. + * Some cases was stack pointer plus offset others was a regester containing + * address. To make uniform for passing in to inline assembly code am using + * pointers to the head of each local array. 
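Near the end of the AES-128 body above, the tag computation folds in one last GHASH block built from the bit lengths (the LSL #3, MOV v15.d[0]/d[1] and REV64 steps; the following RBIT only moves it into the reflected GHASH domain). A plain-C sketch of that length block, using wolfCrypt's word64 type:

/* Sketch: final GHASH input = len(AAD) || len(C), each a 64-bit
 * big-endian count of bits, mirroring the LSL/MOV/REV64 sequence. */
static void gcm_lengths_block_sketch(byte block[AES_BLOCK_SIZE],
                                     word32 aSz, word32 sz)
{
    word64 aBits = (word64)aSz * 8;
    word64 cBits = (word64)sz  * 8;
    int i;

    for (i = 0; i < 8; i++) {
        block[i]     = (byte)(aBits >> (56 - 8 * i));
        block[8 + i] = (byte)(cBits >> (56 - 8 * i));
    }
}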
*/ byte* ctr = counter; - byte* iCtr = initialCounter; - byte* xPt = x; - byte* sPt = scratch; - byte* keyPt; /* pointer to handle pointer advencment */ + byte* keyPt = (byte*)aes->key; - XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + XMEMSET(counter, 0, AES_BLOCK_SIZE); if (ivSz == GCM_NONCE_MID_SZ) { - XMEMCPY(initialCounter, iv, ivSz); - initialCounter[AES_BLOCK_SIZE - 1] = 1; + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); - GMULT(initialCounter, aes->gcm.H); - } - XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE); - - - /* Hash in the Additional Authentication Data */ - XMEMSET(x, 0, AES_BLOCK_SIZE); - if (authInSz != 0 && authIn != NULL) { - blocks = authInSz / AES_BLOCK_SIZE; - partial = authInSz % AES_BLOCK_SIZE; - /* do as many blocks as possible */ - while (blocks--) { - xorbuf(x, authIn, AES_BLOCK_SIZE); - GMULT(x, aes->gcm.H); - authIn += AES_BLOCK_SIZE; - } - if (partial != 0) { - XMEMSET(scratch, 0, AES_BLOCK_SIZE); - XMEMCPY(scratch, authIn, partial); - xorbuf(x, scratch, AES_BLOCK_SIZE); - GMULT(x, aes->gcm.H); - } + GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT(counter, aes->gcm.H); } - /* do as many blocks as possible */ - blocks = sz / AES_BLOCK_SIZE; - partial = sz % AES_BLOCK_SIZE; - if (blocks > 0) { - keyPt = (byte*)aes->key; - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v13.2d}, [%[ctr]] \n" - - "#Create vector with the value 1 \n" - "MOVI v14.16b, #1 \n" - "USHR v14.2d, v14.2d, #56 \n" - "EOR v22.16b, v22.16b, v22.16b \n" - "EXT v14.16b, v14.16b, v22.16b, #8\n" - - - /*************************************************** - Get first out block for GHASH using AES encrypt - ***************************************************/ - "REV64 v13.16b, v13.16b \n" /* network order */ - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "ADD v13.4s, v13.4s, v14.4s \n" /* add 1 to counter */ - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "REV64 v13.16b, v13.16b \n" /* revert from network order */ - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "MOV v0.16b, v13.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v16.2d}, %[inY] \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "SUB w11, w11, #1 \n" - "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" - "LD1 {v30.2d-v31.2d}, [%[Key]], #32\n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "MOVI v23.16b, #0x87 \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */ - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "USHR v23.2d, v23.2d, #56 \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v12.2d}, [%[input]], #16 \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v30.16b \n" - "EOR v0.16b, v0.16b, v31.16b \n" - - "EOR v0.16b, v0.16b, v12.16b \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "MOV v15.16b, v0.16b \n" - - "CBZ w11, 1f \n" /* only one block jump to final GHASH */ - "LD1 {v12.2d}, [%[input]], #16 \n" - - /*************************************************** - Interweave GHASH and encrypt if more then 1 block - ***************************************************/ - "2: \n" - "REV64 v13.16b, v13.16b \n" /* network 
order */ - "EOR v15.16b, v17.16b, v15.16b \n" - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "ADD v13.4s, v13.4s, v14.4s \n" /* add 1 to counter */ - "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */ - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "REV64 v13.16b, v13.16b \n" /* revert from network order */ - "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ - "MOV v0.16b, v13.16b \n" - "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ - "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ - "SUB w11, w11, #1 \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" - "AESE v0.16b, v30.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" - "EOR v0.16b, v0.16b, v31.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" - - "EOR v0.16b, v0.16b, v12.16b \n" - "PMULL v20.1q, v19.1d, v23.1d \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "EOR v19.16b, v18.16b, v20.16b \n" - "MOV v15.16b, v0.16b \n" - "RBIT v17.16b, v19.16b \n" - - "CBZ w11, 1f \n" - "LD1 {v12.2d}, [%[input]], #16 \n" - "B 2b \n" - - /*************************************************** - GHASH on last block - ***************************************************/ - "1: \n" - "EOR v15.16b, v17.16b, v15.16b \n" - "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */ - - "#store current AES counter value \n" - "ST1 {v13.2d}, [%[ctrOut]] \n" - "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ - "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ - "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ - "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ - "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ - "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ - "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ - "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ - "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ - - "#Reduce product from multiplication \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ - "EOR v19.16b, v19.16b, v21.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" - "EOR v18.16b, v18.16b, v21.16b \n" - "PMULL v20.1q, v19.1d, v23.1d \n" - "EOR v19.16b, v18.16b, v20.16b \n" - "RBIT v17.16b, v19.16b \n" - "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */ - - :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in) - ,[xOut] "=r" (xPt),"=m" (aes->gcm.H) - :"0" (out), [Key] "1" (keyPt), [ctr] 
"2" (ctr), [blocks] "r" (blocks), - [input] "3" (in) - ,[inX] "4" (xPt), [inY] "m" (aes->gcm.H) - : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" - ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24","v25","v26","v27","v28","v29","v30","v31" - ); - } - - /* take care of partial block sizes leftover */ - if (partial != 0) { - IncrementGcmCounter(counter); - wc_AesEncrypt(aes, counter, scratch); - xorbuf(scratch, in, partial); - XMEMCPY(out, scratch, partial); - - XMEMSET(scratch, 0, AES_BLOCK_SIZE); - XMEMCPY(scratch, out, partial); - xorbuf(x, scratch, AES_BLOCK_SIZE); - GMULT(x, aes->gcm.H); - } - - /* Hash in the lengths of A and C in bits */ - XMEMSET(scratch, 0, AES_BLOCK_SIZE); - FlattenSzInBits(&scratch[0], authInSz); - FlattenSzInBits(&scratch[8], sz); - xorbuf(x, scratch, AES_BLOCK_SIZE); - XMEMCPY(scratch, x, AES_BLOCK_SIZE); - - keyPt = (byte*)aes->key; __asm__ __volatile__ ( - - "LD1 {v16.16b}, [%[tag]] \n" - "LD1 {v17.16b}, %[h] \n" - "RBIT v16.16b, v16.16b \n" - - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */ - "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */ - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */ - "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" - "LD1 {v30.2d-v31.2d}, [%[Key]], #32\n" - "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */ - "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */ - "LD1 {v0.2d}, [%[ctr]] \n" - - "#Set a register to all 0s using EOR \n" - "EOR v22.16b, v22.16b, v22.16b \n" - "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... 
\n" "MOVI v23.16b, #0x87 \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v17.16b, v17.16b \n" "USHR v23.2d, v23.2d, #56 \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v20.16b, v22.16b, #8 \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "PMULL v20.1q, v19.1d, v23.1d \n" - "EOR v19.16b, v18.16b, v20.16b \n" - "AESE v0.16b, v30.16b \n" - "RBIT v19.16b, v19.16b \n" - "EOR v0.16b, v0.16b, v31.16b \n" - "EOR v19.16b, v19.16b, v0.16b \n" - "STR q19, [%[out]] \n" + "CBZ %w[aSz], 120f \n" - :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr) - :[tag] "0" (sPt), [Key] "1" (keyPt), - [ctr] "2" (iCtr) , [h] "m" (aes->gcm.H) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14", - "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23","v24" + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* 
WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + "PMULL2 v14.1q, v30.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Encrypt plaintext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, 
v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# Should we do 8 blocks at a time? \n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v10.1d, v16.1d \n" + "PMULL2 v19.1q, v10.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v10.1d, v20.1d \n" + "PMULL2 v20.1q, v10.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v19.16b \n" + + "# First encrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b 
\n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC 
v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + 
"RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b 
\n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" +#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], 
v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR 
v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + "BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* 
WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" +#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "LD1 {v12.2d-v13.2d}, [%[Key]], #32 \n" + "# Can we do 4 blocks at a time? \n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First encrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE 
v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v12.16b \n" + "EOR v27.16b, v27.16b, v13.16b \n" + "AESE v28.16b, v12.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" + "AESE v29.16b, v12.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" + "AESE v30.16b, v12.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d 
\n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + 
"AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v12.16b \n" + "EOR v27.16b, v27.16b, v13.16b \n" + "AESE v28.16b, v12.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" + "AESE v29.16b, v12.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" + "AESE v30.16b, v12.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + + "10: \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "# Encrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE 
v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v31.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + + "# When only one full block to encrypt go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + + "# Interweave GHASH and encrypt if more then 1 block \n" + "2: \n" + "RBIT v15.16b, v15.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + "B 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE 
v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "MOV x15, #16 \n" + "EOR w14, w14, w14 \n" + "SUB x15, x15, x11 \n" + "25: \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 25b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "30: \n" + "# store current counter value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.2d}, [%[ctr]] \n" + "ST1 {v22.2d}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v15.d[0], %x[aSz] \n" + "MOV v15.d[1], %x[sz] \n" + "REV64 v15.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "ST1 {v0.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, %x[tagSz] \n" + "44: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[tag]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 44b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "41: \n" + + : [out] "+r" (out), [input] "+r" (in), 
[Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "w11", "w12", "w13", "w14", "w15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); - if (authTagSz > AES_BLOCK_SIZE) { - XMEMCPY(authTag, scratch, AES_BLOCK_SIZE); - } - else { - /* authTagSz can be smaller than AES_BLOCK_SIZE */ - XMEMCPY(authTag, scratch, authTagSz); - } - return 0; } #endif /* WOLFSSL_AES_192 */ - #ifdef WOLFSSL_AES_256 /* internal function : see wc_AesGcmEncrypt */ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) + const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) { - word32 blocks; - word32 partial; byte counter[AES_BLOCK_SIZE]; - byte initialCounter[AES_BLOCK_SIZE]; - byte x[AES_BLOCK_SIZE]; byte scratch[AES_BLOCK_SIZE]; - /* Noticed different optimization levels treated head of array different. - Some cases was stack pointer plus offset others was a regester containing - address. To make uniform for passing in to inline assembly code am using - pointers to the head of each local array. + * Some cases was stack pointer plus offset others was a regester containing + * address. To make uniform for passing in to inline assembly code am using + * pointers to the head of each local array. */ byte* ctr = counter; - byte* iCtr = initialCounter; - byte* xPt = x; - byte* sPt = scratch; - byte* keyPt; /* pointer to handle pointer advencment */ + byte* keyPt = (byte*)aes->key; - XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); + XMEMSET(counter, 0, AES_BLOCK_SIZE); if (ivSz == GCM_NONCE_MID_SZ) { - XMEMCPY(initialCounter, iv, ivSz); - initialCounter[AES_BLOCK_SIZE - 1] = 1; + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[AES_BLOCK_SIZE - 1] = 1; } else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); - GMULT(initialCounter, aes->gcm.H); - } - XMEMCPY(counter, initialCounter, AES_BLOCK_SIZE); - - - /* Hash in the Additional Authentication Data */ - XMEMSET(x, 0, AES_BLOCK_SIZE); - if (authInSz != 0 && authIn != NULL) { - blocks = authInSz / AES_BLOCK_SIZE; - partial = authInSz % AES_BLOCK_SIZE; - /* do as many blocks as possible */ - while (blocks--) { - xorbuf(x, authIn, AES_BLOCK_SIZE); - GMULT(x, aes->gcm.H); - authIn += AES_BLOCK_SIZE; - } - if (partial != 0) { - XMEMSET(scratch, 0, AES_BLOCK_SIZE); - XMEMCPY(scratch, authIn, partial); - xorbuf(x, scratch, AES_BLOCK_SIZE); - GMULT(x, aes->gcm.H); - } + GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT(counter, aes->gcm.H); } - /* do as many blocks as possible */ - blocks = sz / AES_BLOCK_SIZE; - partial = sz % AES_BLOCK_SIZE; - if (blocks > 0) { - keyPt = (byte*)aes->key; - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v13.2d}, [%[ctr]] \n" - - "#Create vector with the value 1 \n" - "MOVI v14.16b, #1 \n" - "USHR v14.2d, v14.2d, #56 \n" - "EOR v22.16b, v22.16b, v22.16b \n" - "EXT v14.16b, v14.16b, v22.16b, #8\n" - - - /*************************************************** - Get first out block for GHASH using AES encrypt - ***************************************************/ - "REV64 v13.16b, v13.16b \n" /* 
network order */ - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "ADD v13.4s, v13.4s, v14.4s \n" /* add 1 to counter */ - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "REV64 v13.16b, v13.16b \n" /* revert from network order */ - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "MOV v0.16b, v13.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v16.2d}, %[inY] \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "SUB w11, w11, #1 \n" - "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" - "LD1 {v28.2d-v31.2d}, [%[Key]], #64\n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "MOVI v23.16b, #0x87 \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v17.2d}, [%[inX]] \n" /* account for additional data */ - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "USHR v23.2d, v23.2d, #56 \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "LD1 {v12.2d}, [%[input]], #16 \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v28.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v29.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v30.16b \n" - "EOR v0.16b, v0.16b, v31.16b \n" - - "EOR v0.16b, v0.16b, v12.16b \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "MOV v15.16b, v0.16b \n" - - "CBZ w11, 1f \n" /* only one block jump to final GHASH */ - "LD1 {v12.2d}, [%[input]], #16 \n" - - /*************************************************** - Interweave GHASH and encrypt if more then 1 block - ***************************************************/ - "2: \n" - "REV64 v13.16b, v13.16b \n" /* network order */ - "EOR v15.16b, v17.16b, v15.16b \n" - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "ADD v13.4s, v13.4s, v14.4s \n" /* add 1 to counter */ - "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block (c) */ - "EXT v13.16b, v13.16b, v13.16b, #8 \n" - "REV64 v13.16b, v13.16b \n" /* revert from network order */ - "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ - "MOV v0.16b, v13.16b \n" - "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ - "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ - "SUB w11, w11, #1 \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v28.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v29.16b \n" - "AESMC v0.16b, 
v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" - "AESE v0.16b, v30.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" - "EOR v0.16b, v0.16b, v31.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" - - "EOR v0.16b, v0.16b, v12.16b \n" - "PMULL v20.1q, v19.1d, v23.1d \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - "EOR v19.16b, v18.16b, v20.16b \n" - "MOV v15.16b, v0.16b \n" - "RBIT v17.16b, v19.16b \n" - - "CBZ w11, 1f \n" - "LD1 {v12.2d}, [%[input]], #16 \n" - "B 2b \n" - - /*************************************************** - GHASH on last block - ***************************************************/ - "1: \n" - "EOR v15.16b, v17.16b, v15.16b \n" - "RBIT v15.16b, v15.16b \n" /* v15 is encrypted out block */ - - "#store current AES counter value \n" - "ST1 {v13.2d}, [%[ctrOut]] \n" - "PMULL v18.1q, v15.1d, v16.1d \n" /* a0 * b0 = C */ - "PMULL2 v19.1q, v15.2d, v16.2d \n" /* a1 * b1 = D */ - "EXT v20.16b, v16.16b, v16.16b, #8 \n" /* b0b1 -> b1b0 */ - "PMULL v21.1q, v15.1d, v20.1d \n" /* a0 * b1 = E */ - "PMULL2 v20.1q, v15.2d, v20.2d \n" /* a1 * b0 = F */ - "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ - "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ - "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ - "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ - - "#Reduce product from multiplication \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* v22 is all 0's */ - "EOR v19.16b, v19.16b, v21.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" - "EOR v18.16b, v18.16b, v21.16b \n" - "PMULL v20.1q, v19.1d, v23.1d \n" - "EOR v19.16b, v18.16b, v20.16b \n" - "RBIT v17.16b, v19.16b \n" - "STR q17, [%[xOut]] \n" /* GHASH x value for partial blocks */ - - :[out] "=r" (out), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (in) - ,[xOut] "=r" (xPt),"=m" (aes->gcm.H) - :"0" (out), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), - [input] "3" (in) - ,[inX] "4" (xPt), [inY] "m" (aes->gcm.H) - : "cc", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" - ,"v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24" - ); - } - - /* take care of partial block sizes leftover */ - if (partial != 0) { - IncrementGcmCounter(counter); - wc_AesEncrypt(aes, counter, scratch); - xorbuf(scratch, in, partial); - XMEMCPY(out, scratch, partial); - - XMEMSET(scratch, 0, AES_BLOCK_SIZE); - XMEMCPY(scratch, out, partial); - xorbuf(x, scratch, AES_BLOCK_SIZE); - GMULT(x, aes->gcm.H); - } - - /* Hash in the lengths of A and C in bits */ - XMEMSET(scratch, 0, AES_BLOCK_SIZE); - FlattenSzInBits(&scratch[0], authInSz); - FlattenSzInBits(&scratch[8], sz); - xorbuf(x, scratch, AES_BLOCK_SIZE); - XMEMCPY(scratch, x, AES_BLOCK_SIZE); - - keyPt = (byte*)aes->key; __asm__ __volatile__ ( - - "LD1 {v16.16b}, [%[tag]] \n" - "LD1 {v17.16b}, %[h] \n" - "RBIT v16.16b, v16.16b \n" - - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - "PMULL v18.1q, v16.1d, v17.1d \n" /* a0 * b0 = C */ - "PMULL2 v19.1q, v16.2d, v17.2d \n" /* a1 * b1 = D */ - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "EXT v20.16b, v17.16b, v17.16b, #8 \n" /* b0b1 -> b1b0 */ - "LD1 {v9.2d-v11.2d}, [%[Key]], #48\n" - "LD1 {v28.2d-v31.2d}, [%[Key]], #64\n" - "PMULL v21.1q, v16.1d, v20.1d \n" /* a0 * b1 = E */ - "PMULL2 v20.1q, v16.2d, v20.2d \n" /* a1 * b0 = F */ - "LD1 {v0.2d}, [%[ctr]] \n" - - "#Set a register to all 0s using EOR \n" - "EOR v22.16b, v22.16b, v22.16b \n" - "EOR v20.16b, v20.16b, v21.16b \n" /* F ^ E */ - 
"AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" /* get (F^E)[0] */ - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" /* low 128 bits in v3 */ - "EXT v21.16b, v20.16b, v22.16b, #8 \n" /* get (F^E)[1] */ - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" /* high 128 bits in v4 */ + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... \n" "MOVI v23.16b, #0x87 \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v17.16b, v17.16b \n" "USHR v23.2d, v23.2d, #56 \n" - "PMULL2 v20.1q, v19.2d, v23.2d \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v20.16b, v22.16b, #8 \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v19.16b, v19.16b, v21.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "EXT v21.16b, v22.16b, v20.16b, #8 \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "EOR v18.16b, v18.16b, v21.16b \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v28.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v29.16b \n" - "AESMC v0.16b, v0.16b \n" - "PMULL v20.1q, v19.1d, v23.1d \n" - "EOR v19.16b, v18.16b, v20.16b \n" - "AESE v0.16b, v30.16b \n" - "RBIT v19.16b, v19.16b \n" - "EOR v0.16b, v0.16b, v31.16b \n" - "EOR v19.16b, v19.16b, v0.16b \n" - "STR q19, [%[out]] \n" + "CBZ %w[aSz], 120f \n" - :[out] "=r" (sPt), "=r" (keyPt), "=r" (iCtr) - :[tag] "0" (sPt), [Key] "1" (keyPt), - [ctr] "2" (iCtr) , [h] "m" (aes->gcm.H) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10","v11","v12","v13","v14", - "v15", "v16", "v17","v18", "v19", "v20","v21","v22","v23", - "v24","v25","v26","v27","v28","v29","v30","v31" + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d 
\n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + "PMULL2 v14.1q, v30.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, 
v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Encrypt plaintext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# Should we do 8 blocks at a time? \n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v10.1d, v16.1d \n" + "PMULL2 v19.1q, v10.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v10.1d, v20.1d \n" + "PMULL2 v20.1q, v10.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v19.16b \n" + + "# First 
encrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC 
v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #192] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #208] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b 
\n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" 
+ "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" 
+#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b 
\n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #192] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #208] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v12.16b, v12.16b, v5.16b \n" + "EOR v13.16b, v13.16b, v6.16b \n" + "EOR v14.16b, v14.16b, v7.16b \n" + "EOR v15.16b, v15.16b, v8.16b \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "ST1 {v12.2d-v15.2d}, [%[out]], #64 \n \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + "BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, 
v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v9.1d \n" + "PMULL2 v3.1q, v15.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v9.1d \n" + "PMULL2 v15.1q, v15.2d, v9.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v10.1d \n" + "PMULL2 v3.1q, v14.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v10.1d \n" + "PMULL2 v14.1q, v14.2d, v10.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v11.1d \n" + "PMULL2 v3.1q, v13.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v11.1d \n" + "PMULL2 v13.1q, v13.2d, v11.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" +#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v4.1d \n" + "PMULL2 v3.1q, v12.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v4.1d \n" + "PMULL2 v12.1q, v12.2d, v4.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* 
WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "LD1 {v12.2d-v13.2d}, [%[Key]], #32 \n" + "# Can we do 4 blocks at a time? \n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First encrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC 
v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v12.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v12.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v12.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v12.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + "AESE v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v13.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v14.16b \n" + "EOR v27.16b, v27.16b, v15.16b \n" + "AESE v28.16b, v14.16b \n" + "EOR v28.16b, v28.16b, v15.16b \n" + "AESE v29.16b, v14.16b \n" + "EOR v29.16b, v29.16b, v15.16b \n" + "AESE v30.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, 
v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v12.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v12.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v12.16b \n" + "AESMC 
v29.16b, v29.16b \n" + "AESE v30.16b, v12.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + "AESE v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v13.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v14.16b \n" + "EOR v27.16b, v27.16b, v15.16b \n" + "AESE v28.16b, v14.16b \n" + "EOR v28.16b, v28.16b, v15.16b \n" + "AESE v29.16b, v14.16b \n" + "EOR v29.16b, v29.16b, v15.16b \n" + "AESE v30.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + + "# XOR in input \n" + "EOR v18.16b, v18.16b, v27.16b \n" + "EOR v19.16b, v19.16b, v28.16b \n" + "EOR v20.16b, v20.16b, v29.16b \n" + "EOR v21.16b, v21.16b, v30.16b \n" + "# Store cipher text \n" + "ST1 {v18.2d-v21.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + + "10: \n" + "SUB %[Key], %[Key], #32 \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "# Encrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b 
\n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v31.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + + "# When only one full block to encrypt go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + + "# Interweave GHASH and encrypt if more then 1 block \n" + "2: \n" + "RBIT v15.16b, v15.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[out]], #16 \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "LD1 {v31.2d}, [%[input]], #16 \n" + "B 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, 
v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v15.16b, v0.16b, v31.16b \n \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "MOV x15, #16 \n" + "EOR w14, w14, w14 \n" + "SUB x15, x15, x11 \n" + "25: \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 25b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "30: \n" + "# store current counter value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.2d}, [%[ctr]] \n" + "ST1 {v22.2d}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v15.d[0], %x[aSz] \n" + "MOV v15.d[1], %x[sz] \n" + "REV64 v15.16b, v15.16b \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" 
+ "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]], #32 \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v12.2d, v13.2d}, [%[Key]] \n" + "SUB %[Key], %[Key], #32 \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "ST1 {v0.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, %x[tagSz] \n" + "44: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[tag]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 44b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "41: \n" + + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "w11", "w12", "w13", "w14", "w15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" ); - if (authTagSz > AES_BLOCK_SIZE) { - XMEMCPY(authTag, scratch, AES_BLOCK_SIZE); - } - else { - /* authTagSz can be smaller than AES_BLOCK_SIZE */ - XMEMCPY(authTag, scratch, authTagSz); - } - return 0; } #endif /* WOLFSSL_AES_256 */ - /* aarch64 with PMULL and PMULL2 * Encrypt and tag data using AES with GCM mode. 
* aes: Aes structure having already been set with set key function @@ -2597,20 +7199,17 @@ static int Aes256GcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, * Algorithm 5 */ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) + const byte* iv, word32 ivSz, byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) { /* sanity checks */ - if (aes == NULL || (iv == NULL && ivSz > 0) || - (authTag == NULL) || - (authIn == NULL && authInSz > 0) || - (ivSz == 0)) { + if ((aes == NULL) || (iv == NULL && ivSz > 0) || (authTag == NULL) || + ((authIn == NULL) && (authInSz > 0)) || (ivSz == 0)) { WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); return BAD_FUNC_ARG; } - if (authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ || authTagSz > AES_BLOCK_SIZE) { + if ((authTagSz < WOLFSSL_MIN_AUTH_TAG_SZ) || (authTagSz > AES_BLOCK_SIZE)) { WOLFSSL_MSG("GcmEncrypt authTagSz error"); return BAD_FUNC_ARG; } @@ -2637,8 +7236,5315 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, } } - #ifdef HAVE_AES_DECRYPT +#ifdef WOLFSSL_AES_128 +/* internal function : see wc_AesGcmDecrypt */ +static int Aes128GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + byte counter[AES_BLOCK_SIZE]; + byte scratch[AES_BLOCK_SIZE]; + byte *ctr = counter; + byte* keyPt = (byte*)aes->key; + int ret = 0; + + XMEMSET(counter, 0, AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT(counter, aes->gcm.H); + } + + __asm__ __volatile__ ( + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... 
\n" + "MOVI v23.16b, #0x87 \n" + "EOR v17.16b, v17.16b, v17.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "CBZ %w[aSz], 120f \n" + + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + "PMULL2 v14.1q, v30.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 
v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Decrypt ciphertext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# Should we do 8 blocks at a time? 
\n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v9.1d, v16.1d \n" + "PMULL2 v19.1q, v9.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v9.1d, v20.1d \n" + "PMULL2 v20.1q, v9.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v19.16b \n" + + "# First decrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC 
v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" 
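+        /* AES-128 tail of this 8-block pass (descriptive note): the LDP from
+         * [%[Key], #144] above placed the last two round keys in q22/q31, so
+         * each counter block finishes with a final AESE against v22 followed
+         * by an EOR with v31. The resulting keystream blocks are then XORed
+         * with the ciphertext loaded into v12-v15 and v18-v21 to recover the
+         * plaintext, which is stored to [%[out]]. */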
+ "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, 
v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" +#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v22.16b \n" + 
"AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 v12.1q, v12.2d, v11.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + 
"AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + "BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* 
WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" +#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 v12.1q, v12.2d, v11.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "# Can we do 4 blocks at a time? 
\n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First decrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "EOR v27.16b, v27.16b, v11.16b \n" + "AESE v28.16b, v10.16b \n" + "EOR v28.16b, v28.16b, v11.16b \n" + "AESE v29.16b, v10.16b \n" + "EOR v29.16b, v29.16b, v11.16b \n" + "AESE v30.16b, v10.16b \n" + "EOR v30.16b, v30.16b, v11.16b \n" + + "# XOR in input \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "# Store cipher text \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + 
"RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef 
WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "EOR v27.16b, v27.16b, v11.16b \n" + "AESE v28.16b, v10.16b \n" + "EOR v28.16b, v28.16b, v11.16b \n" + "AESE v29.16b, v10.16b \n" + "EOR v29.16b, v29.16b, v11.16b \n" + "AESE v30.16b, v10.16b \n" + "EOR v30.16b, v30.16b, v11.16b \n" + + "# XOR in input \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "# Store cipher text \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, 
v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + + "10: \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "# Decrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v28.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v0.16b, v0.16b, v28.16b \n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "# When only one full block to decrypt go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "# Interweave GHASH and decrypt if more then 1 block \n" + "2: \n" + "RBIT v28.16b, v28.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v28.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "LD1 {v28.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "EOR v0.16b, v0.16b, v28.16b \n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "CMP w11, #16 \n" + "BGE 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 
v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "RBIT v31.16b, v31.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v31.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "RBIT v31.16b, v31.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "EOR v0.16b, v0.16b, v31.16b \n \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "SUB %[scratch], %[scratch], x11 \n" + + "30: \n" + "# store current counter value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.16b}, [%[ctr]] \n" + "ST1 {v22.16b}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v28.d[0], %x[aSz] \n" + "MOV v28.d[1], %x[sz] \n" + "REV64 v28.16b, v28.16b \n" + "RBIT v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "EOR v0.16b, v0.16b, v11.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "LD1 {v1.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "EOR v1.16b, v1.16b, v1.16b \n" + "MOV 
x15, %x[tagSz] \n" + "ST1 {v1.2d}, [%[scratch]] \n" + "43: \n" + "LDRB w14, [%[tag]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 43b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "LD1 {v1.2d}, [%[scratch]] \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV w14, #16 \n" + "SUB w14, w14, %w[tagSz] \n" + "ADD %[scratch], %[scratch], %x[tagSz] \n" + "44: \n" + "STRB wzr, [%[scratch]], #1 \n" + "SUB w14, w14, #1 \n" + "CBNZ w14, 44b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v0.2d}, [%[scratch]] \n" + "41: \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "MOV v1.D[0], v0.D[1] \n" + "EOR v0.8b, v0.8b, v1.8b \n" + "MOV %x[ret], v0.D[0] \n" + "CMP %x[ret], #0 \n" + "MOV w11, #-180 \n" + "CSETM %w[ret], ne \n" + "AND %w[ret], %w[ret], w11 \n" + + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn), + [ret] "+r" (ret) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "memory", "w11", "w12", "w13", "w14", "w15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + return ret; +} +#endif /* WOLFSSL_AES_128 */ +#ifdef WOLFSSL_AES_192 +/* internal function : see wc_AesGcmDecrypt */ +static int Aes192GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + byte counter[AES_BLOCK_SIZE]; + byte scratch[AES_BLOCK_SIZE]; + byte *ctr = counter; + byte* keyPt = (byte*)aes->key; + int ret = 0; + + XMEMSET(counter, 0, AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT(counter, aes->gcm.H); + } + + __asm__ __volatile__ ( + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... 
\n" + "MOVI v23.16b, #0x87 \n" + "EOR v17.16b, v17.16b, v17.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "CBZ %w[aSz], 120f \n" + + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + "PMULL2 v14.1q, v30.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 
v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Decrypt ciphertext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# Should we do 8 blocks at a time? 
\n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v9.1d, v16.1d \n" + "PMULL2 v19.1q, v9.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v9.1d, v20.1d \n" + "PMULL2 v20.1q, v9.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v19.16b \n" + + "# First decrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC 
v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, 
v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], 
w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, 
v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" +#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 v12.1q, v12.2d, v11.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b 
\n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + "BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, 
v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" +#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 v12.1q, v12.2d, v11.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" +#ifdef 
WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "LD1 {v12.2d-v13.2d}, [%[Key]], #32 \n" + "# Can we do 4 blocks at a time? \n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First decrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b 
\n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v12.16b \n" + "EOR v27.16b, v27.16b, v13.16b \n" + "AESE v28.16b, v12.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" + "AESE v29.16b, v12.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" + "AESE v30.16b, v12.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" + + "# XOR in input \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "# Store cipher text \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, 
v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v12.16b \n" + "EOR v27.16b, v27.16b, v13.16b \n" + "AESE v28.16b, v12.16b \n" + "EOR v28.16b, v28.16b, v13.16b \n" + "AESE v29.16b, v12.16b \n" + "EOR v29.16b, v29.16b, v13.16b \n" + "AESE v30.16b, v12.16b \n" + "EOR v30.16b, v30.16b, v13.16b \n" + + "# XOR in input \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "# Store cipher text \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 
v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + + "10: \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "# Decrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v28.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v0.16b, v0.16b, v28.16b \n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "# When only one full block to decrypt, go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "# Interweave GHASH and decrypt if more than 1 block \n" + "2: \n" + "RBIT v28.16b, v28.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v28.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + 
"AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "LD1 {v28.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR v0.16b, v0.16b, v28.16b \n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "CMP w11, #16 \n" + "BGE 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "RBIT v31.16b, v31.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v31.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "RBIT v31.16b, v31.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "EOR 
v0.16b, v0.16b, v31.16b \n \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "SUB %[scratch], %[scratch], x11 \n" + + "30: \n" + "# store current counter value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.16b}, [%[ctr]] \n" + "ST1 {v22.16b}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v28.d[0], %x[aSz] \n" + "MOV v28.d[1], %x[sz] \n" + "REV64 v28.16b, v28.16b \n" + "RBIT v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "EOR v0.16b, v0.16b, v13.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "LD1 {v1.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "EOR v1.16b, v1.16b, v1.16b \n" + "MOV x15, %x[tagSz] \n" + "ST1 {v1.2d}, [%[scratch]] \n" + "43: \n" + "LDRB w14, [%[tag]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 43b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "LD1 {v1.2d}, [%[scratch]] \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV w14, #16 \n" + "SUB w14, w14, %w[tagSz] \n" + "ADD %[scratch], %[scratch], %x[tagSz] \n" + "44: \n" + "STRB wzr, [%[scratch]], #1 \n" + "SUB w14, w14, #1 \n" + "CBNZ w14, 44b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v0.2d}, [%[scratch]] \n" + "41: \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "MOV v1.D[0], v0.D[1] \n" + "EOR v0.8b, v0.8b, v1.8b \n" + "MOV %x[ret], v0.D[0] \n" + "CMP %x[ret], #0 \n" + "MOV w11, #-180 \n" + "CSETM %w[ret], ne \n" + "AND %w[ret], %w[ret], w11 \n" + + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn), + [ret] "+r" (ret) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "memory", "w11", "w12", "w13", "w14", "w15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + return ret; +} +#endif /* WOLFSSL_AES_192 */ +#ifdef WOLFSSL_AES_256 +/* internal function : see wc_AesGcmDecrypt */ +static int Aes256GcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, 
const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) +{ + byte counter[AES_BLOCK_SIZE]; + byte scratch[AES_BLOCK_SIZE]; + byte *ctr = counter; + byte* keyPt = (byte*)aes->key; + int ret = 0; + + XMEMSET(counter, 0, AES_BLOCK_SIZE); + if (ivSz == GCM_NONCE_MID_SZ) { + XMEMCPY(counter, iv, GCM_NONCE_MID_SZ); + counter[AES_BLOCK_SIZE - 1] = 1; + } + else { + GHASH(&aes->gcm, NULL, 0, iv, ivSz, counter, AES_BLOCK_SIZE); + GMULT(counter, aes->gcm.H); + } + + __asm__ __volatile__ ( + "LD1 {v16.16b}, %[h] \n" + "# v23 = 0x00000000000000870000000000000087 reflected 0xe1.... \n" + "MOVI v23.16b, #0x87 \n" + "EOR v17.16b, v17.16b, v17.16b \n" + "USHR v23.2d, v23.2d, #56 \n" + "CBZ %w[aSz], 120f \n" + + "MOV w12, %w[aSz] \n" + + "# GHASH AAD \n" + "CMP x12, #64 \n" + "BLT 115f \n" + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "114: \n" + "LD1 {v18.2d-v21.2d}, [%[aad]], #64 \n" + "SUB x12, x12, #64 \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v30.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, 
v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v30.16b, #8 \n" + "PMULL2 v14.1q, v30.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "CMP x12, #64 \n" + "BGE 114b \n" + "CBZ x12, 120f \n" + "115: \n" + "CMP x12, #16 \n" + "BLT 112f \n" + "111: \n" + "LD1 {v15.2d}, [%[aad]], #16 \n" + "SUB x12, x12, #16 \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "CMP x12, #16 \n" + "BGE 111b \n" + "CBZ x12, 120f \n" + "112: \n" + "# Partial AAD \n" + "EOR v15.16b, v15.16b, v15.16b \n" + "MOV x14, x12 \n" + "ST1 {v15.2d}, [%[scratch]] \n" + "113: \n" + "LDRB w13, [%[aad]], #1 \n" + "STRB w13, [%[scratch]], #1 \n" + "SUB x14, x14, #1 \n" + "CBNZ x14, 113b \n" + "SUB %[scratch], %[scratch], x12 \n" + "LD1 {v15.2d}, [%[scratch]] \n" + "RBIT v15.16b, v15.16b \n" + "EOR v17.16b, v17.16b, v15.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "120: \n" + + "# Decrypt ciphertext and GHASH ciphertext \n" + "LDR w12, [%[ctr], #12] \n" + "MOV w11, %w[sz] \n" + "REV w12, w12 \n" + "CMP w11, #64 \n" + "BLT 80f \n" + "CMP %w[aSz], #64 \n" + "BGE 82f \n" + + "# Calculate H^[1-4] - GMULT partials \n" + "# Square H => H^2 \n" + "PMULL2 v19.1q, v16.2d, v16.2d \n" + "PMULL v18.1q, v16.1d, v16.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v24.16b, v18.16b, v19.16b \n" + "# Multiply H and H^2 => H^3 \n" + "PMULL v18.1q, v24.1d, v16.1d \n" + "PMULL2 v19.1q, v24.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v24.1d, v20.1d \n" + "PMULL2 v20.1q, v24.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 
v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v25.16b, v18.16b, v20.16b \n" + "# Square H^2 => H^4 \n" + "PMULL2 v19.1q, v24.2d, v24.2d \n" + "PMULL v18.1q, v24.1d, v24.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v26.16b, v18.16b, v19.16b \n" + "82: \n" + "# Should we do 8 blocks at a time? \n" + "CMP w11, #512 \n" + "BLT 80f \n" + + "# Calculate H^[5-8] - GMULT partials \n" + "# Multiply H and H^4 => H^5 \n" + "PMULL v18.1q, v26.1d, v16.1d \n" + "PMULL2 v19.1q, v26.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v26.1d, v20.1d \n" + "PMULL2 v20.1q, v26.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v4.16b, v18.16b, v20.16b \n" + "# Square H^3 - H^6 \n" + "PMULL2 v19.1q, v25.2d, v25.2d \n" + "PMULL v18.1q, v25.1d, v25.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v9.16b, v18.16b, v19.16b \n" + "# Multiply H and H^6 => H^7 \n" + "PMULL v18.1q, v9.1d, v16.1d \n" + "PMULL2 v19.1q, v9.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v9.1d, v20.1d \n" + "PMULL2 v20.1q, v9.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v10.16b, v18.16b, v20.16b \n" + "# Square H^4 => H^8 \n" + "PMULL2 v19.1q, v26.2d, v26.2d \n" + "PMULL v18.1q, v26.1d, v26.1d \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v19.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v11.16b, v18.16b, v19.16b \n" + + "# First decrypt - no GHASH \n" + "LDR q1, [%[Key]] \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, 
v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b 
\n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #192] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #208] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + + "81: \n" + "LDR q1, [%[Key]] \n" + "# Calculate 
next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "LD1 {v5.2d}, [%[ctr]] \n" + "ADD w14, w12, #2 \n" + "MOV v6.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v7.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v8.16b, v5.16b \n" + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "REV w15, w15 \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "REV w14, w14 \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "REV w13, w13 \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "REV w16, w12 \n" + "MOV v5.S[3], w15 \n" + "MOV v6.S[3], w14 \n" + "MOV v7.S[3], w13 \n" + "MOV v8.S[3], w16 \n" + "# Calculate next 4 counters (+5-8) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v5.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v5.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v5.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v5.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "REV w15, w15 \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "REV w14, w14 \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "REV w13, w13 \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 8 counters \n" + "LDR q22, [%[Key], #16] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "LDR q1, [%[Key], #32] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, 
v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "LDR q22, [%[Key], #48] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "PMULL v3.1q, v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" + "LDR q1, [%[Key], #64] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" +#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 v12.1q, v12.2d, v11.2d \n" + "LDR q22, [%[Key], #80] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" +#ifdef 
WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #128 \n" + "LDR q1, [%[Key], #96] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #112] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #128] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #144] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, [%[Key], #160] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q22, [%[Key], #176] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LDR q1, 
[%[Key], #192] \n" + "AESE v5.16b, v22.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v22.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v22.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v22.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v22.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v22.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v22.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v22.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v12.2d-v15.2d}, [%[input]], #64 \n" + "LDP q22, q31, [%[Key], #208] \n" + "AESE v5.16b, v1.16b \n" + "AESMC v5.16b, v5.16b \n" + "AESE v6.16b, v1.16b \n" + "AESMC v6.16b, v6.16b \n" + "AESE v7.16b, v1.16b \n" + "AESMC v7.16b, v7.16b \n" + "AESE v8.16b, v1.16b \n" + "AESMC v8.16b, v8.16b \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v5.16b, v22.16b \n" + "EOR v5.16b, v5.16b, v31.16b \n" + "AESE v6.16b, v22.16b \n" + "EOR v6.16b, v6.16b, v31.16b \n" + "AESE v7.16b, v22.16b \n" + "EOR v7.16b, v7.16b, v31.16b \n" + "AESE v8.16b, v22.16b \n" + "EOR v8.16b, v8.16b, v31.16b \n" + "AESE v27.16b, v22.16b \n" + "EOR v27.16b, v27.16b, v31.16b \n" + "AESE v28.16b, v22.16b \n" + "EOR v28.16b, v28.16b, v31.16b \n" + "AESE v29.16b, v22.16b \n" + "EOR v29.16b, v29.16b, v31.16b \n" + "AESE v30.16b, v22.16b \n" + "EOR v30.16b, v30.16b, v31.16b \n" + + "# XOR in input \n" + "EOR v5.16b, v5.16b, v12.16b \n" + "EOR v6.16b, v6.16b, v13.16b \n" + "EOR v7.16b, v7.16b, v14.16b \n" + "EOR v8.16b, v8.16b, v15.16b \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "ST1 {v5.2d-v8.2d}, [%[out]], #64 \n \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + + "CMP w11, #128 \n" + "BGE 81b \n" + + "# GHASH - 8 blocks \n" + "RBIT v12.16b, v12.16b \n" + "RBIT v13.16b, v13.16b \n" + "RBIT v14.16b, v14.16b \n" + "RBIT v15.16b, v15.16b \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v12.16b, v12.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v3.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v3.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v2.1q, v20.1d, v24.1d \n" + "PMULL2 v3.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v3.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v3.16b \n" +#else + "EOR v20.16b, v20.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v2.1q, v19.1d, v25.1d \n" + "PMULL2 v3.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v3.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v3.16b \n" +#else + "EOR v19.16b, v19.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL 
v2.1q, v18.1d, v26.1d \n" + "PMULL2 v3.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v3.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v3.16b \n" +#else + "EOR v18.16b, v18.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^5 \n" + "PMULL v2.1q, v15.1d, v4.1d \n" + "PMULL2 v3.1q, v15.2d, v4.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v15.16b, v15.16b, v15.16b, #8 \n" + "PMULL v3.1q, v15.1d, v4.1d \n" + "PMULL2 v15.1q, v15.2d, v4.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v15.16b, v3.16b \n" +#else + "EOR v15.16b, v15.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^6 \n" + "PMULL v2.1q, v14.1d, v9.1d \n" + "PMULL2 v3.1q, v14.2d, v9.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v14.16b, v14.16b, v14.16b, #8 \n" + "PMULL v3.1q, v14.1d, v9.1d \n" + "PMULL2 v14.1q, v14.2d, v9.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v14.16b, v3.16b \n" +#else + "EOR v14.16b, v14.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^7 \n" + "PMULL v2.1q, v13.1d, v10.1d \n" + "PMULL2 v3.1q, v13.2d, v10.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v13.16b, v13.16b, v13.16b, #8 \n" + "PMULL v3.1q, v13.1d, v10.1d \n" + "PMULL2 v13.1q, v13.2d, v10.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v13.16b, v3.16b \n" +#else + "EOR v13.16b, v13.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v13.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^8 \n" + "PMULL v2.1q, v12.1d, v11.1d \n" + "PMULL2 v3.1q, v12.2d, v11.2d \n" + "EOR v17.16b, v17.16b, v2.16b \n" + "EOR v0.16b, v0.16b, v3.16b \n" + "EXT v12.16b, v12.16b, v12.16b, #8 \n" + "PMULL v3.1q, v12.1d, v11.1d \n" + "PMULL2 v12.1q, v12.2d, v11.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v12.16b, v3.16b \n" +#else + "EOR v12.16b, v12.16b, v3.16b \n" + "EOR v31.16b, v31.16b, v12.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v3.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v2.1q, v0.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v3.16b, v3.16b, v31.16b, v2.16b \n" +#else + "EOR v3.16b, v3.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v3.16b, v3.16b, v2.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v2.1q, v3.2d, v23.2d \n" + "MOV v17.D[1], v3.D[0] \n" + "EOR v17.16b, v17.16b, v2.16b \n" + + "80: \n" + "LD1 {v22.2d}, [%[ctr]] \n" + "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" + "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" + "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" + "LD1 {v12.2d-v13.2d}, [%[Key]], #32 \n" + "LD1 {v14.2d-v15.2d}, [%[Key]] \n" + "# Can we do 4 blocks at a time? 
\n" + "CMP w11, #64 \n" + "BLT 10f \n" + + "# First decrypt - no GHASH \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "REV w15, w15 \n" + "REV w14, w14 \n" + "REV w13, w13 \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v12.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v12.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v12.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v12.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + "AESE v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v13.16b \n" + 
"AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v14.16b \n" + "EOR v27.16b, v27.16b, v15.16b \n" + "AESE v28.16b, v14.16b \n" + "EOR v28.16b, v28.16b, v15.16b \n" + "AESE v29.16b, v14.16b \n" + "EOR v29.16b, v29.16b, v15.16b \n" + "AESE v30.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + + "# XOR in input \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "# Store cipher text \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BLT 12f \n" + + "11: \n" + "# Calculate next 4 counters (+1-4) \n" + "ADD w15, w12, #1 \n" + "MOV v27.16b, v22.16b \n" + "ADD w14, w12, #2 \n" + "MOV v28.16b, v22.16b \n" + "ADD w13, w12, #3 \n" + "MOV v29.16b, v22.16b \n" + "ADD w12, w12, #4 \n" + "MOV v30.16b, v22.16b \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "REV w15, w15 \n" + "RBIT v19.16b, v19.16b \n" + "REV w14, w14 \n" + "RBIT v20.16b, v20.16b \n" + "REV w13, w13 \n" + "RBIT v21.16b, v21.16b \n" + "REV w16, w12 \n" + "MOV v27.S[3], w15 \n" + "MOV v28.S[3], w14 \n" + "MOV v29.S[3], w13 \n" + "MOV v30.S[3], w16 \n" + + "# Encrypt 4 counters \n" + "AESE v27.16b, v1.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "AESE v28.16b, v1.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "AESE v29.16b, v1.16b \n" + "AESMC v29.16b, v29.16b \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "AESE v30.16b, v1.16b \n" + "AESMC v30.16b, v30.16b \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "AESE v27.16b, v2.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "AESE v28.16b, v2.16b \n" + "AESMC v28.16b, v28.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "AESE v29.16b, v2.16b \n" + "AESMC v29.16b, v29.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v30.16b, v2.16b \n" + "AESMC v30.16b, v30.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "AESE v27.16b, v3.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" + "AESE v28.16b, v3.16b \n" + "AESMC v28.16b, v28.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v29.16b, v3.16b \n" + "AESMC v29.16b, v29.16b \n" + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "AESE v30.16b, v3.16b \n" + "AESMC v30.16b, v30.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v27.16b, v4.16b \n" + "AESMC v27.16b, v27.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "AESE v28.16b, v4.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" + "AESE v29.16b, v4.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v4.16b \n" + "AESMC v30.16b, v30.16b \n" + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, 
v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "AESE v27.16b, v5.16b \n" + "AESMC v27.16b, v27.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "AESE v28.16b, v5.16b \n" + "AESMC v28.16b, v28.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "AESE v29.16b, v5.16b \n" + "AESMC v29.16b, v29.16b \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" + "AESE v30.16b, v5.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "SUB w11, w11, #64 \n" + "AESE v27.16b, v6.16b \n" + "AESMC v27.16b, v27.16b \n" + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "AESE v28.16b, v6.16b \n" + "AESMC v28.16b, v28.16b \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" + "AESE v29.16b, v6.16b \n" + "AESMC v29.16b, v29.16b \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v30.16b, v6.16b \n" + "AESMC v30.16b, v30.16b \n" +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "AESE v27.16b, v7.16b \n" + "AESMC v27.16b, v27.16b \n" + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "AESE v28.16b, v7.16b \n" + "AESMC v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "AESE v29.16b, v7.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v7.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v8.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v8.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v8.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v8.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v9.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v9.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v9.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v9.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v10.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v10.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v10.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v10.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v11.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v11.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v11.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v11.16b \n" + "AESMC v30.16b, v30.16b \n" + "# Load plaintext \n" + "LD1 {v18.2d-v21.2d}, [%[input]], #64 \n" + "AESE v27.16b, v12.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v12.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v12.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v12.16b \n" + "AESMC v30.16b, v30.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + "AESE v27.16b, v13.16b \n" + "AESMC v27.16b, v27.16b \n" + "AESE v28.16b, v13.16b \n" + "AESMC v28.16b, v28.16b \n" + "AESE v29.16b, v13.16b \n" + "AESMC v29.16b, v29.16b \n" + "AESE v30.16b, v13.16b \n" + "AESMC v30.16b, v30.16b \n" + "AESE v27.16b, v14.16b \n" + "EOR v27.16b, v27.16b, v15.16b \n" + "AESE v28.16b, v14.16b \n" + "EOR v28.16b, v28.16b, v15.16b \n" + "AESE v29.16b, v14.16b \n" + "EOR v29.16b, v29.16b, v15.16b \n" + "AESE v30.16b, v14.16b \n" + "EOR v30.16b, v30.16b, v15.16b \n" + + "# XOR in input \n" + "EOR v27.16b, v27.16b, v18.16b \n" + "EOR v28.16b, v28.16b, v19.16b \n" + "EOR v29.16b, v29.16b, 
v20.16b \n" + "EOR v30.16b, v30.16b, v21.16b \n" + "# Store cipher text \n" + "ST1 {v27.2d-v30.2d}, [%[out]], #64 \n \n" + "CMP w11, #64 \n" + "BGE 11b \n" + + "12: \n" + "# GHASH - 4 blocks \n" + "RBIT v18.16b, v18.16b \n" + "RBIT v19.16b, v19.16b \n" + "RBIT v20.16b, v20.16b \n" + "RBIT v21.16b, v21.16b \n" + "EOR v18.16b, v18.16b, v17.16b \n" + "# x[0-2] = C * H^1 \n" + "PMULL v17.1q, v21.1d, v16.1d \n" + "PMULL2 v0.1q, v21.2d, v16.2d \n" + "EXT v21.16b, v21.16b, v21.16b, #8 \n" + "PMULL v31.1q, v21.1d, v16.1d \n" + "PMULL2 v15.1q, v21.2d, v16.2d \n" + "EOR v31.16b, v31.16b, v15.16b \n" + "# x[0-2] += C * H^2 \n" + "PMULL v14.1q, v20.1d, v24.1d \n" + "PMULL2 v15.1q, v20.2d, v24.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v20.16b, v20.16b, v20.16b, #8 \n" + "PMULL v15.1q, v20.1d, v24.1d \n" + "PMULL2 v20.1q, v20.2d, v24.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v20.16b, v15.16b \n" +#else + "EOR v20.16b, v20.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v20.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^3 \n" + "PMULL v14.1q, v19.1d, v25.1d \n" + "PMULL2 v15.1q, v19.2d, v25.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v19.16b, v19.16b, v19.16b, #8 \n" + "PMULL v15.1q, v19.1d, v25.1d \n" + "PMULL2 v19.1q, v19.2d, v25.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v19.16b, v15.16b \n" +#else + "EOR v19.16b, v19.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v19.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# x[0-2] += C * H^4 \n" + "PMULL v14.1q, v18.1d, v26.1d \n" + "PMULL2 v15.1q, v18.2d, v26.2d \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n" + "EXT v18.16b, v18.16b, v18.16b, #8 \n" + "PMULL v15.1q, v18.1d, v26.1d \n" + "PMULL2 v18.1q, v18.2d, v26.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v31.16b, v31.16b, v18.16b, v15.16b \n" +#else + "EOR v18.16b, v18.16b, v15.16b \n" + "EOR v31.16b, v31.16b, v18.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "# Reduce X = x[0-2] \n" + "EXT v15.16b, v17.16b, v0.16b, #8 \n" + "PMULL2 v14.1q, v0.2d, v23.2d \n" +#ifdef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR3 v15.16b, v15.16b, v31.16b, v14.16b \n" +#else + "EOR v15.16b, v15.16b, v31.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ +#ifndef WOLFSSL_ARMASM_CRYPTO_SHA3 + "EOR v15.16b, v15.16b, v14.16b \n" +#endif /* WOLFSSL_ARMASM_CRYPTO_SHA3 */ + "PMULL2 v14.1q, v15.2d, v23.2d \n" + "MOV v17.D[1], v15.D[0] \n" + "EOR v17.16b, v17.16b, v14.16b \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + + "10: \n" + "CBZ w11, 30f \n" + "CMP w11, #16 \n" + "BLT 20f \n" + "LD1 {v14.2d, v15.2d}, [%[Key]] \n" + "# Decrypt first block for GHASH \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "LD1 {v28.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b 
\n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n \n" + "EOR v0.16b, v0.16b, v28.16b \n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + + "# When only one full block to decrypt go straight to GHASH \n" + "CMP w11, 16 \n" + "BLT 1f \n" + + "# Interweave GHASH and decrypt if more then 1 block \n" + "2: \n" + "RBIT v28.16b, v28.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v28.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "SUB w11, w11, #16 \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "LD1 {v28.2d}, [%[input]], #16 \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n \n" + "EOR v0.16b, v0.16b, v28.16b \n \n" + "ST1 {v0.2d}, [%[out]], #16 \n" + "CMP w11, #16 \n" + "BGE 2b \n" + + "# GHASH on last block \n" + "1: \n" + "RBIT v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "MOV v18.D[1], v21.D[0] \n" + "EOR v17.16b, v18.16b, v20.16b \n" + + "20: \n" + "CBZ w11, 30f \n" + "EOR v31.16b, v31.16b, v31.16b \n" + "MOV x15, x11 \n" + "ST1 {v31.2d}, [%[scratch]] \n" + "23: \n" + "LDRB w14, [%[input]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 23b \n" + "SUB %[scratch], %[scratch], x11 \n" + "LD1 {v31.2d}, [%[scratch]] \n" + "RBIT v31.16b, v31.16b \n" + "ADD w12, w12, #1 \n" + "MOV v0.16b, v22.16b \n" + "REV w13, w12 \n" + "MOV v0.S[3], w13 \n" + "EOR v17.16b, v17.16b, v31.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + 
"AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "RBIT v31.16b, v31.16b \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n \n" + "EOR v0.16b, v0.16b, v31.16b \n \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV x15, x11 \n" + "24: \n" + "LDRB w14, [%[scratch]], #1 \n" + "STRB w14, [%[out]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 24b \n" + "SUB %[scratch], %[scratch], x11 \n" + + "30: \n" + "# store current counter value at the end \n" + "REV w13, w12 \n" + "MOV v22.S[3], w13 \n" + "LD1 {v0.16b}, [%[ctr]] \n" + "ST1 {v22.16b}, [%[ctr]] \n" + + "LSL %x[aSz], %x[aSz], #3 \n" + "LSL %x[sz], %x[sz], #3 \n" + "MOV v28.d[0], %x[aSz] \n" + "MOV v28.d[1], %x[sz] \n" + "REV64 v28.16b, v28.16b \n" + "RBIT v28.16b, v28.16b \n" + "EOR v17.16b, v17.16b, v28.16b \n" + "PMULL v18.1q, v17.1d, v16.1d \n" + "PMULL2 v19.1q, v17.2d, v16.2d \n" + "EXT v20.16b, v16.16b, v16.16b, #8 \n" + "AESE v0.16b, v1.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL v21.1q, v17.1d, v20.1d \n" + "PMULL2 v20.1q, v17.2d, v20.2d \n" + "AESE v0.16b, v2.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v20.16b, v20.16b, v21.16b \n" + "AESE v0.16b, v3.16b \n" + "AESMC v0.16b, v0.16b \n" + "EXT v21.16b, v18.16b, v19.16b, #8 \n" + "AESE v0.16b, v4.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v5.16b \n" + "AESMC v0.16b, v0.16b \n" + "# Reduce \n" + "PMULL2 v20.1q, v19.2d, v23.2d \n" + "AESE v0.16b, v6.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v21.16b, v21.16b, v20.16b \n" + "AESE v0.16b, v7.16b \n" + "AESMC v0.16b, v0.16b \n" + "PMULL2 v20.1q, v21.2d, v23.2d \n" + "AESE v0.16b, v8.16b \n" + "AESMC v0.16b, v0.16b \n" + "MOV v18.D[1], v21.D[0] \n" + "AESE v0.16b, v9.16b \n" + "AESMC v0.16b, v0.16b \n" + "EOR v17.16b, v18.16b, v20.16b \n" + "AESE v0.16b, v10.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v11.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v12.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v13.16b \n" + "AESMC v0.16b, v0.16b \n" + "AESE v0.16b, v14.16b \n" + "EOR v0.16b, v0.16b, v15.16b \n \n" + "RBIT v17.16b, v17.16b \n" + "EOR v0.16b, v0.16b, v17.16b \n \n" + "CMP %w[tagSz], #16 \n" + "BNE 40f \n" + "LD1 {v1.2d}, [%[tag]] \n" + "B 41f \n" + "40: \n" + "EOR v1.16b, v1.16b, v1.16b \n" + "MOV x15, %x[tagSz] \n" + "ST1 {v1.2d}, [%[scratch]] \n" + "43: \n" + "LDRB w14, [%[tag]], #1 \n" + "STRB w14, [%[scratch]], #1 \n" + "SUB x15, x15, #1 \n" + "CBNZ x15, 43b \n" + "SUB %[scratch], %[scratch], %x[tagSz] \n" + "LD1 {v1.2d}, [%[scratch]] \n" + "ST1 {v0.2d}, [%[scratch]] \n" + "MOV w14, #16 \n" + "SUB w14, w14, %w[tagSz] \n" + "ADD %[scratch], %[scratch], %x[tagSz] \n" + "44: \n" + "STRB wzr, [%[scratch]], #1 \n" + "SUB w14, w14, #1 \n" + "CBNZ w14, 44b \n" + "SUB %[scratch], %[scratch], #16 \n" + "LD1 {v0.2d}, [%[scratch]] \n" + "41: \n" + "EOR v0.16b, v0.16b, v1.16b \n" + "MOV v1.D[0], v0.D[1] \n" 
+ "EOR v0.8b, v0.8b, v1.8b \n" + "MOV %x[ret], v0.D[0] \n" + "CMP %x[ret], #0 \n" + "MOV w11, #-180 \n" + "CSETM %w[ret], ne \n" + "AND %w[ret], %w[ret], w11 \n" + + : [out] "+r" (out), [input] "+r" (in), [Key] "+r" (keyPt), + [aSz] "+r" (authInSz), [sz] "+r" (sz), [aad] "+r" (authIn), + [ret] "+r" (ret) + : [ctr] "r" (ctr), [scratch] "r" (scratch), + [h] "m" (aes->gcm.H), [tag] "r" (authTag), [tagSz] "r" (authTagSz) + : "cc", "memory", "w11", "w12", "w13", "w14", "w15", "w16", + "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + + return ret; +} +#endif /* WOLFSSL_AES_256 */ /* * Check tag and decrypt data using AES with GCM mode. * aes: Aes structure having already been set with set key function @@ -2652,295 +12558,44 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz, * authIn: additional data buffer * authInSz: size of additional data buffer */ -int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, - const byte* iv, word32 ivSz, - const byte* authTag, word32 authTagSz, - const byte* authIn, word32 authInSz) +int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz, + const byte* iv, word32 ivSz, const byte* authTag, word32 authTagSz, + const byte* authIn, word32 authInSz) { - word32 blocks = sz / AES_BLOCK_SIZE; - word32 partial = sz % AES_BLOCK_SIZE; - const byte* c = in; - byte* p = out; - byte counter[AES_BLOCK_SIZE]; - byte initialCounter[AES_BLOCK_SIZE]; - byte *ctr ; - byte scratch[AES_BLOCK_SIZE]; - - ctr = counter ; - /* sanity checks */ - if (aes == NULL || iv == NULL || (sz != 0 && (in == NULL || out == NULL)) || - authTag == NULL || authTagSz > AES_BLOCK_SIZE || authTagSz == 0 || - ivSz == 0) { + if ((aes == NULL) || (iv == NULL) || (authTag == NULL) || + (authTagSz > AES_BLOCK_SIZE) || (authTagSz == 0) || (ivSz == 0) || + ((sz != 0) && ((in == NULL) || (out == NULL)))) { WOLFSSL_MSG("a NULL parameter passed in when size is larger than 0"); return BAD_FUNC_ARG; } - XMEMSET(initialCounter, 0, AES_BLOCK_SIZE); - if (ivSz == GCM_NONCE_MID_SZ) { - XMEMCPY(initialCounter, iv, ivSz); - initialCounter[AES_BLOCK_SIZE - 1] = 1; - } - else { - GHASH(&aes->gcm, NULL, 0, iv, ivSz, initialCounter, AES_BLOCK_SIZE); - GMULT(initialCounter, aes->gcm.H); - } - XMEMCPY(ctr, initialCounter, AES_BLOCK_SIZE); - - /* Calculate the authTag again using the received auth data and the - * cipher text. 
*/ - { - byte Tprime[AES_BLOCK_SIZE]; - byte EKY0[AES_BLOCK_SIZE]; - - GHASH(&aes->gcm, authIn, authInSz, in, sz, Tprime, sizeof(Tprime)); - GMULT(Tprime, aes->gcm.H); - wc_AesEncrypt(aes, ctr, EKY0); - xorbuf(Tprime, EKY0, sizeof(Tprime)); - - if (ConstantCompare(authTag, Tprime, authTagSz) != 0) { - return AES_GCM_AUTH_E; - } - } - - /* do as many blocks as possible */ - if (blocks > 0) { - /* pointer needed because it is incremented when read, causing - * an issue with call to encrypt/decrypt leftovers */ - byte* keyPt = (byte*)aes->key; - switch(aes->rounds) { + switch (aes->rounds) { #ifdef WOLFSSL_AES_128 - case 10: /* AES 128 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - - "#Create vector with the value 1 \n" - "MOVI v14.16b, #1 \n" - "USHR v14.2d, v14.2d, #56 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "EOR v13.16b, v13.16b, v13.16b \n" - "EXT v14.16b, v14.16b, v13.16b, #8 \n" - - "LD1 {v9.2d-v11.2d}, [%[Key]], #48 \n" - "LD1 {v12.2d}, [%[ctr]] \n" - "LD1 {v13.2d}, [%[input]], #16 \n" - - "1: \n" - "REV64 v12.16b, v12.16b \n" /* network order */ - "EXT v12.16b, v12.16b, v12.16b, #8 \n" - "ADD v12.4s, v12.4s, v14.4s \n" /* add 1 to counter */ - "EXT v12.16b, v12.16b, v12.16b, #8 \n" - "REV64 v12.16b, v12.16b \n" /* revert from network order */ - "MOV v0.16b, v12.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "SUB w11, w11, #1 \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "EOR v0.16b, v0.16b, v11.16b \n" - - "EOR v0.16b, v0.16b, v13.16b \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, 2f \n" - "LD1 {v13.2d}, [%[input]], #16 \n" - "B 1b \n" - - "2: \n" - "#store current counter value at the end \n" - "ST1 {v12.16b}, [%[ctrOut]] \n" - - :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c) - :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), - [input] "3" (c) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14" - ); - break; + case 10: + return Aes128GcmDecrypt(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); #endif #ifdef WOLFSSL_AES_192 - case 12: /* AES 192 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - - "#Create vector with the value 1 \n" - "MOVI v16.16b, #1 \n" - "USHR v16.2d, v16.2d, #56 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "EOR v14.16b, v14.16b, v14.16b \n" - "EXT v16.16b, v16.16b, v14.16b, #8 \n" - - "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n" - "LD1 {v13.2d}, [%[Key]], #16 \n" - "LD1 {v14.2d}, [%[ctr]] \n" - "LD1 {v15.2d}, [%[input]], #16 \n" - - "1: \n" - "REV64 v14.16b, v14.16b \n" /* network order */ - "EXT v14.16b, v14.16b, v14.16b, #8 \n" - "ADD v14.4s, v14.4s, v16.4s \n" /* add 1 to counter */ - "EXT v14.16b, v14.16b, v14.16b, #8 \n" - "REV64 v14.16b, v14.16b \n" /* revert from network order */ - "MOV v0.16b, v14.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, 
v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "SUB w11, w11, #1 \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "EOR v0.16b, v0.16b, v13.16b \n" - - "EOR v0.16b, v0.16b, v15.16b \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, 2f \n" - "LD1 {v15.2d}, [%[input]], #16 \n" - "B 1b \n" - - "2: \n" - "#store current counter value at the end \n" - "ST1 {v14.2d}, [%[ctrOut]] \n" - - :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c) - :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), - [input] "3" (c) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16" - ); - break; -#endif /* WOLFSSL_AES_192 */ + case 12: + return Aes192GcmDecrypt(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); +#endif #ifdef WOLFSSL_AES_256 - case 14: /* AES 256 BLOCK */ - __asm__ __volatile__ ( - "MOV w11, %w[blocks] \n" - "LD1 {v1.2d-v4.2d}, [%[Key]], #64 \n" - - "#Create vector with the value 1 \n" - "MOVI v18.16b, #1 \n" - "USHR v18.2d, v18.2d, #56 \n" - "LD1 {v5.2d-v8.2d}, [%[Key]], #64 \n" - "EOR v19.16b, v19.16b, v19.16b \n" - "EXT v18.16b, v18.16b, v19.16b, #8 \n" - - "LD1 {v9.2d-v12.2d}, [%[Key]], #64 \n" - "LD1 {v13.2d-v15.2d}, [%[Key]], #48 \n" - "LD1 {v17.2d}, [%[ctr]] \n" - "LD1 {v16.2d}, [%[input]], #16 \n" - - "1: \n" - "REV64 v17.16b, v17.16b \n" /* network order */ - "EXT v17.16b, v17.16b, v17.16b, #8 \n" - "ADD v17.4s, v17.4s, v18.4s \n" /* add 1 to counter */ - "EXT v17.16b, v17.16b, v17.16b, #8 \n" - "REV64 v17.16b, v17.16b \n" /* revert from network order */ - "MOV v0.16b, v17.16b \n" - "AESE v0.16b, v1.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v2.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v3.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v4.16b \n" - "AESMC v0.16b, v0.16b \n" - "SUB w11, w11, #1 \n" - "AESE v0.16b, v5.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v6.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v7.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v8.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v9.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v10.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v11.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v12.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v13.16b \n" - "AESMC v0.16b, v0.16b \n" - "AESE v0.16b, v14.16b \n" - "EOR v0.16b, v0.16b, v15.16b \n" - - "EOR v0.16b, v0.16b, v16.16b \n" - "ST1 {v0.2d}, [%[out]], #16 \n" - - "CBZ w11, 2f \n" - "LD1 {v16.2d}, [%[input]], #16 \n" - "B 1b \n" - - "2: \n" - "#store current counter value at the end \n" - "ST1 {v17.2d}, [%[ctrOut]] \n" - - :[out] "=r" (p), "=r" (keyPt), [ctrOut] "=r" (ctr), "=r" (c) - :"0" (p), [Key] "1" (keyPt), [ctr] "2" (ctr), [blocks] "r" (blocks), - [input] "3" (c) - : "cc", "memory", "w11", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", - "v16", "v17", "v18", "v19" - ); - break; -#endif /* WOLFSSL_AES_256 */ + case 14: + return Aes256GcmDecrypt(aes, out, in, sz, iv, ivSz, + authTag, authTagSz, authIn, authInSz); +#endif default: - WOLFSSL_MSG("Bad AES-GCM round 
value"); + WOLFSSL_MSG("AES-GCM invalid round number"); return BAD_FUNC_ARG; - } } - if (partial != 0) { - IncrementGcmCounter(ctr); - wc_AesEncrypt(aes, ctr, scratch); - - /* check if pointer is null after main AES-GCM blocks - * helps static analysis */ - if (p == NULL || c == NULL) { - return BAD_STATE_E; - } - xorbuf(scratch, c, partial); - XMEMCPY(p, scratch, partial); - } - return 0; } #endif /* HAVE_AES_DECRYPT */ + +/* END script replace AES-GCM Aarch64 with hardware crypto. */ + #endif /* HAVE_AESGCM */ diff --git a/wolfcrypt/test/test.c b/wolfcrypt/test/test.c index cc8280df7..bfbb81875 100644 --- a/wolfcrypt/test/test.c +++ b/wolfcrypt/test/test.c @@ -12258,10 +12258,10 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t aesgcm_test(void) #if defined(WOLFSSL_ASYNC_CRYPT) ret = wc_AsyncWait(ret, &dec->asyncDev, WC_ASYNC_FLAG_NONE); #endif - if (ret != 0) - ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); if (XMEMCMP(large_input, large_outdec, BENCH_AESGCM_LARGE)) ERROR_OUT(WC_TEST_RET_ENC_NC, out); + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); #endif /* HAVE_AES_DECRYPT */ #endif /* BENCH_AESGCM_LARGE */ #if defined(ENABLE_NON_12BYTE_IV_TEST) && defined(WOLFSSL_AES_256) @@ -12443,6 +12443,38 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t aesgcm_test(void) ERROR_OUT(WC_TEST_RET_ENC_NC, out); #endif /* HAVE_AES_DECRYPT */ + /* Large buffer test */ +#ifdef BENCH_AESGCM_LARGE + wc_AesGcmSetKey(enc, k2, k3Sz); + wc_AesGcmSetKey(dec, k2, k3Sz); + /* setup test buffer */ + for (alen=0; alenasyncDev, WC_ASYNC_FLAG_NONE); +#endif + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + +#ifdef HAVE_AES_DECRYPT + ret = wc_AesGcmDecrypt(dec, large_outdec, large_output, + BENCH_AESGCM_LARGE, iv1, sizeof(iv1), resultT, + sizeof(t1), a, sizeof(a)); +#if defined(WOLFSSL_ASYNC_CRYPT) + ret = wc_AsyncWait(ret, &dec->asyncDev, WC_ASYNC_FLAG_NONE); +#endif + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + if (XMEMCMP(large_input, large_outdec, BENCH_AESGCM_LARGE)) + ERROR_OUT(WC_TEST_RET_ENC_NC, out); +#endif /* HAVE_AES_DECRYPT */ +#endif /* BENCH_AESGCM_LARGE */ + XMEMSET(resultT, 0, sizeof(resultT)); XMEMSET(resultC, 0, sizeof(resultC)); XMEMSET(resultP, 0, sizeof(resultP)); @@ -12475,6 +12507,38 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t aesgcm_test(void) if (XMEMCMP(p3, resultP, sizeof(p3))) ERROR_OUT(WC_TEST_RET_ENC_NC, out); #endif /* HAVE_AES_DECRYPT */ + + /* Large buffer test */ +#ifdef BENCH_AESGCM_LARGE + wc_AesGcmSetKey(enc, k3, k3Sz); + wc_AesGcmSetKey(dec, k3, k3Sz); + /* setup test buffer */ + for (alen=0; alenasyncDev, WC_ASYNC_FLAG_NONE); +#endif + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + +#ifdef HAVE_AES_DECRYPT + ret = wc_AesGcmDecrypt(dec, large_outdec, large_output, + BENCH_AESGCM_LARGE, iv1, sizeof(iv1), resultT, + sizeof(t1), a, sizeof(a)); +#if defined(WOLFSSL_ASYNC_CRYPT) + ret = wc_AsyncWait(ret, &dec->asyncDev, WC_ASYNC_FLAG_NONE); +#endif + if (ret != 0) + ERROR_OUT(WC_TEST_RET_ENC_EC(ret), out); + if (XMEMCMP(large_input, large_outdec, BENCH_AESGCM_LARGE)) + ERROR_OUT(WC_TEST_RET_ENC_NC, out); +#endif /* HAVE_AES_DECRYPT */ +#endif /* BENCH_AESGCM_LARGE */ #endif /* WOLFSSL_AES_128 */ #endif /* ENABLE_NON_12BYTE_IV_TEST */