diff --git a/wolfcrypt/src/aes.c b/wolfcrypt/src/aes.c index d61233650..47cd80edb 100755 --- a/wolfcrypt/src/aes.c +++ b/wolfcrypt/src/aes.c @@ -4550,195 +4550,194 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, __asm__ __volatile__ ( "vmovaps (%[pctr1]), %%xmm0\n\t" "vmovaps %[BSWAP_EPI64], %%xmm1\n\t" - "vpshufb %%xmm1, %%xmm0, %0\n\t" - "vpaddd %[ONE], %%xmm0, %1\n\t" - "vpshufb %%xmm1, %1, %1\n\t" - "vpaddd %[TWO], %%xmm0, %2\n\t" - "vpshufb %%xmm1, %2, %2\n\t" - "vpaddd %[THREE], %%xmm0, %3\n\t" - "vpshufb %%xmm1, %3, %3\n\t" - "vpaddd %[FOUR], %%xmm0, %4\n\t" - "vpshufb %%xmm1, %4, %4\n\t" - "vpaddd %[FIVE], %%xmm0, %5\n\t" - "vpshufb %%xmm1, %5, %5\n\t" - "vpaddd %[SIX], %%xmm0, %6\n\t" - "vpshufb %%xmm1, %6, %6\n\t" - "vpaddd %[SEVEN], %%xmm0, %7\n\t" - "vpshufb %%xmm1, %7, %7\n\t" + "vpshufb %%xmm1, %%xmm0, %%xmm4\n\t" + "vpaddd %[ONE], %%xmm0, %%xmm5\n\t" + "vpshufb %%xmm1, %%xmm5, %%xmm5\n\t" + "vpaddd %[TWO], %%xmm0, %%xmm6\n\t" + "vpshufb %%xmm1, %%xmm6, %%xmm6\n\t" + "vpaddd %[THREE], %%xmm0, %%xmm7\n\t" + "vpshufb %%xmm1, %%xmm7, %%xmm7\n\t" + "vpaddd %[FOUR], %%xmm0, %%xmm8\n\t" + "vpshufb %%xmm1, %%xmm8, %%xmm8\n\t" + "vpaddd %[FIVE], %%xmm0, %%xmm9\n\t" + "vpshufb %%xmm1, %%xmm9, %%xmm9\n\t" + "vpaddd %[SIX], %%xmm0, %%xmm10\n\t" + "vpshufb %%xmm1, %%xmm10, %%xmm10\n\t" + "vpaddd %[SEVEN], %%xmm0, %%xmm11\n\t" + "vpshufb %%xmm1, %%xmm11, %%xmm11\n\t" "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t" "vmovaps (%[KEY]), %%xmm1\n\t" "vmovaps %%xmm0, (%[pctr1])\n\t" - "vpxor %%xmm1, %0, %0\n\t" - "vpxor %%xmm1, %1, %1\n\t" - "vpxor %%xmm1, %2, %2\n\t" - "vpxor %%xmm1, %3, %3\n\t" - "vpxor %%xmm1, %4, %4\n\t" - "vpxor %%xmm1, %5, %5\n\t" - "vpxor %%xmm1, %6, %6\n\t" - "vpxor %%xmm1, %7, %7\n\t" + "vpxor %%xmm1, %%xmm4, %%xmm4\n\t" + "vpxor %%xmm1, %%xmm5, %%xmm5\n\t" + "vpxor %%xmm1, %%xmm6, %%xmm6\n\t" + "vpxor %%xmm1, %%xmm7, %%xmm7\n\t" + "vpxor %%xmm1, %%xmm8, %%xmm8\n\t" + "vpxor %%xmm1, %%xmm9, %%xmm9\n\t" + "vpxor %%xmm1, 
%%xmm10, %%xmm10\n\t" + "vpxor %%xmm1, %%xmm11, %%xmm11\n\t" "vmovaps 16(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 32(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 48(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 64(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, 
%1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 80(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 96(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 112(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, 
%6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 128(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 144(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "cmpl $11, %[nr]\n\t" "vmovaps 160(%[KEY]), %%xmm12\n\t" "jl L_enc128_enclast\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, 
%%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 176(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "cmpl $13, %[nr]\n\t" "vmovaps 192(%[KEY]), %%xmm12\n\t" "jl L_enc128_enclast\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 208(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + 
"vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 224(%[KEY]), %%xmm12\n\t" "\n" "L_enc128_enclast:\n\t" - "vaesenclast %%xmm12, %0, %0\n\t" - "vaesenclast %%xmm12, %1, %1\n\t" - "vpxor (%[in]), %0, %0\n\t" - "vpxor 16(%[in]), %1, %1\n\t" - "vmovdqu %0, (%[out])\n\t" - "vmovdqu %1, 16(%[out])\n\t" - "vaesenclast %%xmm12, %2, %2\n\t" - "vaesenclast %%xmm12, %3, %3\n\t" - "vpxor 32(%[in]), %2, %2\n\t" - "vpxor 48(%[in]), %3, %3\n\t" - "vmovdqu %2, 32(%[out])\n\t" - "vmovdqu %3, 48(%[out])\n\t" - "vaesenclast %%xmm12, %4, %4\n\t" - "vaesenclast %%xmm12, %5, %5\n\t" - "vpxor 64(%[in]), %4, %4\n\t" - "vpxor 80(%[in]), %5, %5\n\t" - "vmovdqu %4, 64(%[out])\n\t" - "vmovdqu %5, 80(%[out])\n\t" - "vaesenclast %%xmm12, %6, %6\n\t" - "vaesenclast %%xmm12, %7, %7\n\t" - "vpxor 96(%[in]), %6, %6\n\t" - "vpxor 112(%[in]), %7, %7\n\t" - "vmovdqu %6, 96(%[out])\n\t" - "vmovdqu %7, 112(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenclast %%xmm12, %%xmm5, %%xmm5\n\t" + "vpxor (%[in]), %%xmm4, %%xmm4\n\t" + "vpxor 16(%[in]), %%xmm5, %%xmm5\n\t" + "vmovdqu %%xmm4, (%[out])\n\t" + "vmovdqu %%xmm5, 16(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenclast %%xmm12, %%xmm7, %%xmm7\n\t" + "vpxor 32(%[in]), %%xmm6, %%xmm6\n\t" + "vpxor 48(%[in]), %%xmm7, %%xmm7\n\t" + "vmovdqu %%xmm6, 32(%[out])\n\t" + "vmovdqu %%xmm7, 48(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenclast %%xmm12, %%xmm9, %%xmm9\n\t" + "vpxor 64(%[in]), %%xmm8, %%xmm8\n\t" + "vpxor 80(%[in]), %%xmm9, %%xmm9\n\t" + "vmovdqu %%xmm8, 64(%[out])\n\t" + "vmovdqu %%xmm9, 80(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenclast %%xmm12, %%xmm11, %%xmm11\n\t" + "vpxor 96(%[in]), %%xmm10, %%xmm10\n\t" + "vpxor 112(%[in]), %%xmm11, %%xmm11\n\t" + "vmovdqu %%xmm10, 96(%[out])\n\t" + "vmovdqu %%xmm11, 112(%[out])\n\t" - : "=xr" (tmp1), "=xr" 
(tmp2), "=xr" (tmp3), "=xr" (tmp4), - "=xr" (tmp5), "=xr" (tmp6), "=xr" (tmp7), "=xr" (tmp8) + : : [KEY] "r" (KEY), [pctr1] "r" (pctr1), [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), [BSWAP_EPI64] "xrm" (BSWAP_EPI64), @@ -4747,6 +4746,8 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, [FIVE] "xrm" (FIVE), [SIX] "xrm" (SIX), [SEVEN] "xrm" (SEVEN), [EIGHT] "xrm" (EIGHT) : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm11", "xmm10", "xmm9", "xmm8", + "xmm7", "xmm6", "xmm5", "xmm4", "xmm0", "xmm1", "xmm3", "memory" ); @@ -4755,316 +4756,314 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, __asm__ __volatile__ ( "vmovaps (%[pctr1]), %%xmm0\n\t" "vmovaps %[BSWAP_EPI64], %%xmm1\n\t" - "vpshufb %%xmm1, %%xmm0, %0\n\t" - "vpaddd %[ONE], %%xmm0, %1\n\t" - "vpshufb %%xmm1, %1, %1\n\t" - "vpaddd %[TWO], %%xmm0, %2\n\t" - "vpshufb %%xmm1, %2, %2\n\t" - "vpaddd %[THREE], %%xmm0, %3\n\t" - "vpshufb %%xmm1, %3, %3\n\t" - "vpaddd %[FOUR], %%xmm0, %4\n\t" - "vpshufb %%xmm1, %4, %4\n\t" - "vpaddd %[FIVE], %%xmm0, %5\n\t" - "vpshufb %%xmm1, %5, %5\n\t" - "vpaddd %[SIX], %%xmm0, %6\n\t" - "vpshufb %%xmm1, %6, %6\n\t" - "vpaddd %[SEVEN], %%xmm0, %7\n\t" - "vpshufb %%xmm1, %7, %7\n\t" + "vpshufb %%xmm1, %%xmm0, %%xmm4\n\t" + "vpaddd %[ONE], %%xmm0, %%xmm5\n\t" + "vpshufb %%xmm1, %%xmm5, %%xmm5\n\t" + "vpaddd %[TWO], %%xmm0, %%xmm6\n\t" + "vpshufb %%xmm1, %%xmm6, %%xmm6\n\t" + "vpaddd %[THREE], %%xmm0, %%xmm7\n\t" + "vpshufb %%xmm1, %%xmm7, %%xmm7\n\t" + "vpaddd %[FOUR], %%xmm0, %%xmm8\n\t" + "vpshufb %%xmm1, %%xmm8, %%xmm8\n\t" + "vpaddd %[FIVE], %%xmm0, %%xmm9\n\t" + "vpshufb %%xmm1, %%xmm9, %%xmm9\n\t" + "vpaddd %[SIX], %%xmm0, %%xmm10\n\t" + "vpshufb %%xmm1, %%xmm10, %%xmm10\n\t" + "vpaddd %[SEVEN], %%xmm0, %%xmm11\n\t" + "vpshufb %%xmm1, %%xmm11, %%xmm11\n\t" "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t" "vmovaps (%[KEY]), %%xmm1\n\t" "vmovaps %%xmm0, (%[pctr1])\n\t" - "vpxor %%xmm1, %0, %0\n\t" - "vpxor %%xmm1, 
%1, %1\n\t" - "vpxor %%xmm1, %2, %2\n\t" - "vpxor %%xmm1, %3, %3\n\t" - "vpxor %%xmm1, %4, %4\n\t" - "vpxor %%xmm1, %5, %5\n\t" - "vpxor %%xmm1, %6, %6\n\t" - "vpxor %%xmm1, %7, %7\n\t" + "vpxor %%xmm1, %%xmm4, %%xmm4\n\t" + "vpxor %%xmm1, %%xmm5, %%xmm5\n\t" + "vpxor %%xmm1, %%xmm6, %%xmm6\n\t" + "vpxor %%xmm1, %%xmm7, %%xmm7\n\t" + "vpxor %%xmm1, %%xmm8, %%xmm8\n\t" + "vpxor %%xmm1, %%xmm9, %%xmm9\n\t" + "vpxor %%xmm1, %%xmm10, %%xmm10\n\t" + "vpxor %%xmm1, %%xmm11, %%xmm11\n\t" "vmovaps 16(%[KEY]), %%xmm12\n\t" "vmovdqu -128(%[out]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 112(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" "vpxor %[XV], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm2\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 32(%[KEY]), %%xmm12\n\t" "vmovdqu -112(%[out]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 96(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, 
%%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 48(%[KEY]), %%xmm12\n\t" "vmovdqu -96(%[out]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 80(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 64(%[KEY]), %%xmm12\n\t" "vmovdqu -80(%[out]), %%xmm1\n\t" - 
"vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 64(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 80(%[KEY]), %%xmm12\n\t" "vmovdqu -64(%[out]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 48(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc 
%%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 96(%[KEY]), %%xmm12\n\t" "vmovdqu -48(%[out]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 32(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 112(%[KEY]), %%xmm12\n\t" "vmovdqu -32(%[out]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 16(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, 
%4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 128(%[KEY]), %%xmm12\n\t" "vmovdqu -16(%[out]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps (%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 144(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovdqa %[MOD2_128], %%xmm0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" - "vaesenc %%xmm12, 
%2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpshufd $78, %%xmm2, %%xmm13\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpshufd $78, %%xmm13, %%xmm13\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vmovdqa %%xmm13, %%xmm2\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "cmpl $11, %[nr]\n\t" "vmovaps 160(%[KEY]), %%xmm12\n\t" "jl %=f\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 176(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "cmpl $13, %[nr]\n\t" "vmovaps 192(%[KEY]), %%xmm12\n\t" 
"jl %=f\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 208(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 224(%[KEY]), %%xmm12\n\t" "%=:\n\t" - "vaesenclast %%xmm12, %0, %0\n\t" - "vaesenclast %%xmm12, %1, %1\n\t" - "vpxor (%[in]), %0, %0\n\t" - "vpxor 16(%[in]), %1, %1\n\t" - "vmovdqu %0, (%[out])\n\t" - "vmovdqu %1, 16(%[out])\n\t" - "vaesenclast %%xmm12, %2, %2\n\t" - "vaesenclast %%xmm12, %3, %3\n\t" - "vpxor 32(%[in]), %2, %2\n\t" - "vpxor 48(%[in]), %3, %3\n\t" - "vmovdqu %2, 32(%[out])\n\t" - "vmovdqu %3, 48(%[out])\n\t" - "vaesenclast %%xmm12, %4, %4\n\t" - "vaesenclast %%xmm12, %5, %5\n\t" - "vpxor 64(%[in]), %4, %4\n\t" - "vpxor 80(%[in]), %5, %5\n\t" - "vmovdqu %4, 64(%[out])\n\t" - "vmovdqu %5, 80(%[out])\n\t" - "vaesenclast %%xmm12, %6, %6\n\t" - "vaesenclast %%xmm12, %7, %7\n\t" - "vpxor 96(%[in]), %6, %6\n\t" - "vpxor 112(%[in]), %7, %7\n\t" - "vmovdqu %6, 
96(%[out])\n\t" - "vmovdqu %7, 112(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenclast %%xmm12, %%xmm5, %%xmm5\n\t" + "vpxor (%[in]), %%xmm4, %%xmm4\n\t" + "vpxor 16(%[in]), %%xmm5, %%xmm5\n\t" + "vmovdqu %%xmm4, (%[out])\n\t" + "vmovdqu %%xmm5, 16(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenclast %%xmm12, %%xmm7, %%xmm7\n\t" + "vpxor 32(%[in]), %%xmm6, %%xmm6\n\t" + "vpxor 48(%[in]), %%xmm7, %%xmm7\n\t" + "vmovdqu %%xmm6, 32(%[out])\n\t" + "vmovdqu %%xmm7, 48(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenclast %%xmm12, %%xmm9, %%xmm9\n\t" + "vpxor 64(%[in]), %%xmm8, %%xmm8\n\t" + "vpxor 80(%[in]), %%xmm9, %%xmm9\n\t" + "vmovdqu %%xmm8, 64(%[out])\n\t" + "vmovdqu %%xmm9, 80(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenclast %%xmm12, %%xmm11, %%xmm11\n\t" + "vpxor 96(%[in]), %%xmm10, %%xmm10\n\t" + "vpxor 112(%[in]), %%xmm11, %%xmm11\n\t" + "vmovdqu %%xmm10, 96(%[out])\n\t" + "vmovdqu %%xmm11, 112(%[out])\n\t" - : "=xr" (tmp1), "=xr" (tmp2), "=xr" (tmp3), "=xr" (tmp4), - "=xr" (tmp5), "=xr" (tmp6), "=xr" (tmp7), "=xr" (tmp8), - [XV] "+xr" (XV) + : [XV] "+xr" (XV) : [KEY] "r" (KEY), [HT] "r" (HT), [pctr1] "r" (pctr1), [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), [BSWAP_MASK] "xrm" (BSWAP_MASK), @@ -5075,6 +5074,8 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, [SEVEN] "xrm" (SEVEN), [EIGHT] "xrm" (EIGHT), [MOD2_128] "xrm" (MOD2_128) : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm11", "xmm10", "xmm9", "xmm8", + "xmm7", "xmm6", "xmm5", "xmm4", "xmm0", "xmm1", "xmm3", "memory" ); } @@ -5094,36 +5095,36 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, } for (k = i*8; k < (int)(nbytes/16); k++) { __asm__ __volatile__ ( - "vpshufb %[BSWAP_EPI64], %[ctr1], %0\n\t" + "vpshufb %[BSWAP_EPI64], %[ctr1], %%xmm4\n\t" "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" - "vpxor (%[KEY]), %0, %0\n\t" - "vaesenc 16(%[KEY]), %0, 
%0\n\t" - "vaesenc 32(%[KEY]), %0, %0\n\t" - "vaesenc 48(%[KEY]), %0, %0\n\t" - "vaesenc 64(%[KEY]), %0, %0\n\t" - "vaesenc 80(%[KEY]), %0, %0\n\t" - "vaesenc 96(%[KEY]), %0, %0\n\t" - "vaesenc 112(%[KEY]), %0, %0\n\t" - "vaesenc 128(%[KEY]), %0, %0\n\t" - "vaesenc 144(%[KEY]), %0, %0\n\t" + "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" "cmpl $11, %[nr]\n\t" - "vmovaps 160(%[KEY]), %1\n\t" + "vmovaps 160(%[KEY]), %%xmm5\n\t" "jl %=f\n\t" - "vaesenc %1, %0, %0\n\t" - "vaesenc 176(%[KEY]), %0, %0\n\t" + "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" + "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" "cmpl $13, %[nr]\n\t" - "vmovaps 192(%[KEY]), %1\n\t" + "vmovaps 192(%[KEY]), %%xmm5\n\t" "jl %=f\n\t" - "vaesenc %1, %0, %0\n\t" - "vaesenc 208(%[KEY]), %0, %0\n\t" - "vmovaps 224(%[KEY]), %1\n\t" + "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" + "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" + "vmovaps 224(%[KEY]), %%xmm5\n\t" "%=:\n\t" - "vaesenclast %1, %0, %0\n\t" - "vpxor (%[in]), %0, %0\n\t" - "vmovdqu %0, (%[out])\n\t" - "vpshufb %[BSWAP_MASK], %0, %0\n\t" + "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" + "vpxor (%[in]), %%xmm4, %%xmm4\n\t" + "vmovdqu %%xmm4, (%[out])\n\t" + "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" - "vpxor %0, %[X], %[X]\n\t" + "vpxor %%xmm4, %[X], %[X]\n\t" "# Carryless Multiply X by H (128 x 128)\n\t" "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" @@ -5146,7 +5147,7 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, "vmovdqa %%xmm13, %[X]\n\t" "# End Reduce\n\t" - : "+xr" (tmp1), "=xr" (tmp2), [H] "+xr" (H), [X] "+xr" (X), + : [H] "+xr" 
(H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) : [KEY] "r" (KEY), [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), @@ -5155,43 +5156,44 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, [ONE] "xrm" (ONE), [MOD2_128] "xrm" (MOD2_128) : "xmm15", "xmm14", "xmm13", + "xmm5", "xmm4", "xmm0", "xmm1", "xmm2", "xmm3", "memory" ); } #else for (k = 0; k < (int)(nbytes/16) && k < 1; k++) { __asm__ __volatile__ ( - "vpshufb %[BSWAP_EPI64], %[ctr1], %0\n\t" + "vpshufb %[BSWAP_EPI64], %[ctr1], %%xmm4\n\t" "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" - "vpxor (%[KEY]), %0, %0\n\t" - "vaesenc 16(%[KEY]), %0, %0\n\t" - "vaesenc 32(%[KEY]), %0, %0\n\t" - "vaesenc 48(%[KEY]), %0, %0\n\t" - "vaesenc 64(%[KEY]), %0, %0\n\t" - "vaesenc 80(%[KEY]), %0, %0\n\t" - "vaesenc 96(%[KEY]), %0, %0\n\t" - "vaesenc 112(%[KEY]), %0, %0\n\t" - "vaesenc 128(%[KEY]), %0, %0\n\t" - "vaesenc 144(%[KEY]), %0, %0\n\t" + "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" "cmpl $11, %[nr]\n\t" - "vmovaps 160(%[KEY]), %1\n\t" + "vmovaps 160(%[KEY]), %%xmm5\n\t" "jl %=f\n\t" - "vaesenc %1, %0, %0\n\t" - "vaesenc 176(%[KEY]), %0, %0\n\t" + "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" + "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" "cmpl $13, %[nr]\n\t" - "vmovaps 192(%[KEY]), %1\n\t" + "vmovaps 192(%[KEY]), %%xmm5\n\t" "jl %=f\n\t" - "vaesenc %1, %0, %0\n\t" - "vaesenc 208(%[KEY]), %0, %0\n\t" - "vmovaps 224(%[KEY]), %1\n\t" + "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" + "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" + "vmovaps 224(%[KEY]), %%xmm5\n\t" "%=:\n\t" - "vaesenclast %1, %0, %0\n\t" - "vpxor (%[in]), %0, %0\n\t" - "vmovdqu 
%0, (%[out])\n\t" - "vpshufb %[BSWAP_MASK], %0, %0\n\t" - "vpxor %0, %[X], %[X]\n\t" + "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" + "vpxor (%[in]), %%xmm4, %%xmm4\n\t" + "vmovdqu %%xmm4, (%[out])\n\t" + "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" + "vpxor %%xmm4, %[X], %[X]\n\t" - : "+xr" (tmp1), "=xr" (tmp2), [H] "+xr" (H), [X] "+xr" (X), + : [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) : [KEY] "r" (KEY), [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), @@ -5199,60 +5201,60 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, [BSWAP_EPI64] "xrm" (BSWAP_EPI64), [ONE] "xrm" (ONE), [MOD2_128] "xrm" (MOD2_128) - : "memory" + : "xmm4", "xmm5", "memory" ); } for (; k < (int)(nbytes/16); k++) { __asm__ __volatile__ ( - "vpshufb %[BSWAP_EPI64], %[ctr1], %0\n\t" + "vpshufb %[BSWAP_EPI64], %[ctr1], %%xmm4\n\t" "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" - "vpxor (%[KEY]), %0, %0\n\t" - "vaesenc 16(%[KEY]), %0, %0\n\t" + "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" "vpclmulqdq $16, %[H], %[X], %%xmm13\n\t" - "vaesenc 32(%[KEY]), %0, %0\n\t" + "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" "vpclmulqdq $1, %[H], %[X], %%xmm14\n\t" - "vaesenc 48(%[KEY]), %0, %0\n\t" + "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" "vpclmulqdq $0, %[H], %[X], %%xmm15\n\t" - "vaesenc 64(%[KEY]), %0, %0\n\t" + "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" "vpclmulqdq $17, %[H], %[X], %%xmm1\n\t" - "vaesenc 80(%[KEY]), %0, %0\n\t" + "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm2\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc 96(%[KEY]), %0, %0\n\t" + "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" "vmovdqa %[MOD2_128], %%xmm0\n\t" "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" - "vaesenc 112(%[KEY]), %0, %0\n\t" + "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" "vpshufd $78, %%xmm2, %%xmm13\n\t" "vpxor %%xmm14, %%xmm13,
%%xmm13\n\t" "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" - "vaesenc 128(%[KEY]), %0, %0\n\t" + "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" "vpshufd $78, %%xmm13, %%xmm13\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" - "vaesenc 144(%[KEY]), %0, %0\n\t" + "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" "vmovdqa %%xmm13, %[X]\n\t" "cmpl $11, %[nr]\n\t" - "vmovaps 160(%[KEY]), %1\n\t" + "vmovaps 160(%[KEY]), %%xmm5\n\t" "jl %=f\n\t" - "vaesenc %1, %0, %0\n\t" - "vaesenc 176(%[KEY]), %0, %0\n\t" + "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" + "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" "cmpl $13, %[nr]\n\t" - "vmovaps 192(%[KEY]), %1\n\t" + "vmovaps 192(%[KEY]), %%xmm5\n\t" "jl %=f\n\t" - "vaesenc %1, %0, %0\n\t" - "vaesenc 208(%[KEY]), %0, %0\n\t" - "vmovaps 224(%[KEY]), %1\n\t" + "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" + "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" + "vmovaps 224(%[KEY]), %%xmm5\n\t" "%=:\n\t" - "vaesenclast %1, %0, %0\n\t" - "vpxor (%[in]), %0, %0\n\t" - "vmovdqu %0, (%[out])\n\t" - "vpshufb %[BSWAP_MASK], %0, %0\n\t" - "vpxor %0, %[X], %[X]\n\t" + "vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" + "vpxor (%[in]), %%xmm4, %%xmm4\n\t" + "vmovdqu %%xmm4, (%[out])\n\t" + "vpshufb %[BSWAP_MASK], %%xmm4, %%xmm4\n\t" + "vpxor %%xmm4, %[X], %[X]\n\t" - : "+xr" (tmp1), "=xr" (tmp2), [H] "+xr" (H), [X] "+xr" (X), + : [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) : [KEY] "r" (KEY), [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), @@ -5260,7 +5262,7 @@ static void AES_GCM_encrypt_avx2(const unsigned char *in, unsigned char *out, [BSWAP_EPI64] "xrm" (BSWAP_EPI64), [ONE] "xrm" (ONE), [MOD2_128] "xrm" (MOD2_128) - : "xmm15", "xmm14", "xmm13", + : "xmm15", "xmm14", "xmm13", "xmm4", "xmm5", "xmm0", "xmm1", "xmm2", "xmm3", "memory" ); } @@ -5790,20 +5792,13 @@ static int AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, __m128i ctr1; __m128i last_block = _mm_setzero_si128(); __m128i X = _mm_setzero_si128(); + __m128i tmp1, tmp2; 
#ifndef AES_GCM_AESNI_NO_UNROLL __m128i HT[8]; - register __m128i tmp1 asm("xmm4"); - register __m128i tmp2 asm("xmm5"); - register __m128i tmp3 asm("xmm6"); - register __m128i tmp4 asm("xmm7"); - register __m128i tmp5 asm("xmm8"); - register __m128i tmp6 asm("xmm9"); - register __m128i tmp7 asm("xmm10"); - register __m128i tmp8 asm("xmm11"); __m128i pctr1[1]; register __m128i XV asm("xmm2"); #else - __m128i tmp1, tmp2, XV; + __m128i XV; #endif if (ibytes == 12) { @@ -5962,316 +5957,314 @@ static int AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, __asm__ __volatile__ ( "vmovaps (%[pctr1]), %%xmm0\n\t" "vmovaps %[BSWAP_EPI64], %%xmm1\n\t" - "vpshufb %%xmm1, %%xmm0, %0\n\t" - "vpaddd %[ONE], %%xmm0, %1\n\t" - "vpshufb %%xmm1, %1, %1\n\t" - "vpaddd %[TWO], %%xmm0, %2\n\t" - "vpshufb %%xmm1, %2, %2\n\t" - "vpaddd %[THREE], %%xmm0, %3\n\t" - "vpshufb %%xmm1, %3, %3\n\t" - "vpaddd %[FOUR], %%xmm0, %4\n\t" - "vpshufb %%xmm1, %4, %4\n\t" - "vpaddd %[FIVE], %%xmm0, %5\n\t" - "vpshufb %%xmm1, %5, %5\n\t" - "vpaddd %[SIX], %%xmm0, %6\n\t" - "vpshufb %%xmm1, %6, %6\n\t" - "vpaddd %[SEVEN], %%xmm0, %7\n\t" - "vpshufb %%xmm1, %7, %7\n\t" + "vpshufb %%xmm1, %%xmm0, %%xmm4\n\t" + "vpaddd %[ONE], %%xmm0, %%xmm5\n\t" + "vpshufb %%xmm1, %%xmm5, %%xmm5\n\t" + "vpaddd %[TWO], %%xmm0, %%xmm6\n\t" + "vpshufb %%xmm1, %%xmm6, %%xmm6\n\t" + "vpaddd %[THREE], %%xmm0, %%xmm7\n\t" + "vpshufb %%xmm1, %%xmm7, %%xmm7\n\t" + "vpaddd %[FOUR], %%xmm0, %%xmm8\n\t" + "vpshufb %%xmm1, %%xmm8, %%xmm8\n\t" + "vpaddd %[FIVE], %%xmm0, %%xmm9\n\t" + "vpshufb %%xmm1, %%xmm9, %%xmm9\n\t" + "vpaddd %[SIX], %%xmm0, %%xmm10\n\t" + "vpshufb %%xmm1, %%xmm10, %%xmm10\n\t" + "vpaddd %[SEVEN], %%xmm0, %%xmm11\n\t" + "vpshufb %%xmm1, %%xmm11, %%xmm11\n\t" "vpaddd %[EIGHT], %%xmm0, %%xmm0\n\t" "vmovaps (%[KEY]), %%xmm1\n\t" "vmovaps %%xmm0, (%[pctr1])\n\t" - "vpxor %%xmm1, %0, %0\n\t" - "vpxor %%xmm1, %1, %1\n\t" - "vpxor %%xmm1, %2, %2\n\t" - "vpxor %%xmm1, %3, %3\n\t" - "vpxor %%xmm1, %4, %4\n\t" - 
"vpxor %%xmm1, %5, %5\n\t" - "vpxor %%xmm1, %6, %6\n\t" - "vpxor %%xmm1, %7, %7\n\t" + "vpxor %%xmm1, %%xmm4, %%xmm4\n\t" + "vpxor %%xmm1, %%xmm5, %%xmm5\n\t" + "vpxor %%xmm1, %%xmm6, %%xmm6\n\t" + "vpxor %%xmm1, %%xmm7, %%xmm7\n\t" + "vpxor %%xmm1, %%xmm8, %%xmm8\n\t" + "vpxor %%xmm1, %%xmm9, %%xmm9\n\t" + "vpxor %%xmm1, %%xmm10, %%xmm10\n\t" + "vpxor %%xmm1, %%xmm11, %%xmm11\n\t" "vmovaps 16(%[KEY]), %%xmm12\n\t" "vmovdqu (%[in]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 112(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" "vpxor %[XV], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm2\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 32(%[KEY]), %%xmm12\n\t" "vmovdqu 16(%[in]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 96(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc 
%%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 48(%[KEY]), %%xmm12\n\t" "vmovdqu 32(%[in]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 80(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 64(%[KEY]), %%xmm12\n\t" "vmovdqu 48(%[in]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 64(%[HT]), %%xmm0\n\t" "vpshufb 
%[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 80(%[KEY]), %%xmm12\n\t" "vmovdqu 64(%[in]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 48(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, 
%%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 96(%[KEY]), %%xmm12\n\t" "vmovdqu 80(%[in]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 32(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 112(%[KEY]), %%xmm12\n\t" "vmovdqu 96(%[in]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps 16(%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" 
+ "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 128(%[KEY]), %%xmm12\n\t" "vmovdqu 112(%[in]), %%xmm1\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovaps (%[HT]), %%xmm0\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm14\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %6, %6\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm1, %%xmm3, %%xmm3\n\t" "vpxor %%xmm14, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm3, %%xmm3\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 144(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" "vmovdqa %[MOD2_128], %%xmm0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" - "vaesenc %%xmm12, %2, %2\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" "vpshufd $78, %%xmm2, %%xmm13\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" - 
"vaesenc %%xmm12, %3, %3\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" "vpclmulqdq $16, %%xmm0, %%xmm13, %%xmm14\n\t" - "vaesenc %%xmm12, %4, %4\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" "vpshufd $78, %%xmm13, %%xmm13\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpxor %%xmm3, %%xmm13, %%xmm13\n\t" - "vaesenc %%xmm12, %5, %5\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" "vmovdqa %%xmm13, %%xmm2\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "cmpl $11, %[nr]\n\t" "vmovaps 160(%[KEY]), %%xmm12\n\t" "jl %=f\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 176(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "cmpl $13, %[nr]\n\t" "vmovaps 192(%[KEY]), %%xmm12\n\t" "jl %=f\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc 
%%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 208(%[KEY]), %%xmm12\n\t" - "vaesenc %%xmm12, %0, %0\n\t" - "vaesenc %%xmm12, %1, %1\n\t" - "vaesenc %%xmm12, %2, %2\n\t" - "vaesenc %%xmm12, %3, %3\n\t" - "vaesenc %%xmm12, %4, %4\n\t" - "vaesenc %%xmm12, %5, %5\n\t" - "vaesenc %%xmm12, %6, %6\n\t" - "vaesenc %%xmm12, %7, %7\n\t" + "vaesenc %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenc %%xmm12, %%xmm5, %%xmm5\n\t" + "vaesenc %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenc %%xmm12, %%xmm7, %%xmm7\n\t" + "vaesenc %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenc %%xmm12, %%xmm9, %%xmm9\n\t" + "vaesenc %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenc %%xmm12, %%xmm11, %%xmm11\n\t" "vmovaps 224(%[KEY]), %%xmm12\n\t" "%=:\n\t" - "vaesenclast %%xmm12, %0, %0\n\t" - "vaesenclast %%xmm12, %1, %1\n\t" - "vpxor (%[in]), %0, %0\n\t" - "vpxor 16(%[in]), %1, %1\n\t" - "vmovdqu %0, (%[out])\n\t" - "vmovdqu %1, 16(%[out])\n\t" - "vaesenclast %%xmm12, %2, %2\n\t" - "vaesenclast %%xmm12, %3, %3\n\t" - "vpxor 32(%[in]), %2, %2\n\t" - "vpxor 48(%[in]), %3, %3\n\t" - "vmovdqu %2, 32(%[out])\n\t" - "vmovdqu %3, 48(%[out])\n\t" - "vaesenclast %%xmm12, %4, %4\n\t" - "vaesenclast %%xmm12, %5, %5\n\t" - "vpxor 64(%[in]), %4, %4\n\t" - "vpxor 80(%[in]), %5, %5\n\t" - "vmovdqu %4, 64(%[out])\n\t" - "vmovdqu %5, 80(%[out])\n\t" - "vaesenclast %%xmm12, %6, %6\n\t" - "vaesenclast %%xmm12, %7, %7\n\t" - "vpxor 96(%[in]), %6, %6\n\t" - "vpxor 112(%[in]), %7, %7\n\t" - "vmovdqu %6, 96(%[out])\n\t" - "vmovdqu %7, 112(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm4, %%xmm4\n\t" + "vaesenclast %%xmm12, %%xmm5, 
%%xmm5\n\t" + "vpxor (%[in]), %%xmm4, %%xmm4\n\t" + "vpxor 16(%[in]), %%xmm5, %%xmm5\n\t" + "vmovdqu %%xmm4, (%[out])\n\t" + "vmovdqu %%xmm5, 16(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm6, %%xmm6\n\t" + "vaesenclast %%xmm12, %%xmm7, %%xmm7\n\t" + "vpxor 32(%[in]), %%xmm6, %%xmm6\n\t" + "vpxor 48(%[in]), %%xmm7, %%xmm7\n\t" + "vmovdqu %%xmm6, 32(%[out])\n\t" + "vmovdqu %%xmm7, 48(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm8, %%xmm8\n\t" + "vaesenclast %%xmm12, %%xmm9, %%xmm9\n\t" + "vpxor 64(%[in]), %%xmm8, %%xmm8\n\t" + "vpxor 80(%[in]), %%xmm9, %%xmm9\n\t" + "vmovdqu %%xmm8, 64(%[out])\n\t" + "vmovdqu %%xmm9, 80(%[out])\n\t" + "vaesenclast %%xmm12, %%xmm10, %%xmm10\n\t" + "vaesenclast %%xmm12, %%xmm11, %%xmm11\n\t" + "vpxor 96(%[in]), %%xmm10, %%xmm10\n\t" + "vpxor 112(%[in]), %%xmm11, %%xmm11\n\t" + "vmovdqu %%xmm10, 96(%[out])\n\t" + "vmovdqu %%xmm11, 112(%[out])\n\t" - : "=xr" (tmp1), "=xr" (tmp2), "=xr" (tmp3), "=xr" (tmp4), - "=xr" (tmp5), "=xr" (tmp6), "=xr" (tmp7), "=xr" (tmp8), - [XV] "+xr" (XV) + : [XV] "+xr" (XV) : [KEY] "r" (KEY), [HT] "r" (HT), [pctr1] "r" (pctr1), [in] "r" (&in[i*16*8]), [out] "r" (&out[i*16*8]), [nr] "r" (nr), [BSWAP_MASK] "xrm" (BSWAP_MASK), @@ -6282,6 +6275,8 @@ static int AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, [SEVEN] "xrm" (SEVEN), [EIGHT] "xrm" (EIGHT), [MOD2_128] "xrm" (MOD2_128) : "xmm15", "xmm14", "xmm13", "xmm12", + "xmm11", "xmm10", "xmm9", "xmm8", + "xmm7", "xmm6", "xmm5", "xmm4", "xmm0", "xmm1", "xmm3", "memory" ); } @@ -6291,31 +6286,31 @@ static int AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, #endif for (k = i*8; k < nbytes/16; k++) { __asm__ __volatile__ ( - "vpshufb %[BSWAP_EPI64], %[ctr1], %0\n\t" + "vpshufb %[BSWAP_EPI64], %[ctr1], %%xmm4\n\t" "vpaddd %[ONE], %[ctr1], %[ctr1]\n\t" - "vpxor (%[KEY]), %0, %0\n\t" - "vaesenc 16(%[KEY]), %0, %0\n\t" + "vpxor (%[KEY]), %%xmm4, %%xmm4\n\t" + "vaesenc 16(%[KEY]), %%xmm4, %%xmm4\n\t" "vmovaps %[H], %%xmm0\n\t" "vmovdqu 
(%[in]), %%xmm1\n\t" - "vaesenc 32(%[KEY]), %0, %0\n\t" + "vaesenc 32(%[KEY]), %%xmm4, %%xmm4\n\t" "vpshufb %[BSWAP_MASK], %%xmm1, %%xmm1\n\t" "vpxor %[X], %%xmm1, %%xmm1\n\t" - "vaesenc 48(%[KEY]), %0, %0\n\t" + "vaesenc 48(%[KEY]), %%xmm4, %%xmm4\n\t" "vpclmulqdq $16, %%xmm1, %%xmm0, %%xmm13\n\t" - "vaesenc 64(%[KEY]), %0, %0\n\t" + "vaesenc 64(%[KEY]), %%xmm4, %%xmm4\n\t" "vpclmulqdq $1, %%xmm1, %%xmm0, %%xmm14\n\t" - "vaesenc 80(%[KEY]), %0, %0\n\t" + "vaesenc 80(%[KEY]), %%xmm4, %%xmm4\n\t" "vpclmulqdq $0, %%xmm1, %%xmm0, %%xmm15\n\t" - "vaesenc 96(%[KEY]), %0, %0\n\t" + "vaesenc 96(%[KEY]), %%xmm4, %%xmm4\n\t" "vpclmulqdq $17, %%xmm1, %%xmm0, %%xmm1\n\t" - "vaesenc 112(%[KEY]), %0, %0\n\t" + "vaesenc 112(%[KEY]), %%xmm4, %%xmm4\n\t" "vpxor %%xmm14, %%xmm13, %%xmm13\n\t" "vpslldq $8, %%xmm13, %%xmm2\n\t" "vpsrldq $8, %%xmm13, %%xmm13\n\t" - "vaesenc 128(%[KEY]), %0, %0\n\t" + "vaesenc 128(%[KEY]), %%xmm4, %%xmm4\n\t" "vpxor %%xmm15, %%xmm2, %%xmm2\n\t" "vpxor %%xmm13, %%xmm1, %%xmm3\n\t" - "vaesenc 144(%[KEY]), %0, %0\n\t" + "vaesenc 144(%[KEY]), %%xmm4, %%xmm4\n\t" "# Reduce\n\t" "vmovdqa %[MOD2_128], %%xmm0\n\t" "vpclmulqdq $16, %%xmm0, %%xmm2, %%xmm14\n\t" @@ -6328,22 +6323,22 @@ static int AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, "vmovdqa %%xmm13, %[X]\n\t" "# End Reduce\n\t" "cmpl $11, %[nr]\n\t" - "vmovaps 160(%[KEY]), %1\n\t" + "vmovaps 160(%[KEY]), %%xmm5\n\t" "jl %=f\n\t" - "vaesenc %1, %0, %0\n\t" - "vaesenc 176(%[KEY]), %0, %0\n\t" + "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" + "vaesenc 176(%[KEY]), %%xmm4, %%xmm4\n\t" "cmpl $13, %[nr]\n\t" - "vmovaps 192(%[KEY]), %1\n\t" + "vmovaps 192(%[KEY]), %%xmm5\n\t" "jl %=f\n\t" - "vaesenc %1, %0, %0\n\t" - "vaesenc 208(%[KEY]), %0, %0\n\t" - "vmovaps 224(%[KEY]), %1\n\t" + "vaesenc %%xmm5, %%xmm4, %%xmm4\n\t" + "vaesenc 208(%[KEY]), %%xmm4, %%xmm4\n\t" + "vmovaps 224(%[KEY]), %%xmm5\n\t" "%=:\n\t" - "vaesenclast %1, %0, %0\n\t" - "vpxor (%[in]), %0, %0\n\t" - "vmovdqu %0, (%[out])\n\t" + 
"vaesenclast %%xmm5, %%xmm4, %%xmm4\n\t" + "vpxor (%[in]), %%xmm4, %%xmm4\n\t" + "vmovdqu %%xmm4, (%[out])\n\t" - : "+xr" (tmp1), "=xr" (tmp2), [H] "+xr" (H), [X] "+xr" (X), + : [H] "+xr" (H), [X] "+xr" (X), [ctr1] "+xr" (ctr1) : [KEY] "r" (KEY), [in] "r" (&in[k*16]), [out] "r" (&out[k*16]), [nr] "r" (nr), @@ -6351,7 +6346,7 @@ static int AES_GCM_decrypt_avx2(const unsigned char *in, unsigned char *out, [BSWAP_EPI64] "xrm" (BSWAP_EPI64), [ONE] "xrm" (ONE), [MOD2_128] "xrm" (MOD2_128) - : "xmm15", "xmm14", "xmm13", + : "xmm15", "xmm14", "xmm13", "xmm4", "xmm5", "xmm0", "xmm1", "xmm2", "xmm3", "memory" ); }