Fixes and improvements for X25519 x86_64 ASM code

Sean Parkinson 2019-05-24 09:43:08 +10:00
parent 4e2e207e67
commit 6564d03369


@@ -157,16 +157,15 @@ fe_frombytes:
_fe_frombytes:
#endif /* __APPLE__ */
movq $0x7fffffffffffffff, %r9
# Copy
movq (%rsi), %rdx
movq 8(%rsi), %rax
movq 16(%rsi), %rcx
movq 24(%rsi), %r8
andq %r9, %r8
movq %rdx, (%rdi)
movq %rax, 8(%rdi)
movq %rcx, 16(%rdi)
movq %r8, 24(%rdi)
andq %r9, 24(%rdi)
repz retq
#ifndef __APPLE__
.size fe_frombytes,.-fe_frombytes
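For reference, the operation this hunk streamlines is just a little-endian load of four 64-bit words with bit 255 cleared; the change masks the top word in a register (andq %r9, %r8) before the store instead of a read-modify-write on memory after it. A minimal C sketch of the same load (the function name is illustrative, not wolfSSL's):

    #include <stdint.h>
    #include <string.h>

    /* Load a 32-byte little-endian value into four 64-bit limbs and
     * clear bit 255 so the result fits the 255-bit field representation.
     * Assumes a little-endian host, as the asm above does. */
    static void fe_frombytes_sketch(uint64_t r[4], const uint8_t b[32])
    {
        memcpy(r, b, 32);
        r[3] &= 0x7fffffffffffffffULL; /* the andq %r9, %r8 above */
    }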
@@ -1264,7 +1263,7 @@ _fe_mul_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -1415,7 +1414,7 @@ _fe_sq_x64:
movq $19, %rax
adcq %rdx, %r13
mulq %r14
# Add remaining produce results in
# Add remaining product results in
addq %r11, %r8
adcq %r12, %r9
adcq %r13, %r10
@@ -1629,7 +1628,7 @@ _fe_sq2_x64:
mulq %r14
# Add remaining produce results in
addq %r15, %rcx
addq %r11, %r8
adcq %r11, %r8
adcq %r12, %r9
adcq %r13, %r10
adcq %rax, %r10
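The substantive fix in this hunk is addq becoming adcq: the accumulation into %r8 must consume the carry flag set by the addq %r15, %rcx on the line before, otherwise a carry out of the low limb is silently dropped. In C terms, multi-limb accumulation has to thread one carry through every limb; a minimal sketch (names are illustrative):

    #include <stdint.h>

    /* Add b into the 4-limb accumulator a, propagating the carry
     * through every limb -- the job adcq does above. Returns the
     * carry out of the top limb. */
    static uint64_t add4_sketch(uint64_t a[4], const uint64_t b[4])
    {
        unsigned __int128 c = 0;
        int i;
        for (i = 0; i < 4; i++) {
            c += (unsigned __int128)a[i] + b[i];
            a[i] = (uint64_t)c;
            c >>= 64; /* dropping this is exactly the addq-for-adcq bug */
        }
        return (uint64_t)c;
    }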
@@ -2045,68 +2044,22 @@ L_curve25519_x64_bits:
xorq %r10, 48(%rsp)
xorq %r11, 56(%rsp)
movq %rbp, %rbx
# Sub
# Add
movq 64(%rsp), %rcx
movq 72(%rsp), %r9
movq 80(%rsp), %r10
movq 88(%rsp), %r11
subq 32(%rsp), %rcx
movq $0x00, %rbp
sbbq 40(%rsp), %r9
movq 88(%rsp), %rbp
movq %rcx, %r12
addq 32(%rsp), %rcx
movq %r9, %r13
adcq 40(%rsp), %r9
movq %r10, %r14
adcq 48(%rsp), %r10
movq %rbp, %r15
adcq 56(%rsp), %rbp
movq $-19, %rax
sbbq 48(%rsp), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 56(%rsp), %r11
sbbq $0x00, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Add modulus (if underflow)
addq %rax, %rcx
adcq %rbp, %r9
adcq %rbp, %r10
adcq %rdx, %r11
movq %rcx, 96(%rsp)
movq %r9, 104(%rsp)
movq %r10, 112(%rsp)
movq %r11, 120(%rsp)
# Sub
movq (%rdi), %rcx
movq 8(%rdi), %r9
movq 16(%rdi), %r10
movq 24(%rdi), %r11
subq (%rsp), %rcx
movq $0x00, %rbp
sbbq 8(%rsp), %r9
movq $-19, %rax
sbbq 16(%rsp), %r10
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rsp), %r11
sbbq $0x00, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Add modulus (if underflow)
addq %rax, %rcx
adcq %rbp, %r9
adcq %rbp, %r10
adcq %rdx, %r11
movq %rcx, 128(%rsp)
movq %r9, 136(%rsp)
movq %r10, 144(%rsp)
movq %r11, 152(%rsp)
# Add
movq (%rdi), %rcx
movq 8(%rdi), %r9
addq (%rsp), %rcx
movq 16(%rdi), %r10
adcq 8(%rsp), %r9
movq 24(%rdi), %rbp
adcq 16(%rsp), %r10
movq $-19, %rax
adcq 24(%rsp), %rbp
movq $0x7fffffffffffffff, %rdx
movq %rbp, %r11
movq $0x7fffffffffffffff, %rdx
sarq $63, %rbp
# Mask the modulus
andq %rbp, %rax
@@ -2116,22 +2069,47 @@ L_curve25519_x64_bits:
sbbq %rbp, %r9
sbbq %rbp, %r10
sbbq %rdx, %r11
# Sub
subq 32(%rsp), %r12
movq $0x00, %rbp
sbbq 40(%rsp), %r13
movq $-19, %rax
sbbq 48(%rsp), %r14
movq $0x7fffffffffffffff, %rdx
sbbq 56(%rsp), %r15
sbbq $0x00, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Add modulus (if underflow)
addq %rax, %r12
adcq %rbp, %r13
adcq %rbp, %r14
adcq %rdx, %r15
movq %rcx, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, 96(%rsp)
movq %r13, 104(%rsp)
movq %r14, 112(%rsp)
movq %r15, 120(%rsp)
# Add
movq 64(%rsp), %rcx
movq 72(%rsp), %r9
addq 32(%rsp), %rcx
movq 80(%rsp), %r10
adcq 40(%rsp), %r9
movq 88(%rsp), %rbp
adcq 48(%rsp), %r10
movq (%rdi), %rcx
movq 8(%rdi), %r9
movq 16(%rdi), %r10
movq 24(%rdi), %rbp
movq %rcx, %r12
addq (%rsp), %rcx
movq %r9, %r13
adcq 8(%rsp), %r9
movq %r10, %r14
adcq 16(%rsp), %r10
movq %rbp, %r15
adcq 24(%rsp), %rbp
movq $-19, %rax
adcq 56(%rsp), %rbp
movq $0x7fffffffffffffff, %rdx
movq %rbp, %r11
movq $0x7fffffffffffffff, %rdx
sarq $63, %rbp
# Mask the modulus
andq %rbp, %rax
@@ -2141,10 +2119,31 @@ L_curve25519_x64_bits:
sbbq %rbp, %r9
sbbq %rbp, %r10
sbbq %rdx, %r11
# Sub
subq (%rsp), %r12
movq $0x00, %rbp
sbbq 8(%rsp), %r13
movq $-19, %rax
sbbq 16(%rsp), %r14
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rsp), %r15
sbbq $0x00, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Add modulus (if underflow)
addq %rax, %r12
adcq %rbp, %r13
adcq %rbp, %r14
adcq %rdx, %r15
movq %rcx, (%rsp)
movq %r9, 8(%rsp)
movq %r10, 16(%rsp)
movq %r11, 24(%rsp)
movq %r12, 128(%rsp)
movq %r13, 136(%rsp)
movq %r14, 144(%rsp)
movq %r15, 152(%rsp)
# Multiply
# A[0] * B[0]
movq (%rdi), %rax
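The rewritten block above interleaves each Add with its matching Sub: the operands are loaded once, copied into %r12-%r15, and both results are computed before anything is stored back, removing a round trip through the stack. Both operations end with the same constant-time correction, visible in the "Mask the modulus" comments: after an add, a result with bit 255 set has the modulus subtracted; after a sub, a borrow is repaired by adding the modulus, selected by an all-ones/all-zero mask rather than a branch. A hedged C sketch of the two corrections (assuming operands below 2^255; p = 2^255 - 19):

    #include <stdint.h>

    static const uint64_t P[4] = {
        0xffffffffffffffedULL, 0xffffffffffffffffULL,
        0xffffffffffffffffULL, 0x7fffffffffffffffULL
    };

    /* r = a + b, then subtract p if bit 255 is set (the sarq $63
     * mask above), without branching. */
    static void fe_add_sketch(uint64_t r[4], const uint64_t a[4],
                              const uint64_t b[4])
    {
        unsigned __int128 c = 0;
        uint64_t mask, borrow = 0;
        int i;
        for (i = 0; i < 4; i++) {
            c += (unsigned __int128)a[i] + b[i];
            r[i] = (uint64_t)c;
            c >>= 64;
        }
        mask = (uint64_t)((int64_t)r[3] >> 63); /* broadcast bit 255 */
        for (i = 0; i < 4; i++) {
            unsigned __int128 t = (unsigned __int128)r[i]
                                - (P[i] & mask) - borrow;
            r[i] = (uint64_t)t;
            borrow = (uint64_t)(t >> 64) & 1;
        }
    }

    /* r = a - b, then add p back if the subtraction borrowed
     * ("Add modulus (if underflow)" above). */
    static void fe_sub_sketch(uint64_t r[4], const uint64_t a[4],
                              const uint64_t b[4])
    {
        unsigned __int128 c = 0;
        uint64_t mask, borrow = 0;
        int i;
        for (i = 0; i < 4; i++) {
            unsigned __int128 t = (unsigned __int128)a[i] - b[i] - borrow;
            r[i] = (uint64_t)t;
            borrow = (uint64_t)(t >> 64) & 1;
        }
        mask = (uint64_t)0 - borrow; /* all-ones on underflow */
        for (i = 0; i < 4; i++) {
            c += (unsigned __int128)r[i] + (P[i] & mask);
            r[i] = (uint64_t)c;
            c >>= 64;
        }
    }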
@@ -2270,7 +2269,7 @@ L_curve25519_x64_bits:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -2423,7 +2422,7 @@ L_curve25519_x64_bits:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -2549,7 +2548,7 @@ L_curve25519_x64_bits:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -2675,7 +2674,7 @@ L_curve25519_x64_bits:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -2706,15 +2705,19 @@ L_curve25519_x64_bits:
# Add
movq 32(%rsp), %rcx
movq 40(%rsp), %r9
addq (%rsp), %rcx
movq 48(%rsp), %r10
adcq 8(%rsp), %r9
movq 56(%rsp), %rbp
movq %rcx, %r12
addq (%rsp), %rcx
movq %r9, %r13
adcq 8(%rsp), %r9
movq %r10, %r14
adcq 16(%rsp), %r10
movq $-19, %rax
movq %rbp, %r15
adcq 24(%rsp), %rbp
movq $0x7fffffffffffffff, %rdx
movq $-19, %rax
movq %rbp, %r11
movq $0x7fffffffffffffff, %rdx
sarq $63, %rbp
# Mask the modulus
andq %rbp, %rax
@@ -2724,35 +2727,31 @@ L_curve25519_x64_bits:
sbbq %rbp, %r9
sbbq %rbp, %r10
sbbq %rdx, %r11
movq %rcx, 64(%rsp)
movq %r9, 72(%rsp)
movq %r10, 80(%rsp)
movq %r11, 88(%rsp)
# Sub
movq 32(%rsp), %rcx
movq 40(%rsp), %r9
movq 48(%rsp), %r10
movq 56(%rsp), %r11
subq (%rsp), %rcx
subq (%rsp), %r12
movq $0x00, %rbp
sbbq 8(%rsp), %r9
sbbq 8(%rsp), %r13
movq $-19, %rax
sbbq 16(%rsp), %r10
sbbq 16(%rsp), %r14
movq $0x7fffffffffffffff, %rdx
sbbq 24(%rsp), %r11
sbbq 24(%rsp), %r15
sbbq $0x00, %rbp
# Mask the modulus
andq %rbp, %rax
andq %rbp, %rdx
# Add modulus (if underflow)
addq %rax, %rcx
adcq %rbp, %r9
adcq %rbp, %r10
adcq %rdx, %r11
movq %rcx, (%rsp)
movq %r9, 8(%rsp)
movq %r10, 16(%rsp)
movq %r11, 24(%rsp)
addq %rax, %r12
adcq %rbp, %r13
adcq %rbp, %r14
adcq %rdx, %r15
movq %rcx, 64(%rsp)
movq %r9, 72(%rsp)
movq %r10, 80(%rsp)
movq %r11, 88(%rsp)
movq %r12, (%rsp)
movq %r13, 8(%rsp)
movq %r14, 16(%rsp)
movq %r15, 24(%rsp)
# Multiply
# A[0] * B[0]
movq 96(%rsp), %rax
@@ -2878,7 +2877,7 @@ L_curve25519_x64_bits:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -3029,7 +3028,7 @@ L_curve25519_x64_bits:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -3188,7 +3187,7 @@ L_curve25519_x64_bits:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -3366,7 +3365,7 @@ L_curve25519_x64_bits:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -3519,7 +3518,7 @@ L_curve25519_x64_bits:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -3939,7 +3938,7 @@ L_curve25519_x64_inv_8:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -4405,7 +4404,7 @@ _fe_ge_to_p2_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -4561,7 +4560,7 @@ _fe_ge_to_p2_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -4717,7 +4716,7 @@ _fe_ge_to_p2_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -4905,7 +4904,7 @@ _fe_ge_to_p3_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -5061,7 +5060,7 @@ _fe_ge_to_p3_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -5217,7 +5216,7 @@ _fe_ge_to_p3_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -5373,7 +5372,7 @@ _fe_ge_to_p3_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -5535,7 +5534,7 @@ _fe_ge_dbl_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -5663,7 +5662,7 @@ _fe_ge_dbl_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -5803,7 +5802,7 @@ _fe_ge_dbl_x64:
mulq %r15
# Add remaining produce results in
addq %rcx, %r8
addq %r12, %r9
adcq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
adcq %rax, %r11
@@ -5958,7 +5957,7 @@ _fe_ge_dbl_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -6316,7 +6315,7 @@ _fe_ge_madd_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -6472,7 +6471,7 @@ _fe_ge_madd_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -6628,7 +6627,7 @@ _fe_ge_madd_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -7014,7 +7013,7 @@ _fe_ge_msub_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -7170,7 +7169,7 @@ _fe_ge_msub_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -7326,7 +7325,7 @@ _fe_ge_msub_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -7712,7 +7711,7 @@ _fe_ge_add_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -7868,7 +7867,7 @@ _fe_ge_add_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -8024,7 +8023,7 @@ _fe_ge_add_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -8180,7 +8179,7 @@ _fe_ge_add_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -8566,7 +8565,7 @@ _fe_ge_sub_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -8722,7 +8721,7 @@ _fe_ge_sub_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -8878,7 +8877,7 @@ _fe_ge_sub_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -9034,7 +9033,7 @@ _fe_ge_sub_x64:
movq $19, %rax
adcq %rdx, %r14
mulq %r15
# Add remaining produce results in
# Add remaining product results in
addq %r12, %r9
adcq %r13, %r10
adcq %r14, %r11
@@ -10052,68 +10051,22 @@ L_curve25519_avx2_bits:
xorq %r11, 48(%rsp)
xorq %r12, 56(%rsp)
movq %rax, 184(%rsp)
# Sub
movq 64(%rsp), %r9
movq 72(%rsp), %r10
movq 80(%rsp), %r11
movq 88(%rsp), %r12
subq 32(%rsp), %r9
movq $0x00, %rax
sbbq 40(%rsp), %r10
movq $-19, %rcx
sbbq 48(%rsp), %r11
movq $0x7fffffffffffffff, %rbx
sbbq 56(%rsp), %r12
sbbq $0x00, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Add modulus (if underflow)
addq %rcx, %r9
adcq %rax, %r10
adcq %rax, %r11
adcq %rbx, %r12
movq %r9, 96(%rsp)
movq %r10, 104(%rsp)
movq %r11, 112(%rsp)
movq %r12, 120(%rsp)
# Sub
movq (%rdi), %r9
movq 8(%rdi), %r10
movq 16(%rdi), %r11
movq 24(%rdi), %r12
subq (%rsp), %r9
movq $0x00, %rax
sbbq 8(%rsp), %r10
movq $-19, %rcx
sbbq 16(%rsp), %r11
movq $0x7fffffffffffffff, %rbx
sbbq 24(%rsp), %r12
sbbq $0x00, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Add modulus (if underflow)
addq %rcx, %r9
adcq %rax, %r10
adcq %rax, %r11
adcq %rbx, %r12
movq %r9, 128(%rsp)
movq %r10, 136(%rsp)
movq %r11, 144(%rsp)
movq %r12, 152(%rsp)
# Add
movq (%rdi), %r9
movq 8(%rdi), %r10
addq (%rsp), %r9
movq 16(%rdi), %r11
adcq 8(%rsp), %r10
movq 24(%rdi), %rax
movq %r9, %r13
addq (%rsp), %r9
movq %r10, %r14
adcq 8(%rsp), %r10
movq %r11, %r15
adcq 16(%rsp), %r11
movq $-19, %rcx
movq %rax, %rbp
adcq 24(%rsp), %rax
movq $0x7fffffffffffffff, %rbx
movq $-19, %rcx
movq %rax, %r12
movq $0x7fffffffffffffff, %rbx
sarq $63, %rax
# Mask the modulus
andq %rax, %rcx
@@ -10123,22 +10076,47 @@ L_curve25519_avx2_bits:
sbbq %rax, %r10
sbbq %rax, %r11
sbbq %rbx, %r12
# Sub
subq (%rsp), %r13
movq $0x00, %rax
sbbq 8(%rsp), %r14
movq $-19, %rcx
sbbq 16(%rsp), %r15
movq $0x7fffffffffffffff, %rbx
sbbq 24(%rsp), %rbp
sbbq $0x00, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Add modulus (if underflow)
addq %rcx, %r13
adcq %rax, %r14
adcq %rax, %r15
adcq %rbx, %rbp
movq %r9, (%rdi)
movq %r10, 8(%rdi)
movq %r11, 16(%rdi)
movq %r12, 24(%rdi)
movq %r13, 128(%rsp)
movq %r14, 136(%rsp)
movq %r15, 144(%rsp)
movq %rbp, 152(%rsp)
# Add
movq 64(%rsp), %r9
movq 72(%rsp), %r10
addq 32(%rsp), %r9
movq 80(%rsp), %r11
adcq 40(%rsp), %r10
movq 88(%rsp), %rax
movq %r9, %r13
addq 32(%rsp), %r9
movq %r10, %r14
adcq 40(%rsp), %r10
movq %r11, %r15
adcq 48(%rsp), %r11
movq $-19, %rcx
movq %rax, %rbp
adcq 56(%rsp), %rax
movq $0x7fffffffffffffff, %rbx
movq $-19, %rcx
movq %rax, %r12
movq $0x7fffffffffffffff, %rbx
sarq $63, %rax
# Mask the modulus
andq %rax, %rcx
@@ -10148,10 +10126,31 @@ L_curve25519_avx2_bits:
sbbq %rax, %r10
sbbq %rax, %r11
sbbq %rbx, %r12
# Sub
subq 32(%rsp), %r13
movq $0x00, %rax
sbbq 40(%rsp), %r14
movq $-19, %rcx
sbbq 48(%rsp), %r15
movq $0x7fffffffffffffff, %rbx
sbbq 56(%rsp), %rbp
sbbq $0x00, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Add modulus (if underflow)
addq %rcx, %r13
adcq %rax, %r14
adcq %rax, %r15
adcq %rbx, %rbp
movq %r9, (%rsp)
movq %r10, 8(%rsp)
movq %r11, 16(%rsp)
movq %r12, 24(%rsp)
movq %r13, 96(%rsp)
movq %r14, 104(%rsp)
movq %r15, 112(%rsp)
movq %rbp, 120(%rsp)
# Multiply
# A[0] * B[0]
movq (%rdi), %rdx
@@ -10607,15 +10606,19 @@ L_curve25519_avx2_bits:
# Add
movq 32(%rsp), %r9
movq 40(%rsp), %r10
addq (%rsp), %r9
movq 48(%rsp), %r11
adcq 8(%rsp), %r10
movq 56(%rsp), %rax
movq %r9, %r13
addq (%rsp), %r9
movq %r10, %r14
adcq 8(%rsp), %r10
movq %r11, %r15
adcq 16(%rsp), %r11
movq $-19, %rcx
movq %rax, %rbp
adcq 24(%rsp), %rax
movq $0x7fffffffffffffff, %rbx
movq $-19, %rcx
movq %rax, %r12
movq $0x7fffffffffffffff, %rbx
sarq $63, %rax
# Mask the modulus
andq %rax, %rcx
@@ -10625,35 +10628,31 @@ L_curve25519_avx2_bits:
sbbq %rax, %r10
sbbq %rax, %r11
sbbq %rbx, %r12
movq %r9, 64(%rsp)
movq %r10, 72(%rsp)
movq %r11, 80(%rsp)
movq %r12, 88(%rsp)
# Sub
movq 32(%rsp), %r9
movq 40(%rsp), %r10
movq 48(%rsp), %r11
movq 56(%rsp), %r12
subq (%rsp), %r9
subq (%rsp), %r13
movq $0x00, %rax
sbbq 8(%rsp), %r10
sbbq 8(%rsp), %r14
movq $-19, %rcx
sbbq 16(%rsp), %r11
sbbq 16(%rsp), %r15
movq $0x7fffffffffffffff, %rbx
sbbq 24(%rsp), %r12
sbbq 24(%rsp), %rbp
sbbq $0x00, %rax
# Mask the modulus
andq %rax, %rcx
andq %rax, %rbx
# Add modulus (if underflow)
addq %rcx, %r9
adcq %rax, %r10
adcq %rax, %r11
adcq %rbx, %r12
movq %r9, (%rsp)
movq %r10, 8(%rsp)
movq %r11, 16(%rsp)
movq %r12, 24(%rsp)
addq %rcx, %r13
adcq %rax, %r14
adcq %rax, %r15
adcq %rbx, %rbp
movq %r9, 64(%rsp)
movq %r10, 72(%rsp)
movq %r11, 80(%rsp)
movq %r12, 88(%rsp)
movq %r13, (%rsp)
movq %r14, 8(%rsp)
movq %r15, 16(%rsp)
movq %rbp, 24(%rsp)
# Multiply
# A[0] * B[0]
movq 96(%rsp), %rdx
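One detail worth noting in the avx2 path: each multiply starts by staging an operand in %rdx (the movq 96(%rsp), %rdx above) because BMI2's mulx takes one multiplicand implicitly from %rdx, writes the 128-bit product to two explicit destinations, and leaves the flags untouched, which lets carry chains stay live across successive products. A hedged C equivalent of one such product, using the documented _mulx_u64 intrinsic:

    #include <stdint.h>
    #include <immintrin.h>

    /* 64x64 -> 128-bit multiply via mulx (compile with -mbmi2).
     * Unlike mulq, this does not clobber the carry flag. */
    static inline uint64_t mul64_sketch(uint64_t a, uint64_t b, uint64_t *hi)
    {
        unsigned long long h;
        uint64_t lo = (uint64_t)_mulx_u64(a, b, &h);
        *hi = (uint64_t)h;
        return lo;
    }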