From 8cdebfaaee44f08128bc9b3450c874e1507fb850 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Thu, 22 Aug 2024 22:11:29 -0700 Subject: [PATCH 1/4] Use P-256 Montgomery inverse from s2n-bignum where applicable When the usual architectural constraints are met, the preprocessor macro EC_P256_USE_S2N_BIGNUM is set. That in turn triggers the replacement of the existing Fermat inverse in p256-nistz.c with the markedly faster and formally verified divstep-based code from s2n-bignum. --- crypto/fipsmodule/CMakeLists.txt | 4 + crypto/fipsmodule/ec/p256-nistz.c | 17 + crypto/fipsmodule/ec/p256-nistz.h | 7 + .../s2n-bignum/arm/p256/bignum_montinv_p256.S | 1303 +++++ .../s2n-bignum/arm/p256/p256_montjscalarmul.S | 5022 +++++++++++++++++ .../arm/p256/p256_montjscalarmul_alt.S | 3362 +++++++++++ .../s2n-bignum/include/s2n-bignum_aws-lc.h | 17 +- .../x86_att/p256/bignum_montinv_p256.S | 1633 ++++++ .../x86_att/p256/p256_montjscalarmul.S | 3550 ++++++++++++ .../x86_att/p256/p256_montjscalarmul_alt.S | 4707 +++++++++++++++ 10 files changed, 19620 insertions(+), 2 deletions(-) create mode 100644 third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S create mode 100644 third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S create mode 100644 third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S create mode 100644 third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S create mode 100644 third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S create mode 100644 third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt index 220ef5d47b..cfc17f6971 100644 --- a/crypto/fipsmodule/CMakeLists.txt +++ b/crypto/fipsmodule/CMakeLists.txt @@ -196,6 +196,10 @@ if((((ARCH STREQUAL "x86_64") AND NOT MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX) OR set( S2N_BIGNUM_ASM_SOURCES + p256/bignum_montinv_p256.S + p256/p256_montjscalarmul_alt.S + p256/p256_montjscalarmul.S + p384/bignum_add_p384.S p384/bignum_sub_p384.S p384/bignum_neg_p384.S diff --git a/crypto/fipsmodule/ec/p256-nistz.c b/crypto/fipsmodule/ec/p256-nistz.c index 23003e1e42..2c8a8cb40a 100644 --- a/crypto/fipsmodule/ec/p256-nistz.c +++ b/crypto/fipsmodule/ec/p256-nistz.c @@ -32,6 +32,10 @@ #include "internal.h" #include "p256-nistz.h" +#if defined(EC_P256_USE_S2N_BIGNUM) +# include "../../../third_party/s2n-bignum/include/s2n-bignum_aws-lc.h" +#endif + #if !defined(OPENSSL_NO_ASM) && \ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ !defined(OPENSSL_SMALL) @@ -119,6 +123,16 @@ static BN_ULONG is_not_zero(BN_ULONG in) { // ecp_nistz256_mod_inverse_sqr_mont sets |r| to (|in| * 2^-256)^-2 * 2^256 mod // p. That is, |r| is the modular inverse square of |in| for input and output in // the Montgomery domain. 
+ +#if defined(EC_P256_USE_S2N_BIGNUM) +static void ecp_nistz256_mod_inverse_sqr_mont(BN_ULONG r[P256_LIMBS], + const BN_ULONG in[P256_LIMBS]) { + BN_ULONG z2[P256_LIMBS]; + ecp_nistz256_sqr_mont(z2,in); + bignum_montinv_p256(r,z2); +} + +#else static void ecp_nistz256_mod_inverse_sqr_mont(BN_ULONG r[P256_LIMBS], const BN_ULONG in[P256_LIMBS]) { // This implements the addition chain described in @@ -185,6 +199,9 @@ static void ecp_nistz256_mod_inverse_sqr_mont(BN_ULONG r[P256_LIMBS], ecp_nistz256_sqr_mont(r, ret); // 2^256 - 2^224 + 2^192 + 2^96 - 2^2 } +#endif + + // r = p * p_scalar static void ecp_nistz256_windowed_mul(const EC_GROUP *group, P256_POINT *r, const EC_JACOBIAN *p, diff --git a/crypto/fipsmodule/ec/p256-nistz.h b/crypto/fipsmodule/ec/p256-nistz.h index c61018bd21..6adb3ab258 100644 --- a/crypto/fipsmodule/ec/p256-nistz.h +++ b/crypto/fipsmodule/ec/p256-nistz.h @@ -29,6 +29,13 @@ extern "C" { #endif +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE)) && \ + ((defined(OPENSSL_X86_64) && \ + !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX)) || \ + defined(OPENSSL_AARCH64)) +#define EC_P256_USE_S2N_BIGNUM +#endif #if !defined(OPENSSL_NO_ASM) && \ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ diff --git a/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S b/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S new file mode 100644 index 0000000000..059f77e9af --- /dev/null +++ b/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S @@ -0,0 +1,1303 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_montinv_p256(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_256, i.e. is not divisible +// by it, returns z < p_256 such that x * z == 2^512 (mod p_256). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^256 * X and z == 2^256 * Z +// (both mod p_256) then X * Z == 1 (mod p_256). That is, this function +// gives the analog of the modular inverse bignum_inv_p256 but with both +// input and output in the Montgomery domain. Note that x does not need +// to be reduced modulo p_256, but the output always is. If the input +// is divisible (i.e. is 0 or p_256), then there can be no solution to +// the congruence x * z == 2^512 (mod p_256), and z = 0 is returned. 
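For intuition on the new EC_P256_USE_S2N_BIGNUM path above: composing ecp_nistz256_sqr_mont with bignum_montinv_p256 reproduces the contract documented for ecp_nistz256_mod_inverse_sqr_mont, i.e. r = (in * 2^-256)^-2 * 2^256 mod p. A minimal sketch in Python that models the two primitives with plain integers (the names montsqr/montinv are mine, not the real entry points) and checks the algebra; it illustrates the identity rather than testing the patched code:

    # Python >= 3.8 (modular inverse via pow(x, -1, p))
    P = 2**256 - 2**224 + 2**192 + 2**96 - 1   # p_256
    R = 2**256                                  # Montgomery radix

    def montsqr(a):
        # models ecp_nistz256_sqr_mont: a^2 * R^-1 mod p
        return a * a * pow(R, -1, P) % P

    def montinv(t):
        # models the bignum_montinv_p256 contract: t * result == 2^512 (mod p)
        return pow(t, -1, P) * R * R % P

    x = 0x123456789abcdef0fedcba9876543210deadbeefcafebabe0123456789abcdef
    lhs = montinv(montsqr(x))
    rhs = pow(x * pow(R, -1, P) % P, -2, P) * R % P   # (x * 2^-256)^-2 * 2^256
    assert lhs == rhs

Both sides reduce to x^-2 * 2^768 mod p_256, which is why a single Montgomery squaring followed by the divstep-based Montgomery inverse can stand in for the whole Fermat addition chain in the #else branch.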
+// +// Standard ARM ABI: X0 = z, X1 = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p256) + + .text + .balign 4 + +// Size in bytes of a 64-bit word + +#define N 8 + +// Used for the return pointer + +#define res x20 + +// Loop counter and d = 2 * delta value for divstep + +#define i x21 +#define d x22 + +// Registers used for matrix element magnitudes and signs + +#define m00 x10 +#define m01 x11 +#define m10 x12 +#define m11 x13 +#define s00 x14 +#define s01 x15 +#define s10 x16 +#define s11 x17 + +// Initial carries for combinations + +#define car0 x9 +#define car1 x19 + +// Input and output, plain registers treated according to pattern + +#define reg0 x0, #0 +#define reg1 x1, #0 +#define reg2 x2, #0 +#define reg3 x3, #0 +#define reg4 x4, #0 + +#define x x1, #0 +#define z x0, #0 + +// Pointer-offset pairs for temporaries on stack + +#define f sp, #0 +#define g sp, #(6*N) +#define u sp, #(12*N) +#define v sp, #(16*N) + +// Total size to reserve on the stack + +#define NSPACE #(20*N) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro. Takes input in +// [d4;d3;d2;d1;d0] and returns result in [d4;d3;d2;d1], adding to +// the existing [d4;d3;d2;d1], and re-using d0 as a temporary internally +// as well as t0, t1, t2. This is almost-Montgomery, i.e. the result fits +// in 4 digits but is not necessarily strictly reduced mod p_256. +// --------------------------------------------------------------------------- + +#define amontred(d4,d3,d2,d1,d0, t2,t1,t0) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_256. */ \ + mov t0, #0xe000000000000000; \ + adds d0, d0, t0; \ + sbcs d1, d1, xzr; \ + mov t1, #0x000000001fffffff; \ + adcs d2, d2, t1; \ + mov t2, #0x2000000000000000; \ + adcs d3, d3, t2; \ + mov t0, #0x1fffffffe0000000; \ + adc d4, d4, t0; \ +/* Let w = d0, the original word we use as offset; d0 gets recycled */ \ +/* First let [t2;t1] = 2^32 * w */ \ +/* then let [d0;t0] = (2^64 - 2^32 + 1) * w (overwrite old d0) */ \ + lsl t1, d0, #32; \ + subs t0, d0, t1; \ + lsr t2, d0, #32; \ + sbc d0, d0, t2; \ +/* Hence basic [d4;d3;d2;d1] += (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + adds d1, d1, t1; \ + adcs d2, d2, t2; \ + adcs d3, d3, t0; \ + adcs d4, d4, d0; \ +/* Now capture top carry and subtract p_256 if set (almost-Montgomery) */ \ + mov t0, #0xffffffffffffffff; \ + mov t1, #0x00000000ffffffff; \ + mov t2, #0xffffffff00000001; \ + csel t0, t0, xzr, cs; \ + csel t1, t1, xzr, cs; \ + csel t2, t2, xzr, cs; \ + subs d1, d1, t0; \ + sbcs d2, d2, t1; \ + sbcs d3, d3, xzr; \ + sbc d4, d4, t2 + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix in +// registers as follows +// +// [ m00 m01] +// [ m10 m11] + +#define divstep59() \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, #0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, 
xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x8, x4, #0x100, lsl #12; \ + sbfx x8, x8, #21, #21; \ + mov x11, #0x100000; \ + add x11, x11, x11, lsl #21; \ + add x9, x4, x11; \ + asr x9, x9, #42; \ + add x10, x5, #0x100, lsl #12; \ + sbfx x10, x10, #21, #21; \ + add x11, x5, x11; \ + asr x11, x11, #42; \ + mul x6, x8, x2; \ + mul x7, x9, x3; \ + mul x2, x10, x2; \ + mul x3, x11, x3; \ + add x4, x6, x7; \ + add x5, x2, x3; \ + asr x2, x4, #20; \ + asr x3, x5, #20; \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, #0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, 
ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x12, x4, #0x100, lsl #12; \ + sbfx x12, x12, #21, #21; \ + mov x15, #0x100000; \ + add x15, x15, x15, lsl #21; \ + add x13, x4, x15; \ + asr x13, x13, #42; \ + add x14, x5, #0x100, lsl #12; \ + sbfx x14, x14, #21, #21; \ + add x15, x5, x15; \ + asr x15, x15, #42; \ + mul x6, x12, x2; \ + mul x7, x13, x3; \ + mul x2, x14, x2; \ + mul x3, x15, x3; \ + add x4, x6, x7; \ + add x5, x2, x3; \ + asr x2, x4, #20; \ + asr x3, x5, #20; \ + and x4, x2, #0xfffff; \ + orr x4, x4, #0xfffffe0000000000; \ + and x5, x3, #0xfffff; \ + orr x5, x5, #0xc000000000000000; \ + tst x5, #0x1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, 
#1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + mul x2, x12, x8; \ + mul x3, x12, x9; \ + mul x6, x14, x8; \ + mul x7, x14, x9; \ + madd x8, x13, x10, x2; \ + madd x9, x13, x11, x3; \ + madd x16, x15, x10, x6; \ + madd x17, x15, x11, x7; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + tst x5, #0x2; \ + asr x5, x5, #1; \ + csel x6, x4, xzr, ne; \ + ccmp x1, xzr, #0x8, ne; \ + cneg x1, x1, ge; \ + cneg x6, x6, ge; \ + csel x4, x5, x4, ge; \ + add x5, x5, x6; \ + add x1, x1, #0x2; \ + asr x5, x5, #1; \ + add x12, x4, #0x100, lsl #12; \ + sbfx x12, x12, #22, #21; \ + mov x15, #0x100000; \ + add x15, x15, x15, lsl #21; \ + add x13, x4, x15; \ + asr x13, x13, #43; \ + add x14, x5, #0x100, lsl #12; \ + sbfx x14, x14, #22, #21; \ + add x15, x5, x15; \ + asr x15, x15, #43; \ + mneg x2, x12, x8; \ + mneg x3, x12, x9; \ + mneg x4, x14, x8; \ + mneg x5, x14, x9; \ + msub m00, x13, x16, x2; \ + msub m01, x13, x17, x3; \ + msub m10, x15, x16, x4; \ + msub m11, x15, x17, x5 + +S2N_BN_SYMBOL(bignum_montinv_p256): + +// Save registers and make room for temporaries + + stp x19, x20, [sp, -16]! + stp x21, x22, [sp, -16]! + stp x23, x24, [sp, -16]! 
+ sub sp, sp, NSPACE + +// Save the return pointer for the end so we can overwrite x0 later + + mov res, x0 + +// Copy the prime and input into the main f and g variables respectively. +// Make sure x is reduced so that g <= f as assumed in the bound proof. + + mov x10, #0xffffffffffffffff + mov x11, #0x00000000ffffffff + mov x13, #0xffffffff00000001 + stp x10, x11, [f] + stp xzr, x13, [f+2*N] + str xzr, [f+4*N] + + ldp x2, x3, [x1] + subs x10, x2, x10 + sbcs x11, x3, x11 + ldp x4, x5, [x1, #(2*N)] + sbcs x12, x4, xzr + sbcs x13, x5, x13 + + csel x2, x2, x10, cc + csel x3, x3, x11, cc + csel x4, x4, x12, cc + csel x5, x5, x13, cc + + stp x2, x3, [g] + stp x4, x5, [g+2*N] + str xzr, [g+4*N] + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-562} * [u,v] (mod p_256) +// starting with [p_256,x] == x * 2^{5*0-562} * [0,2^562] (mod p_256) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 10th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{50-562} * u == 1, i.e. +// x * u == 2^512 as required. + + stp xzr, xzr, [u] + stp xzr, xzr, [u+2*N] + +// The starting constant 2^562 mod p_256 is +// 0x000bffffffebffff:fffbffffffefffff:ffe8000000000000:000c000000140000 +// where colons separate 64-bit subwords, least significant at the right. +// Only word number 1, value 0xffe8000000000000, is a single ARM move. + + mov x10, #0x0000000000140000 + orr x10, x10, #0x000c000000000000 + + mov x11, #0xffe8000000000000 + + movbig(x13, #0x000b, #0xffff, #0xffef, #0xffff) + orr x12, x13, #0xfff0000000000000 + and x13, x13, #0xfffffffffffbffff + + stp x10, x11, [v] + stp x12, x13, [v+2*N] + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + mov i, #10 + mov d, #1 + b midloop + +loop: + +// Separate the matrix elements into sign-magnitude pairs + + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in stable registers for the [u,v] part and do [f,g] first. + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + + and x0, m10, s10 + and x1, m11, s11 + add car1, x0, x1 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
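The digit-by-digit accumulation below is, in effect, one application of the signed 2x2 matrix returned by divstep59() to the long values f and g, followed by the arithmetic shift right by 59 mentioned above (both linear combinations are divisible by 2^59 by construction of the divstep matrix). A rough sketch of the intended arithmetic, with Python integers standing in for the 5-word signed values:

    # Rough model of the f/g update; the assembly interleaves this per 64-bit
    # digit with a two-word carry instead of using wide integers.
    def update_fg(f, g, m00, m01, m10, m11):
        new_f = (m00 * f + m01 * g) >> 59   # arithmetic shift; exact division here
        new_g = (m10 * f + m11 * g) >> 59
        return new_f, new_g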
+// +// Digit 0 of [f,g] + + ldr x7, [f] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [g] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + adc x3, x3, x1 + +// Digit 1 of [f,g] + + ldr x7, [f+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [g+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + adc x6, x6, x1 + extr x4, x2, x4, #59 + str x4, [f] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + adc x4, x4, x1 + extr x5, x3, x5, #59 + str x5, [g] + +// Digit 2 of [f,g] + + ldr x7, [f+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [g+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + adc x5, x5, x1 + extr x2, x6, x2, #59 + str x2, [f+N] + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + adc x2, x2, x1 + extr x3, x4, x3, #59 + str x3, [g+N] + +// Digits 3 and 4 of [f,g] + + ldr x7, [f+3*N] + eor x1, x7, s00 + ldr x23, [f+4*N] + eor x3, x23, s00 + and x3, x3, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [g+3*N] + eor x1, x8, s01 + ldr x24, [g+4*N] + eor x0, x24, s01 + and x0, x0, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + extr x6, x5, x6, #59 + str x6, [f+2*N] + extr x5, x3, x5, #59 + str x5, [f+3*N] + asr x3, x3, #59 + str x3, [f+4*N] + + eor x1, x7, s10 + eor x5, x23, s10 + and x5, x5, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + eor x0, x24, s11 + and x0, x0, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + extr x4, x2, x4, #59 + str x4, [g+2*N] + extr x2, x5, x2, #59 + str x2, [g+3*N] + asr x5, x5, #59 + str x5, [g+4*N] + +// Now the computation of the updated u and v values and their +// Montgomery reductions. A very similar accumulation except that +// the top words of u and v are unsigned and we don't shift. 
+// +// Digit 0 of [u,v] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x5, car1, x0 + adc x3, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x5, x5, x0 + str x5, [v] + adc x3, x3, x1 + +// Digit 1 of [u,v] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x3, x3, x0 + adc x4, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x3, x3, x0 + str x3, [v+N] + adc x4, x4, x1 + +// Digit 2 of [u,v] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + + eor x1, x7, s10 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x4, x4, x0 + adc x2, xzr, x1 + eor x1, x8, s11 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x4, x4, x0 + str x4, [v+2*N] + adc x2, x2, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u + + ldp x0, x1, [u] + ldr x6, [u+2*N] + amontred(x3,x5,x6,x1,x0, x10,x11,x14) + stp x1, x6, [u] + stp x5, x3, [u+16] + +// Digits 3 and 4 of v (top is unsigned) + + eor x1, x7, s10 + and x5, s10, m10 + neg x5, x5 + mul x0, x1, m10 + umulh x1, x1, m10 + adds x2, x2, x0 + adc x5, x5, x1 + eor x1, x8, s11 + and x0, s11, m11 + sub x5, x5, x0 + mul x0, x1, m11 + umulh x1, x1, m11 + adds x2, x2, x0 + adc x5, x5, x1 + +// Montgomery reduction of v + + ldp x0, x1, [v] + ldr x3, [v+2*N] + amontred(x5,x2,x3,x1,x0, x10,x11,x14) + stp x1, x3, [v] + stp x2, x5, [v+16] + +midloop: + + mov x1, d + ldr x2, [f] + ldr x3, [g] + divstep59() + mov d, x1 + +// Next iteration + + subs i, i, #1 + bne loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + ldr x0, [f] + ldr x1, [g] + mul x0, x0, m00 + madd x1, x1, m01, x0 + asr x0, x1, #63 + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-512} [u,v] (mod p_256) +// we want to flip the sign of u according to that of f. 
+ + cmp m00, xzr + csetm s00, mi + cneg m00, m00, mi + eor s00, s00, x0 + + cmp m01, xzr + csetm s01, mi + cneg m01, m01, mi + eor s01, s01, x0 + + cmp m10, xzr + csetm s10, mi + cneg m10, m10, mi + eor s10, s10, x0 + + cmp m11, xzr + csetm s11, mi + cneg m11, m11, mi + eor s11, s11, x0 + +// Adjust the initial value to allow for complement instead of negation + + and x0, m00, s00 + and x1, m01, s01 + add car0, x0, x1 + +// Digit 0 of [u] + + ldr x7, [u] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x4, car0, x0 + adc x2, xzr, x1 + ldr x8, [v] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x4, x4, x0 + str x4, [u] + adc x2, x2, x1 + +// Digit 1 of [u] + + ldr x7, [u+N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x2, x2, x0 + adc x6, xzr, x1 + ldr x8, [v+N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x2, x2, x0 + str x2, [u+N] + adc x6, x6, x1 + +// Digit 2 of [u] + + ldr x7, [u+2*N] + eor x1, x7, s00 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x6, x6, x0 + adc x5, xzr, x1 + ldr x8, [v+2*N] + eor x1, x8, s01 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x6, x6, x0 + str x6, [u+2*N] + adc x5, x5, x1 + +// Digits 3 and 4 of u (top is unsigned) + + ldr x7, [u+3*N] + eor x1, x7, s00 + and x3, s00, m00 + neg x3, x3 + mul x0, x1, m00 + umulh x1, x1, m00 + adds x5, x5, x0 + adc x3, x3, x1 + ldr x8, [v+3*N] + eor x1, x8, s01 + and x0, s01, m01 + sub x3, x3, x0 + mul x0, x1, m01 + umulh x1, x1, m01 + adds x5, x5, x0 + adc x3, x3, x1 + +// Montgomery reduction of u. This needs to be strict not "almost" +// so it is followed by an optional subtraction of p_256 + + ldp x0, x1, [u] + ldr x2, [u+2*N] + amontred(x3,x5,x2,x1,x0, x10,x11,x14) + + mov x10, #0xffffffffffffffff + subs x10, x1, x10 + mov x11, #0x00000000ffffffff + sbcs x11, x2, x11 + mov x13, #0xffffffff00000001 + sbcs x12, x5, xzr + sbcs x13, x3, x13 + + csel x10, x1, x10, cc + csel x11, x2, x11, cc + csel x12, x5, x12, cc + csel x13, x3, x13, cc + +// Store it back to the final output + + stp x10, x11, [res] + stp x12, x13, [res, #16] + +// Restore stack and registers + + add sp, sp, NSPACE + ldp x23, x24, [sp], 16 + ldp x21, x22, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S new file mode 100644 index 0000000000..23bc20971e --- /dev/null +++ b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S @@ -0,0 +1,5022 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for P-256 +// Input scalar[4], point[12]; output res[12] +// +// extern void p256_montjscalarmul +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version p256_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_256 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. 
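The coordinate convention described in the header above can be spelled out concretely. A small sketch, assuming Python integers and helper names of my own choosing (from_mont and jacobian_mont_to_affine are not functions in this patch), that recovers the affine point from a Montgomery-Jacobian triple:

    P = 2**256 - 2**224 + 2**192 + 2**96 - 1   # p_256
    R = 2**256

    def from_mont(a):                 # strip the Montgomery factor 2^256
        return a * pow(R, -1, P) % P

    def jacobian_mont_to_affine(xm, ym, zm):
        if zm == 0:
            return None               # point at infinity (group identity)
        x, y, z = from_mont(xm), from_mont(ym), from_mont(zm)
        zinv = pow(z, -1, P)
        return (x * zinv**2 % P, y * zinv**3 % P)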
+// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_256) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Safe copies of inputs (res lasts the whole code, point not so long) +// and additional values in variables, with some aliasing + +#define res x19 +#define sgn x20 +#define j x20 +#define point x21 + +// Intermediate variables on the stack. + +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +#define NSPACE #(31*NUMSIZE) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(p256_montjscalarmul): + + stp x19, x20, [sp, #-16]! + stp x21, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + mov res, x0 + mov point, x2 + +// Load the digits of group order n_256 = [x12;x13;x14;x15] + + movbig(x12, #0xf3b9, #0xcac2, #0xfc63, #0x2551) + movbig(x13, #0xbce6, #0xfaad, #0xa717, #0x9e84) + mov x14, #0xffffffffffffffff + mov x15, #0xffffffff00000000 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + + subs x6, x2, x12 + sbcs x7, x3, x13 + sbcs x8, x4, x14 + sbcs x9, x5, x15 + + csel x2, x2, x6, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + +// Now if the top bit of the reduced scalar is set, negate it mod n_256, +// i.e. do n |-> n_256 - n. Remember the sign as "sgn" so we can +// correspondingly negate the point below. + + subs x6, x12, x2 + sbcs x7, x13, x3 + sbcs x8, x14, x4 + sbc x9, x15, x5 + + tst x5, #0x8000000000000000 + csel x2, x2, x6, eq + csel x3, x3, x7, eq + csel x4, x4, x8, eq + csel x5, x5, x9, eq + cset sgn, ne + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + mov x6, 0x8888888888888888 + adds x2, x2, x6 + adcs x3, x3, x6 + bic x7, x6, #0xF000000000000000 + adcs x4, x4, x6 + adc x5, x5, x7 + + stp x2, x3, [scalarb] + stp x4, x5, [scalarb+16] + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_256 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. 
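The scalar preparation just above (reduce mod n_256, negate when the top bit is set, then add the constant 0x0888...888) recodes the scalar as one leading window in [0,8] plus 63 signed 4-bit windows in [-8,7]; each loop iteration below can then select |digit| * P from the table and negate it according to the digit's sign. A minimal sketch of that recoding identity, assuming the reduced scalar already has its top bit clear (Python, names are illustrative only):

    RECODE = int("0" + "8" * 63, 16)   # the 0x0888...888 constant added above

    def recode(k):                     # k < 2^255, i.e. top bit already cleared
        kp = k + RECODE
        top = kp >> 252                                            # in [0, 8], used unrecoded
        digits = [((kp >> (4 * i)) & 15) - 8 for i in range(63)]   # each in [-8, 7]
        return top, digits

    k = 0x53a1f0c2b4d6e8fa1234567890abcdef0fedcba9876543210123456789abcde
    top, digits = recode(k)
    assert k == (top << 252) + sum(d << (4 * i) for i, d in enumerate(digits))
    assert 0 <= top <= 8 and all(-8 <= d <= 7 for d in digits)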
+ + ldp x0, x1, [point] + stp x0, x1, [tab] + ldp x2, x3, [point, #16] + stp x2, x3, [tab+16] + + ldp x4, x5, [point, #32] + ldp x6, x7, [point, #48] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + orr x8, x4, x5 + orr x9, x6, x7 + orr x8, x8, x9 + cmp x8, xzr + ccmp sgn, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tab+32] + stp x6, x7, [tab+48] + + ldp x0, x1, [point, #64] + stp x0, x1, [tab+64] + ldp x2, x3, [point, #80] + stp x2, x3, [tab+80] + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + add x0, tab+96*1 + add x1, tab + bl local_p256_montjdouble + + add x0, tab+96*2 + add x1, tab+96*1 + add x2, tab + bl local_p256_montjadd + + add x0, tab+96*3 + add x1, tab+96*1 + bl local_p256_montjdouble + + add x0, tab+96*4 + add x1, tab+96*3 + add x2, tab + bl local_p256_montjadd + + add x0, tab+96*5 + add x1, tab+96*2 + bl local_p256_montjdouble + + add x0, tab+96*6 + add x1, tab+96*5 + add x2, tab + bl local_p256_montjadd + + add x0, tab+96*7 + add x1, tab+96*3 + bl local_p256_montjdouble + +// Initialize the accumulator as a table entry for top 4 bits (unrecoded) + + ldr x14, [scalarb+24] + lsr x14, x14, #60 + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + + mov j, #252 + +// Main loop over size-4 bitfields: double 4 times then add signed digit + +loop: + sub j, j, #4 + + add x0, acc + add x1, acc + bl local_p256_montjdouble + + add x0, acc + add x1, acc + bl local_p256_montjdouble + + add x0, acc + add x1, acc + bl local_p256_montjdouble + + add x0, acc + add x1, acc + bl local_p256_montjdouble + + lsr x2, j, #6 + ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly + lsr x14, x14, j + and x14, x14, #15 + + subs x14, x14, #8 + cset x16, lo // x16 = sign of digit (1 = negative) + cneg x14, x14, lo // x14 = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + +// Store it 
to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_256 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + orr x12, x4, x5 + orr x13, x6, x7 + orr x12, x12, x13 + cmp x12, xzr + ccmp x16, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + + add x0, acc + add x1, acc + add x2, tabent + bl local_p256_montjadd + + cbnz j, loop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. + + ldp x0, x1, [acc] + stp x0, x1, [res] + ldp x0, x1, [acc+16] + stp x0, x1, [res, #16] + ldp x0, x1, [acc+32] + stp x0, x1, [res, #32] + ldp x0, x1, [acc+48] + stp x0, x1, [res, #48] + ldp x0, x1, [acc+64] + stp x0, x1, [res, #64] + ldp x0, x1, [acc+80] + stp x0, x1, [res, #80] + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +local_p256_montjadd: + stp x19, x20, [sp, #-16]! + stp x21, x22, [sp, #-16]! + stp x23, x24, [sp, #-16]! + stp x25, x26, [sp, #-16]! + stp x27, x30, [sp, #-16]! + sub sp, sp, #0xe0 + mov x21, x0 + mov x22, x1 + mov x23, x2 + mov x0, sp + ldr q19, [x22, #64] + ldp x9, x13, [x22, #64] + ldr q23, [x22, #80] + ldr q0, [x22, #64] + ldp x1, x10, [x22, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, 
#0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x19, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x20, x5, x2, cs + stp x14, x12, [x0, #16] + stp x19, x20, [x0] + ldr q19, [x23, #64] + ldp x9, x13, [x23, #64] + ldr q23, [x23, #80] + ldr q0, [x23, #64] + ldp x1, x10, [x23, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, 
cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [sp, #176] + stp x16, x2, [sp, #160] + ldr q20, [x22, #32] + ldp x7, x17, [x23, #64] + ldr q0, [x23, #64] + ldp x6, x10, [x22, #32] + ldp x11, x15, [x23, #80] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x23, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs 
x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #192] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #208] + ldr q20, [x23, #32] + ldp x7, x17, [x22, #64] + ldr q0, [x22, #64] + ldp x6, x10, [x23, #32] + ldp x11, x15, [x22, #80] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x23, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x22, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, 
x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x24, x11, x13 + and x1, x1, x13 + adcs x25, x4, x1 + and x1, x12, x13 + stp x24, x25, [sp, #32] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #48] + mov x1, sp + ldr q20, [x23] + ldr q0, [x1] + ldp x6, x10, [x23] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x19, x20 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x20, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x19 + ldr q20, [x23, #16] + sbcs x5, x15, x20 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x19, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs 
x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #64] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #80] + ldr q20, [x22] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x22] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #16] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #16] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + 
umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + mov x1, sp + ldr q20, [sp, #32] + ldp x7, x17, [x1] + ldr q0, [x1] + ldp x11, x15, [x1, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x25 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x24 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x25, x24 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x1, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x24, x7 + sbcs x9, x25, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + 
adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x24, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x25, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x24 + eor x1, x10, x5 + adcs x16, x2, x25 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #48] + ldr q20, [sp, #192] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [sp, #192] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #208] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + 
ldp x7, x2, [sp, #208] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x9, x11, x13 + and x1, x1, x13 + adcs x10, x4, x1 + and x1, x12, x13 + stp x9, x10, [sp, #192] + adcs x11, x7, xzr + adc x12, x17, x1 + stp x11, x12, [sp, #208] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x13, x5, x3 + and x4, x3, #0xffffffff + adcs x24, x6, x4 + adcs x25, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x26, x8, x4 + stp x13, x24, [sp, #160] + stp x25, x26, [sp, #176] + subs x5, x19, x9 + sbcs x6, x20, x10 + ldp x7, x8, [sp, #48] + sbcs x7, x7, 
x11 + sbcs x8, x8, x12 + csetm x3, cc + adds x19, x5, x3 + and x4, x3, #0xffffffff + adcs x20, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x19, x20, [sp, #32] + stp x7, x8, [sp, #48] + ldr q19, [sp, #160] + ldr q23, [sp, #176] + ldr q0, [sp, #160] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x13, x24 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x13, x24 + umulh x15, x13, x25 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x13, x24 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x26, x25 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x24, x26 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x25, x26 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x25, x26 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x26, x26 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x26, x26 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x25, x25 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x25, x25 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x24, x3, x16, cs + csel x25, x8, x14, cs + csel x26, x11, x12, cs + csel x27, x5, x2, cs + stp x25, x26, [sp, #112] + stp x24, x27, [sp, #96] + mov x0, sp + ldr q19, [sp, #32] + ldr q23, [sp, #48] + ldr q0, [sp, #32] + ldp x1, x10, [sp, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x19, x20 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x19, x20 + umulh x15, x19, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x19, x20 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, 
v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x20, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [x0, #16] + stp x16, x2, [x0] + ldr q20, [sp, #128] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #128] + rev64 v16.4s, v20.4s + subs x4, x24, x27 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x27, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x25, x24 + ldr q20, [sp, #144] + sbcs x5, x26, x27 + ngc x17, xzr + subs x8, x25, x26 + uaddlp v27.2d, v16.4s + umulh x4, x24, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #144] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, 
x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x25, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x26, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x25 + eor x1, x10, x5 + adcs x16, x2, x26 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + stp x19, x20, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + ldr q20, [sp, #64] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #64] + ldp x11, x15, [sp, #112] + rev64 v16.4s, v20.4s + subs x4, x24, x27 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x27, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x24 + ldr q20, [sp, #80] + sbcs x5, x15, x27 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x24, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, 
x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x9, x11, x13 + and x1, x1, x13 + adcs x10, x4, x1 + and x1, x12, x13 + stp x9, x10, [sp, #64] + adcs x11, x7, xzr + adc x12, x17, x1 + stp x11, x12, [sp, #80] + mov x0, sp + mov x1, sp + ldp x5, x6, [x1] + subs x5, x5, x19 + sbcs x6, x6, x20 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x24, x5, x3 + and x4, x3, #0xffffffff + adcs x25, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x7, x8, [x0, #16] + subs x5, x9, 
x19 + sbcs x6, x10, x20 + ldp x4, x3, [sp, #144] + sbcs x7, x11, x4 + sbcs x8, x12, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldr q20, [x22, #64] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x22, #64] + ldp x11, x15, [sp, #176] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [x22, #80] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x22, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs 
x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #160] + adcs x19, x7, xzr + adc x20, x17, x1 + stp x19, x20, [sp, #176] + mov x0, sp + mov x1, sp + ldp x4, x3, [sp, #64] + subs x5, x24, x4 + sbcs x6, x25, x3 + ldp x7, x8, [x1, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x9, x5, x3 + and x4, x3, #0xffffffff + adcs x10, x6, x4 + adcs x11, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x3, x8, x4 + stp x9, x10, [x0] + stp x11, x3, [x0, #16] + ldp x5, x6, [sp, #128] + subs x5, x5, x9 + sbcs x6, x6, x10 + ldp x7, x8, [sp, #144] + sbcs x7, x7, x11 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldr q20, [sp, #192] + ldp x7, x17, [sp, #96] + ldr q0, [sp, #96] + ldp x6, x10, [sp, #192] + ldp x11, x15, [sp, #112] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #208] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #112] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #208] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, 
x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x11, x11, x13 + and x1, x1, x13 + adcs x4, x4, x1 + and x1, x12, x13 + stp x11, x4, [sp, #96] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #112] + ldr q20, [x23, #64] + ldp x7, x17, [sp, #160] + ldr q0, [sp, #160] + ldp x6, x10, [x23, #64] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x19, x7 + ldr q20, [x23, #80] + sbcs x5, x20, x17 + ngc x17, xzr + subs x8, x19, x20 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [x23, #80] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x19, x3, 
x13 + adcs x20, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x24, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x25, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x19 + adcs x15, x16, x20 + eor x5, x17, x4 + adcs x9, x1, x24 + eor x1, x10, x5 + adcs x16, x2, x25 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x19, x11, x13 + and x1, x1, x13 + adcs x20, x4, x1 + and x1, x12, x13 + stp x19, x20, [sp, #160] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #176] + ldr q20, [sp, #128] + ldp x7, x17, [sp, #32] + ldr q0, [sp, #32] + ldp x6, x10, [sp, #128] + ldp x11, x15, [sp, #48] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #144] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #48] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #144] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, 
x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x24, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x24 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x5, x11, x13 + and x1, x1, x13 + adcs x6, x4, x1 + and x1, x12, x13 + adcs x7, x7, xzr + adc x9, x17, x1 + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x9, x3 + csetm x3, cc + adds x15, x5, x3 + and x4, x3, #0xffffffff + adcs x24, x6, x4 + adcs x25, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x26, x8, x4 + stp x15, x24, [sp, #128] + stp x25, x26, [sp, #144] + ldp x0, x1, [x22, #64] + ldp x2, x3, [x22, #80] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + ldp x4, x5, [x23, #64] + ldp x6, x7, [x23, #80] + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + cmp x13, x12 + 
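+// The zero tests above set x12 = (z1 != 0) and x13 = (z2 != 0); the csel
+// chain that follows then returns the other input point unchanged whenever
+// one z-coordinate is zero (the point at infinity), instead of the computed
+// Jacobian sum.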
csel x8, x0, x19, cc + csel x9, x1, x20, cc + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [sp, #176] + csel x10, x2, x10, cc + csel x11, x3, x11, cc + csel x10, x6, x10, hi + csel x11, x7, x11, hi + ldp x12, x13, [x22] + ldp x0, x1, [sp] + csel x0, x12, x0, cc + csel x1, x13, x1, cc + ldp x12, x13, [x23] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + ldp x12, x13, [x22, #16] + ldp x2, x3, [sp, #16] + csel x2, x12, x2, cc + csel x3, x13, x3, cc + ldp x12, x13, [x23, #16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + ldp x12, x13, [x22, #32] + csel x4, x12, x15, cc + csel x5, x13, x24, cc + ldp x12, x13, [x23, #32] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + ldp x12, x13, [x22, #48] + csel x6, x12, x25, cc + csel x7, x13, x26, cc + ldp x12, x13, [x23, #48] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + stp x0, x1, [x21] + stp x2, x3, [x21, #16] + stp x4, x5, [x21, #32] + stp x6, x7, [x21, #48] + stp x8, x9, [x21, #64] + stp x10, x11, [x21, #80] + add sp, sp, #0xe0 + ldp x27, x30, [sp], #16 + ldp x25, x26, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x19, x20, [sp], #16 + ret + +local_p256_montjdouble: + sub sp, sp, #0x110 + stp x19, x20, [sp, #192] + stp x21, x22, [sp, #208] + stp x23, x24, [sp, #224] + stp x25, x26, [sp, #240] + stp x27, xzr, [sp, #256] + mov x19, x0 + mov x20, x1 + mov x0, sp + ldr q19, [x20, #64] + ldp x9, x13, [x20, #64] + ldr q23, [x20, #80] + ldr q0, [x20, #64] + ldp x1, x10, [x20, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, 
#32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x21, x3, x16, cs + csel x22, x8, x14, cs + csel x23, x11, x12, cs + csel x24, x5, x2, cs + stp x22, x23, [x0, #16] + stp x21, x24, [x0] + ldr q19, [x20, #32] + ldp x9, x13, [x20, #32] + ldr q23, [x20, #48] + ldr q0, [x20, #32] + ldp x1, x10, [x20, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x16, x3, x16, cs + csel x14, x8, x14, cs + csel x12, x11, x12, cs + csel x2, x5, x2, cs + stp x14, x12, [sp, #48] + stp x16, x2, [sp, #32] + ldp x5, x6, [x20] + subs x5, x5, x21 + sbcs x6, x6, x24 + ldp x7, x8, [x20, #16] + sbcs x7, x7, x22 + sbcs 
x8, x8, x23 + csetm x3, cc + adds x10, x5, x3 + and x4, x3, #0xffffffff + adcs x25, x6, x4 + adcs x26, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x27, x8, x4 + stp x10, x25, [sp, #96] + stp x26, x27, [sp, #112] + ldp x5, x6, [x20] + adds x5, x5, x21 + adcs x6, x6, x24 + ldp x7, x8, [x20, #16] + adcs x7, x7, x22 + adcs x8, x8, x23 + csetm x3, cs + subs x9, x5, x3 + and x1, x3, #0xffffffff + sbcs x5, x6, x1 + sbcs x7, x7, xzr + and x2, x3, #0xffffffff00000001 + sbc x8, x8, x2 + stp x9, x5, [sp, #64] + stp x7, x8, [sp, #80] + ldr q20, [sp, #96] + ldr q0, [sp, #64] + rev64 v16.4s, v20.4s + subs x4, x9, x5 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x5, x25 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x7, x9 + ldr q20, [sp, #112] + sbcs x5, x8, x5 + ngc x17, xzr + subs x8, x7, x8 + uaddlp v27.2d, v16.4s + umulh x4, x9, x10 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x25, x10 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #80] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x10, x26 + sbcs x9, x25, x27 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x27, x26 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x21, x3, x13 + adcs x22, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x23, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x24, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x21 + adcs x15, x16, x22 + eor x5, x17, x4 + adcs x9, x1, x23 + eor x1, x10, x5 + adcs x16, x2, x24 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + 
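+// Below, the shift-by-32 add/subtract arithmetic folds the accumulated
+// double-length product back under the modulus, relying on the special form
+// p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1.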
lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x21, x11, x13 + and x1, x1, x13 + adcs x22, x4, x1 + and x1, x12, x13 + stp x21, x22, [sp, #96] + adcs x23, x7, xzr + adc x24, x17, x1 + stp x23, x24, [sp, #112] + ldp x4, x5, [x20, #32] + ldp x8, x9, [x20, #64] + adds x4, x4, x8 + adcs x5, x5, x9 + ldp x6, x7, [x20, #48] + ldp x10, x11, [x20, #80] + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x3, xzr, xzr + adds x8, x4, #0x1 + mov x9, #0xffffffff + sbcs x9, x5, x9 + sbcs x10, x6, xzr + mov x11, #0xffffffff00000001 + sbcs x11, x7, x11 + sbcs x3, x3, xzr + csel x4, x4, x8, cc + csel x5, x5, x9, cc + csel x6, x6, x10, cc + csel x7, x7, x11, cc + stp x4, x5, [sp, #64] + stp x6, x7, [sp, #80] + ldr q20, [sp, #32] + ldp x7, x17, [x20] + ldr q0, [x20] + ldp x6, x10, [sp, #32] + ldp x11, x15, [x20, #16] + rev64 v16.4s, v20.4s + subs x4, x7, x17 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x17, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x11, x7 + ldr q20, [sp, #48] + sbcs x5, x15, x17 + ngc x17, xzr + subs x8, x11, x15 + uaddlp v27.2d, v16.4s + umulh x4, x7, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [x20, #16] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #48] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x20, x3, x13 + adcs x25, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x26, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x27, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, 
v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x20 + adcs x15, x16, x25 + eor x5, x17, x4 + adcs x9, x1, x26 + eor x1, x10, x5 + adcs x16, x2, x27 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x20, x11, x13 + and x1, x1, x13 + adcs x25, x4, x1 + and x1, x12, x13 + stp x20, x25, [sp, #128] + adcs x4, x7, xzr + adc x1, x17, x1 + stp x4, x1, [sp, #144] + ldr q19, [sp, #96] + ldr q23, [sp, #112] + ldr q0, [sp, #96] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x21, x22 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x21, x22 + umulh x15, x21, x23 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x21, x22 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x24, x23 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x22, x24 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x23, x24 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x23, x24 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x24, x24 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x24, 
x24 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x23, x23 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x23, x23 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x21, x3, x16, cs + csel x22, x8, x14, cs + csel x23, x11, x12, cs + csel x24, x5, x2, cs + ldr q19, [sp, #64] + ldp x9, x13, [sp, #64] + ldr q23, [sp, #80] + ldr q0, [sp, #64] + ldp x1, x10, [sp, #80] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 + adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs 
x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x13, x3, x16, cs + csel x14, x8, x14, cs + csel x15, x11, x12, cs + csel x26, x5, x2, cs + mov x1, #0x9 + mov x2, #0xffffffffffffffff + subs x9, x2, x21 + mov x2, #0xffffffff + sbcs x10, x2, x24 + ngcs x11, x22 + mov x2, #0xffffffff00000001 + sbc x12, x2, x23 + mul x3, x1, x9 + mul x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc + mul x8, x20, x1 + umulh x9, x20, x1 + adds x3, x3, x8 + mul x8, x25, x1 + umulh x10, x25, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #144] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x20, x3, x8 + and x9, x8, #0xffffffff + adcs x21, x4, x9 + adcs x22, x5, xzr + neg x10, x9 + adc x23, x6, x10 + stp x20, x21, [sp, #160] + stp x22, x23, [sp, #176] + mov x2, sp + ldp x4, x3, [x2] + subs x5, x13, x4 + sbcs x6, x26, x3 + ldp x4, x3, [x2, #16] + sbcs x7, x14, x4 + sbcs x8, x15, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + mov x0, sp + ldr q19, [sp, #32] + ldp x9, x13, [sp, #32] + ldr q23, [sp, #48] + ldr q0, [sp, #32] + ldp x1, x10, [sp, #48] + uzp2 v29.4s, v19.4s, v19.4s + xtn v4.2s, v19.2d + umulh x8, x9, x13 + rev64 v20.4s, v23.4s + umull v16.2d, v19.2s, v19.2s + umull v1.2d, v29.2s, v4.2s + mul v20.4s, v20.4s, v0.4s + subs x14, x9, x13 + umulh x15, x9, x1 + mov x16, v16.d[1] + umull2 v4.2d, v19.4s, v19.4s + mov x4, v16.d[0] + uzp1 v17.4s, v23.4s, v0.4s + uaddlp v19.2d, v20.4s + lsr x7, x8, #63 + mul x11, x9, x13 + mov x12, v1.d[0] + csetm x5, cc + cneg x6, x14, cc + mov x3, v4.d[1] + mov x14, v4.d[0] + subs x2, x10, x1 + mov x9, v1.d[1] + cneg x17, x2, cc + cinv x2, x5, cc + adds x5, x4, x12, lsl #33 + extr x4, x8, x11, #63 + lsr x8, x12, #31 + uzp1 v20.4s, v0.4s, v0.4s + shl v19.2d, v19.2d, #32 + adc x16, x16, x8 + adds x8, x14, x9, lsl #33 + lsr x14, x9, #31 + lsl x9, x5, #32 + umlal v19.2d, v20.2s, v17.2s + adc x14, x3, x14 + adds x16, x16, x11, lsl #1 + lsr x3, x5, #32 + umulh x12, x6, x17 + adcs x4, x8, x4 + adc x11, x14, x7 + subs x8, x5, x9 + sbc x5, x5, x3 + adds x16, x16, x9 + mov x14, v19.d[0] + mul x17, x6, x17 + adcs x3, x4, x3 + lsl x7, x16, #32 + umulh x13, x13, x10 + adcs x11, x11, x8 + lsr x8, x16, #32 + adc x5, x5, xzr + subs x9, x16, x7 + sbc x16, x16, x8 + adds x7, x3, x7 + mov x3, v19.d[1] + adcs x6, x11, x8 + umulh x11, x1, x10 + adcs x5, x5, x9 + eor x8, x12, x2 + adc x9, x16, xzr + adds x16, x14, x15 + adc x15, x15, xzr + adds x12, x16, x3 + eor x16, x17, x2 + mul x4, x1, x10 + adcs x15, x15, x13 + adc x17, x13, xzr + adds x15, x15, x3 + adc x3, x17, xzr + cmn x2, #0x1 + mul x17, x10, x10 + adcs x12, x12, x16 + adcs x16, x15, x8 + umulh x10, x10, x10 + adc x2, x3, x2 + adds x14, x14, x14 + adcs x12, x12, x12 + adcs x16, x16, x16 + adcs x2, x2, x2 
+ adc x15, xzr, xzr + adds x14, x14, x7 + mul x3, x1, x1 + adcs x12, x12, x6 + lsr x7, x14, #32 + adcs x16, x16, x5 + lsl x5, x14, #32 + umulh x13, x1, x1 + adcs x2, x2, x9 + mov x6, #0xffffffff + adc x15, x15, xzr + adds x8, x4, x4 + adcs x1, x11, x11 + mov x11, #0xffffffff00000001 + adc x4, xzr, xzr + subs x9, x14, x5 + sbc x14, x14, x7 + adds x12, x12, x5 + adcs x16, x16, x7 + lsl x5, x12, #32 + lsr x7, x12, #32 + adcs x2, x2, x9 + adcs x14, x15, x14 + adc x15, xzr, xzr + subs x9, x12, x5 + sbc x12, x12, x7 + adds x16, x16, x5 + adcs x2, x2, x7 + adcs x14, x14, x9 + adcs x12, x15, x12 + adc x15, xzr, xzr + adds x16, x16, x3 + adcs x2, x2, x13 + adcs x14, x14, x17 + adcs x12, x12, x10 + adc x15, x15, xzr + adds x2, x2, x8 + adcs x14, x14, x1 + adcs x12, x12, x4 + adcs x15, x15, xzr + adds x3, x16, #0x1 + sbcs x5, x2, x6 + sbcs x8, x14, xzr + sbcs x11, x12, x11 + sbcs xzr, x15, xzr + csel x24, x3, x16, cs + csel x25, x8, x14, cs + csel x26, x11, x12, cs + csel x27, x5, x2, cs + stp x25, x26, [x0, #16] + stp x24, x27, [x0] + ldr q20, [sp, #96] + ldr q0, [sp, #160] + ldp x6, x10, [sp, #96] + rev64 v16.4s, v20.4s + subs x4, x20, x21 + csetm x3, cc + cneg x13, x4, cc + mul v16.4s, v16.4s, v0.4s + umulh x12, x21, x10 + uzp1 v28.4s, v20.4s, v0.4s + subs x14, x22, x20 + ldr q20, [sp, #112] + sbcs x5, x23, x21 + ngc x17, xzr + subs x8, x22, x23 + uaddlp v27.2d, v16.4s + umulh x4, x20, x6 + uzp1 v21.4s, v0.4s, v0.4s + cneg x11, x8, cc + shl v17.2d, v27.2d, #32 + csetm x15, cc + subs x9, x10, x6 + eor x7, x14, x17 + umlal v17.2d, v21.2s, v28.2s + cneg x8, x9, cc + cinv x9, x3, cc + cmn x17, #0x1 + ldr q28, [sp, #176] + adcs x14, x7, xzr + mul x7, x13, x8 + eor x1, x5, x17 + adcs x5, x1, xzr + xtn v1.2s, v20.2d + mov x1, v17.d[0] + mov x3, v17.d[1] + uzp2 v16.4s, v20.4s, v20.4s + umulh x16, x13, x8 + eor x13, x7, x9 + adds x8, x1, x3 + adcs x7, x4, x12 + xtn v0.2s, v28.2d + adcs x12, x12, xzr + adds x8, x4, x8 + adcs x3, x3, x7 + ldp x7, x2, [sp, #112] + adcs x12, x12, xzr + cmn x9, #0x1 + adcs x8, x8, x13 + eor x13, x16, x9 + adcs x16, x3, x13 + lsl x3, x1, #32 + adc x13, x12, x9 + subs x12, x6, x7 + sbcs x9, x10, x2 + lsr x10, x1, #32 + ngc x4, xzr + subs x6, x2, x7 + cinv x2, x15, cc + cneg x6, x6, cc + subs x7, x1, x3 + eor x9, x9, x4 + sbc x1, x1, x10 + adds x15, x8, x3 + adcs x3, x16, x10 + mul x16, x11, x6 + adcs x8, x13, x7 + eor x13, x12, x4 + adc x10, x1, xzr + cmn x4, #0x1 + umulh x6, x11, x6 + adcs x11, x13, xzr + adcs x1, x9, xzr + lsl x13, x15, #32 + subs x12, x15, x13 + lsr x7, x15, #32 + sbc x15, x15, x7 + adds x20, x3, x13 + adcs x21, x8, x7 + umulh x8, x14, x11 + umull v21.2d, v0.2s, v1.2s + adcs x22, x10, x12 + umull v3.2d, v0.2s, v16.2s + adc x23, x15, xzr + rev64 v24.4s, v20.4s + movi v2.2d, #0xffffffff + mul x10, x14, x11 + mul v4.4s, v24.4s, v28.4s + subs x13, x14, x5 + uzp2 v19.4s, v28.4s, v28.4s + csetm x15, cc + usra v3.2d, v21.2d, #32 + mul x7, x5, x1 + umull v21.2d, v19.2s, v16.2s + cneg x13, x13, cc + uaddlp v5.2d, v4.4s + subs x11, x1, x11 + and v16.16b, v3.16b, v2.16b + umulh x5, x5, x1 + shl v24.2d, v5.2d, #32 + cneg x11, x11, cc + umlal v16.2d, v19.2s, v1.2s + cinv x12, x15, cc + umlal v24.2d, v0.2s, v1.2s + adds x15, x10, x7 + mul x14, x13, x11 + eor x1, x6, x2 + adcs x6, x8, x5 + usra v21.2d, v3.2d, #32 + adcs x9, x5, xzr + umulh x11, x13, x11 + adds x15, x8, x15 + adcs x7, x7, x6 + eor x8, x14, x12 + usra v21.2d, v16.2d, #32 + adcs x13, x9, xzr + cmn x12, #0x1 + mov x9, v24.d[1] + adcs x14, x15, x8 + eor x6, x11, x12 + adcs x6, x7, x6 + mov x5, v24.d[0] + mov x11, 
v21.d[1] + mov x7, v21.d[0] + adc x3, x13, x12 + adds x12, x5, x9 + adcs x13, x7, x11 + adcs x11, x11, xzr + adds x12, x7, x12 + eor x16, x16, x2 + adcs x7, x9, x13 + adcs x11, x11, xzr + cmn x2, #0x1 + adcs x16, x12, x16 + adcs x1, x7, x1 + adc x2, x11, x2 + adds x7, x5, x20 + adcs x15, x16, x21 + eor x5, x17, x4 + adcs x9, x1, x22 + eor x1, x10, x5 + adcs x16, x2, x23 + adc x2, xzr, xzr + cmn x5, #0x1 + eor x13, x14, x5 + adcs x14, x1, x7 + eor x1, x6, x5 + adcs x6, x13, x15 + adcs x10, x1, x9 + eor x4, x3, x5 + mov x1, #0xffffffff + adcs x8, x4, x16 + lsr x13, x14, #32 + adcs x17, x2, x5 + adcs x11, x5, xzr + adc x4, x5, xzr + adds x12, x10, x7 + adcs x7, x8, x15 + adcs x5, x17, x9 + adcs x9, x11, x16 + lsl x11, x14, #32 + adc x10, x4, x2 + subs x17, x14, x11 + sbc x4, x14, x13 + adds x11, x6, x11 + adcs x12, x12, x13 + lsl x15, x11, #32 + adcs x17, x7, x17 + lsr x7, x11, #32 + adc x13, x4, xzr + subs x4, x11, x15 + sbc x11, x11, x7 + adds x8, x12, x15 + adcs x15, x17, x7 + adcs x4, x13, x4 + adc x11, x11, xzr + adds x7, x5, x4 + adcs x17, x9, x11 + adc x13, x10, xzr + add x12, x13, #0x1 + neg x11, x12 + lsl x4, x12, #32 + adds x17, x17, x4 + sub x4, x4, #0x1 + adc x13, x13, xzr + subs x11, x8, x11 + sbcs x4, x15, x4 + sbcs x7, x7, xzr + sbcs x17, x17, x12 + sbcs x13, x13, x12 + mov x12, #0xffffffff00000001 + adds x14, x11, x13 + and x1, x1, x13 + adcs x15, x4, x1 + and x1, x12, x13 + stp x14, x15, [sp, #96] + adcs x13, x7, xzr + adc x20, x17, x1 + stp x13, x20, [sp, #112] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [x19, #64] + stp x7, x8, [x19, #80] + ldp x1, x2, [sp, #128] + lsl x0, x1, #2 + ldp x6, x7, [sp, #160] + subs x0, x0, x6 + extr x1, x2, x1, #62 + sbcs x1, x1, x7 + ldp x3, x4, [sp, #144] + extr x2, x3, x2, #62 + ldp x6, x7, [sp, #176] + sbcs x2, x2, x6 + extr x3, x4, x3, #62 + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x5, x4, #0x1 + lsl x8, x5, #32 + negs x6, x8 + ngcs x7, xzr + sbc x8, x8, x5 + adds x0, x0, x5 + adcs x1, x1, x6 + adcs x2, x2, x7 + adcs x3, x3, x8 + csetm x5, cc + adds x0, x0, x5 + and x6, x5, #0xffffffff + adcs x1, x1, x6 + adcs x2, x2, xzr + neg x7, x6 + adc x3, x3, x7 + stp x0, x1, [x19] + stp x2, x3, [x19, #16] + mov x2, #0xffffffffffffffff + subs x9, x2, x24 + mov x2, #0xffffffff + sbcs x10, x2, x27 + ngcs x11, x25 + mov x2, #0xffffffff00000001 + sbc x12, x2, x26 + lsl x3, x9, #3 + extr x4, x10, x9, #61 + extr x5, x11, x10, #61 + extr x6, x12, x11, #61 + lsr x7, x12, #61 + mov x1, #0x3 + mul x8, x14, x1 + umulh x9, x14, x1 + adds x3, x3, x8 + mul x8, x15, x1 + umulh x10, x15, x1 + adcs x4, x4, x8 + mul x8, x13, x1 + umulh x11, x13, x1 + adcs x5, x5, x8 + mul x8, x20, x1 + umulh x12, x20, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x3, x3, x8 + and x9, x8, #0xffffffff + adcs x4, x4, x9 + adcs x5, x5, xzr + neg x10, x9 + adc x6, x6, x10 + stp x3, x4, [x19, #32] + stp x5, x6, [x19, #48] + ldp x27, xzr, [sp, #256] + ldp x25, x26, [sp, #240] + ldp x23, x24, [sp, #224] + ldp x21, x22, [sp, 
#208] + ldp x19, x20, [sp, #192] + add sp, sp, #0x110 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S new file mode 100644 index 0000000000..0e453f5bae --- /dev/null +++ b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S @@ -0,0 +1,3362 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for P-256 +// Input scalar[4], point[12]; output res[12] +// +// extern void p256_montjscalarmul_alt +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version p256_scalarmul_alt. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_256 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_256) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard ARM ABI: X0 = res, X1 = scalar, X2 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjscalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Safe copies of inputs (res lasts the whole code, point not so long) +// and additional values in variables, with some aliasing + +#define res x19 +#define sgn x20 +#define j x20 +#define point x21 + +// Intermediate variables on the stack. + +#define scalarb sp, #(0*NUMSIZE) +#define acc sp, #(1*NUMSIZE) +#define tabent sp, #(4*NUMSIZE) + +#define tab sp, #(7*NUMSIZE) + +#define NSPACE #(31*NUMSIZE) + +// Loading large constants + +#define movbig(nn,n3,n2,n1,n0) \ + movz nn, n0; \ + movk nn, n1, lsl #16; \ + movk nn, n2, lsl #32; \ + movk nn, n3, lsl #48 + +S2N_BN_SYMBOL(p256_montjscalarmul_alt): + + stp x19, x20, [sp, #-16]! + stp x21, x30, [sp, #-16]! + sub sp, sp, NSPACE + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + mov res, x0 + mov point, x2 + +// Load the digits of group order n_256 = [x12;x13;x14;x15] + + movbig(x12, #0xf3b9, #0xcac2, #0xfc63, #0x2551) + movbig(x13, #0xbce6, #0xfaad, #0xa717, #0x9e84) + mov x14, #0xffffffffffffffff + mov x15, #0xffffffff00000000 + +// First, reduce the input scalar mod n_256, i.e. 
conditionally subtract n_256 + + ldp x2, x3, [x1] + ldp x4, x5, [x1, #16] + + subs x6, x2, x12 + sbcs x7, x3, x13 + sbcs x8, x4, x14 + sbcs x9, x5, x15 + + csel x2, x2, x6, cc + csel x3, x3, x7, cc + csel x4, x4, x8, cc + csel x5, x5, x9, cc + +// Now if the top bit of the reduced scalar is set, negate it mod n_256, +// i.e. do n |-> n_256 - n. Remember the sign as "sgn" so we can +// correspondingly negate the point below. + + subs x6, x12, x2 + sbcs x7, x13, x3 + sbcs x8, x14, x4 + sbc x9, x15, x5 + + tst x5, #0x8000000000000000 + csel x2, x2, x6, eq + csel x3, x3, x7, eq + csel x4, x4, x8, eq + csel x5, x5, x9, eq + cset sgn, ne + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + mov x6, 0x8888888888888888 + adds x2, x2, x6 + adcs x3, x3, x6 + bic x7, x6, #0xF000000000000000 + adcs x4, x4, x6 + adc x5, x5, x7 + + stp x2, x3, [scalarb] + stp x4, x5, [scalarb+16] + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_256 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. + + ldp x0, x1, [point] + stp x0, x1, [tab] + ldp x2, x3, [point, #16] + stp x2, x3, [tab+16] + + ldp x4, x5, [point, #32] + ldp x6, x7, [point, #48] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + orr x8, x4, x5 + orr x9, x6, x7 + orr x8, x8, x9 + cmp x8, xzr + ccmp sgn, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tab+32] + stp x6, x7, [tab+48] + + ldp x0, x1, [point, #64] + stp x0, x1, [tab+64] + ldp x2, x3, [point, #80] + stp x2, x3, [tab+80] + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + add x0, tab+96*1 + add x1, tab + bl local_p256_montjdouble + + add x0, tab+96*2 + add x1, tab+96*1 + add x2, tab + bl local_p256_montjadd + + add x0, tab+96*3 + add x1, tab+96*1 + bl local_p256_montjdouble + + add x0, tab+96*4 + add x1, tab+96*3 + add x2, tab + bl local_p256_montjadd + + add x0, tab+96*5 + add x1, tab+96*2 + bl local_p256_montjdouble + + add x0, tab+96*6 + add x1, tab+96*5 + add x2, tab + bl local_p256_montjadd + + add x0, tab+96*7 + add x1, tab+96*3 + bl local_p256_montjdouble + +// Initialize the accumulator as a table entry for top 4 bits (unrecoded) + + ldr x14, [scalarb+24] + lsr x14, x14, #60 + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + stp x0, x1, [acc] + stp x2, x3, [acc+16] + stp x4, x5, [acc+32] + stp x6, x7, [acc+48] + stp x8, x9, [acc+64] + stp x10, x11, [acc+80] + + mov j, #252 + +// Main loop over size-4 bitfields: double 4 times 
then add signed digit + +loop: + sub j, j, #4 + + add x0, acc + add x1, acc + bl local_p256_montjdouble + + add x0, acc + add x1, acc + bl local_p256_montjdouble + + add x0, acc + add x1, acc + bl local_p256_montjdouble + + add x0, acc + add x1, acc + bl local_p256_montjdouble + + lsr x2, j, #6 + ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly + lsr x14, x14, j + and x14, x14, #15 + + subs x14, x14, #8 + cset x16, lo // x16 = sign of digit (1 = negative) + cneg x14, x14, lo // x14 = absolute value of digit + +// Conditionally select the table entry tab[i-1] = i * P in constant time + + mov x0, xzr + mov x1, xzr + mov x2, xzr + mov x3, xzr + mov x4, xzr + mov x5, xzr + mov x6, xzr + mov x7, xzr + mov x8, xzr + mov x9, xzr + mov x10, xzr + mov x11, xzr + add x15, tab + .set i, 1 +.rep 8 + cmp x14, #i + ldp x12, x13, [x15] + csel x0, x12, x0, eq + csel x1, x13, x1, eq + ldp x12, x13, [x15, #16] + csel x2, x12, x2, eq + csel x3, x13, x3, eq + ldp x12, x13, [x15, #32] + csel x4, x12, x4, eq + csel x5, x13, x5, eq + ldp x12, x13, [x15, #48] + csel x6, x12, x6, eq + csel x7, x13, x7, eq + ldp x12, x13, [x15, #64] + csel x8, x12, x8, eq + csel x9, x13, x9, eq + ldp x12, x13, [x15, #80] + csel x10, x12, x10, eq + csel x11, x13, x11, eq + add x15, x15, #96 + .set i, (i+1) +.endr + +// Store it to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_256 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). + + stp x0, x1, [tabent] + stp x2, x3, [tabent+16] + + mov x0, 0xffffffffffffffff + subs x0, x0, x4 + mov x1, 0x00000000ffffffff + sbcs x1, x1, x5 + mov x3, 0xffffffff00000001 + sbcs x2, xzr, x6 + sbc x3, x3, x7 + + orr x12, x4, x5 + orr x13, x6, x7 + orr x12, x12, x13 + cmp x12, xzr + ccmp x16, xzr, #4, ne + csel x4, x0, x4, ne + csel x5, x1, x5, ne + csel x6, x2, x6, ne + csel x7, x3, x7, ne + + stp x4, x5, [tabent+32] + stp x6, x7, [tabent+48] + stp x8, x9, [tabent+64] + stp x10, x11, [tabent+80] + + add x0, acc + add x1, acc + add x2, tabent + bl local_p256_montjadd + + cbnz j, loop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. 
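+//
+// As a summary of the loop above: each iteration consumed one 4-bit window w
+// of the biased scalar as the signed digit d = w - 8 (so -8 <= d <= 7) and
+// computed acc := 16 * acc + d * P, the four doublings supplying the factor
+// 16 and the addition using tab[|d|-1] = |d| * P with its y coordinate
+// negated when d < 0 (d = 0 selects the all-zero triple, which has z = 0 and
+// so represents the point at infinity). Hence "acc" now holds the
+// Montgomery-Jacobian representation of n * P.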
+ + ldp x0, x1, [acc] + stp x0, x1, [res] + ldp x0, x1, [acc+16] + stp x0, x1, [res, #16] + ldp x0, x1, [acc+32] + stp x0, x1, [res, #32] + ldp x0, x1, [acc+48] + stp x0, x1, [res, #48] + ldp x0, x1, [acc+64] + stp x0, x1, [res, #64] + ldp x0, x1, [acc+80] + stp x0, x1, [res, #80] + +// Restore stack and registers and return + + add sp, sp, NSPACE + ldp x21, x30, [sp], 16 + ldp x19, x20, [sp], 16 + ret + +// Local copies of subroutines, complete clones at the moment + +local_p256_montjadd: + sub sp, sp, #0xe0 + mov x15, x0 + mov x16, x1 + mov x17, x2 + ldp x2, x3, [x16, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x2, x3, [x17, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x17, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, 
x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp, #160] + stp x10, x11, [sp, #176] + ldp x3, x4, [x17, #64] + ldp x7, x8, [x16, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x17, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #192] + stp x14, x0, [sp, #208] + ldp x3, x4, [x16, #64] + ldp x7, x8, [x17, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, 
x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x3, x4, [sp] + ldp x7, x8, [x17] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, 
x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x16] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #16] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x3, x4, [sp] + ldp x7, x8, [sp, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, 
x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #32] + stp x14, x0, [sp, #48] + ldp x3, x4, [sp, #160] + ldp x7, x8, [sp, #192] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #208] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + 
adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #192] + stp x14, x0, [sp, #208] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #160] + stp x7, x8, [sp, #176] + ldp x5, x6, [sp, #32] + ldp x4, x3, [sp, #192] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #48] + ldp x4, x3, [sp, #208] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #32] + stp x7, x8, [sp, #48] + ldp x2, x3, [sp, #160] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #176] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + mov x2, #0xffffffffffffffff + csel x2, xzr, x2, cc + mov x3, #0xffffffff + csel x3, xzr, x3, cc + mov x5, #0xffffffff00000001 + csel x5, xzr, x5, cc + subs x8, x8, x2 + sbcs x9, x9, x3 + sbcs x10, x10, xzr + sbc x11, x11, x5 + stp x8, x9, [sp, #96] + stp x10, x11, [sp, #112] + ldp x2, x3, [sp, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + 
adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mov x3, #0xffffffff00000001 + mul x2, x8, x3 + umulh x8, x8, x3 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mov x3, #0xffffffff00000001 + mul x2, x9, x3 + umulh x9, x9, x3 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mov x3, #0xffffffff00000001 + mul x2, x10, x3 + umulh x10, x10, x3 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mov x3, #0xffffffff00000001 + mul x2, x11, x3 + umulh x11, x11, x3 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + mov x5, #0xffffffff00000001 + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #128] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #144] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, 
x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #64] + stp x14, x0, [sp, #80] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #128] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #144] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x16, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x16, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 
+ adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #160] + stp x14, x0, [sp, #176] + ldp x5, x6, [sp] + ldp x4, x3, [sp, #64] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #16] + ldp x4, x3, [sp, #80] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp] + stp x7, x8, [sp, #16] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x3, x4, [sp, #96] + ldp x7, x8, [sp, #192] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #208] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #112] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + 
mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x3, x4, [sp, #160] + ldp x7, x8, [x17, #64] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [x17, #80] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, 
cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #160] + stp x14, x0, [sp, #176] + ldp x3, x4, [sp, #32] + ldp x7, x8, [sp, #128] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #144] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #48] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x5, x6, [sp, #128] + ldp x4, x3, [sp, #96] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #144] + ldp x4, x3, [sp, #112] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + mov x4, #0xffffffff + and x4, x4, x3 + adcs x6, x6, x4 + adcs x7, x7, xzr + mov x4, #0xffffffff00000001 + and x4, x4, x3 + adc x8, x8, x4 + stp x5, x6, [sp, #128] + stp x7, x8, [sp, #144] + ldp x0, x1, [x16, #64] + ldp x2, x3, [x16, #80] + orr x12, x0, x1 + orr x13, x2, x3 + orr x12, x12, x13 + cmp x12, xzr + cset x12, ne + ldp x4, x5, [x17, #64] + ldp x6, x7, [x17, #80] + orr x13, x4, x5 + orr x14, x6, x7 + orr x13, x13, x14 + cmp x13, xzr + cset x13, ne + cmp x13, x12 + ldp x8, x9, [sp, #160] + csel x8, x0, x8, cc + csel x9, x1, x9, cc + csel x8, x4, x8, hi + csel x9, x5, x9, hi + ldp x10, x11, [sp, #176] + csel x10, x2, x10, cc + csel x11, x3, x11, cc + csel x10, x6, x10, hi + csel x11, x7, x11, hi + ldp x12, x13, [x16] + ldp x0, x1, [sp] + csel x0, x12, x0, cc 
+ csel x1, x13, x1, cc + ldp x12, x13, [x17] + csel x0, x12, x0, hi + csel x1, x13, x1, hi + ldp x12, x13, [x16, #16] + ldp x2, x3, [sp, #16] + csel x2, x12, x2, cc + csel x3, x13, x3, cc + ldp x12, x13, [x17, #16] + csel x2, x12, x2, hi + csel x3, x13, x3, hi + ldp x12, x13, [x16, #32] + ldp x4, x5, [sp, #128] + csel x4, x12, x4, cc + csel x5, x13, x5, cc + ldp x12, x13, [x17, #32] + csel x4, x12, x4, hi + csel x5, x13, x5, hi + ldp x12, x13, [x16, #48] + ldp x6, x7, [sp, #144] + csel x6, x12, x6, cc + csel x7, x13, x7, cc + ldp x12, x13, [x17, #48] + csel x6, x12, x6, hi + csel x7, x13, x7, hi + stp x0, x1, [x15] + stp x2, x3, [x15, #16] + stp x4, x5, [x15, #32] + stp x6, x7, [x15, #48] + stp x8, x9, [x15, #64] + stp x10, x11, [x15, #80] + add sp, sp, #0xe0 + ret + +local_p256_montjdouble: + sub sp, sp, #0xc0 + mov x15, x0 + mov x16, x1 + ldp x2, x3, [x16, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x2, x3, [x16, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [x16, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, 
#0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp, #32] + stp x10, x11, [sp, #48] + ldp x5, x6, [x16] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [x16, #16] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #96] + stp x7, x8, [sp, #112] + ldp x5, x6, [x16] + ldp x4, x3, [sp] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x16, #16] + ldp x4, x3, [sp, #16] + adcs x7, x7, x4 + adcs x8, x8, x3 + csetm x3, cs + subs x5, x5, x3 + and x1, x3, #0xffffffff + sbcs x6, x6, x1 + sbcs x7, x7, xzr + and x2, x3, #0xffffffff00000001 + sbc x8, x8, x2 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + ldp x3, x4, [sp, #64] + ldp x7, x8, [sp, #96] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #112] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #80] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 
+ adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x5, x6, [x16, #32] + ldp x4, x3, [x16, #64] + adds x5, x5, x4 + adcs x6, x6, x3 + ldp x7, x8, [x16, #48] + ldp x4, x3, [x16, #80] + adcs x7, x7, x4 + adcs x8, x8, x3 + adc x3, xzr, xzr + cmn x5, #0x1 + mov x4, #0xffffffff + sbcs xzr, x6, x4 + sbcs xzr, x7, xzr + mov x4, #0xffffffff00000001 + sbcs xzr, x8, x4 + adcs x3, x3, xzr + csetm x3, ne + subs x5, x5, x3 + and x4, x3, #0xffffffff + sbcs x6, x6, x4 + sbcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + sbc x8, x8, x4 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + ldp x3, x4, [x16] + ldp x7, x8, [sp, #32] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #48] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [x16, #16] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #128] + stp x14, x0, [sp, #144] + ldp x2, x3, [sp, #96] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #112] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, 
x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp, #160] + stp x10, x11, [sp, #176] + ldp x2, x3, [sp, #64] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #80] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp, #64] + stp x10, x11, [sp, #80] + mov x1, #0x9 + mov x2, #0xffffffffffffffff + ldp x9, x10, [sp, #160] + subs x9, x2, x9 + mov x2, #0xffffffff + sbcs x10, x2, x10 + ldp x11, x12, [sp, #176] + ngcs x11, x11 + mov x2, #0xffffffff00000001 + sbc x12, x2, x12 + mul x3, x1, x9 + mul 
x4, x1, x10 + mul x5, x1, x11 + mul x6, x1, x12 + umulh x9, x1, x9 + umulh x10, x1, x10 + umulh x11, x1, x11 + umulh x7, x1, x12 + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, xzr + mov x1, #0xc + ldp x9, x10, [sp, #128] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #144] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x3, x3, x8 + and x9, x8, #0xffffffff + adcs x4, x4, x9 + adcs x5, x5, xzr + neg x10, x9 + adc x6, x6, x10 + stp x3, x4, [sp, #160] + stp x5, x6, [sp, #176] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #16] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [sp, #64] + stp x7, x8, [sp, #80] + ldp x2, x3, [sp, #32] + mul x9, x2, x3 + umulh x10, x2, x3 + ldp x4, x5, [sp, #48] + mul x11, x2, x5 + umulh x12, x2, x5 + mul x6, x2, x4 + umulh x7, x2, x4 + adds x10, x10, x6 + adcs x11, x11, x7 + mul x6, x3, x4 + umulh x7, x3, x4 + adc x7, x7, xzr + adds x11, x11, x6 + mul x13, x4, x5 + umulh x14, x4, x5 + adcs x12, x12, x7 + mul x6, x3, x5 + umulh x7, x3, x5 + adc x7, x7, xzr + adds x12, x12, x6 + adcs x13, x13, x7 + adc x14, x14, xzr + adds x9, x9, x9 + adcs x10, x10, x10 + adcs x11, x11, x11 + adcs x12, x12, x12 + adcs x13, x13, x13 + adcs x14, x14, x14 + cset x7, cs + umulh x6, x2, x2 + mul x8, x2, x2 + adds x9, x9, x6 + mul x6, x3, x3 + adcs x10, x10, x6 + umulh x6, x3, x3 + adcs x11, x11, x6 + mul x6, x4, x4 + adcs x12, x12, x6 + umulh x6, x4, x4 + adcs x13, x13, x6 + mul x6, x5, x5 + adcs x14, x14, x6 + umulh x6, x5, x5 + adc x7, x7, x6 + mov x5, #0xffffffff00000001 + adds x9, x9, x8, lsl #32 + lsr x3, x8, #32 + adcs x10, x10, x3 + mul x2, x8, x5 + umulh x8, x8, x5 + adcs x11, x11, x2 + adc x8, x8, xzr + adds x10, x10, x9, lsl #32 + lsr x3, x9, #32 + adcs x11, x11, x3 + mul x2, x9, x5 + umulh x9, x9, x5 + adcs x8, x8, x2 + adc x9, x9, xzr + adds x11, x11, x10, lsl #32 + lsr x3, x10, #32 + adcs x8, x8, x3 + mul x2, x10, x5 + umulh x10, x10, x5 + adcs x9, x9, x2 + adc x10, x10, xzr + adds x8, x8, x11, lsl #32 + lsr x3, x11, #32 + adcs x9, x9, x3 + mul x2, x11, x5 + umulh x11, x11, x5 + adcs x10, x10, x2 + adc x11, x11, xzr + adds x8, x8, x12 + adcs x9, x9, x13 + adcs x10, x10, x14 + adcs x11, x11, x7 + cset x2, cs + mov x3, #0xffffffff + adds x12, x8, #0x1 + sbcs x13, x9, x3 + sbcs x14, x10, xzr + sbcs x7, x11, x5 + sbcs xzr, x2, xzr + csel x8, x8, x12, cc + csel x9, x9, x13, cc + csel x10, x10, x14, cc + csel x11, x11, x7, cc + stp x8, x9, [sp] + stp x10, x11, [sp, #16] + ldp x3, x4, [sp, #160] + ldp x7, x8, [sp, #96] + mul x12, x3, x7 + umulh x13, x3, x7 + mul x11, x3, x8 + umulh x14, x3, x8 + adds x13, x13, x11 + ldp x9, x10, [sp, #112] + mul x11, x3, x9 + umulh x0, x3, x9 + adcs x14, x14, x11 + mul x11, x3, x10 + umulh x1, x3, x10 + adcs x0, x0, x11 + adc x1, x1, xzr + ldp x5, x6, [sp, #176] + mul x11, x4, x7 + adds x13, x13, x11 + mul x11, x4, x8 + adcs x14, x14, x11 + mul x11, x4, x9 + adcs 
x0, x0, x11 + mul x11, x4, x10 + adcs x1, x1, x11 + umulh x3, x4, x10 + adc x3, x3, xzr + umulh x11, x4, x7 + adds x14, x14, x11 + umulh x11, x4, x8 + adcs x0, x0, x11 + umulh x11, x4, x9 + adcs x1, x1, x11 + adc x3, x3, xzr + mul x11, x5, x7 + adds x14, x14, x11 + mul x11, x5, x8 + adcs x0, x0, x11 + mul x11, x5, x9 + adcs x1, x1, x11 + mul x11, x5, x10 + adcs x3, x3, x11 + umulh x4, x5, x10 + adc x4, x4, xzr + umulh x11, x5, x7 + adds x0, x0, x11 + umulh x11, x5, x8 + adcs x1, x1, x11 + umulh x11, x5, x9 + adcs x3, x3, x11 + adc x4, x4, xzr + mul x11, x6, x7 + adds x0, x0, x11 + mul x11, x6, x8 + adcs x1, x1, x11 + mul x11, x6, x9 + adcs x3, x3, x11 + mul x11, x6, x10 + adcs x4, x4, x11 + umulh x5, x6, x10 + adc x5, x5, xzr + mov x10, #0xffffffff00000001 + adds x13, x13, x12, lsl #32 + lsr x11, x12, #32 + adcs x14, x14, x11 + mul x11, x12, x10 + umulh x12, x12, x10 + adcs x0, x0, x11 + adc x12, x12, xzr + umulh x11, x6, x7 + adds x1, x1, x11 + umulh x11, x6, x8 + adcs x3, x3, x11 + umulh x11, x6, x9 + adcs x4, x4, x11 + adc x5, x5, xzr + adds x14, x14, x13, lsl #32 + lsr x11, x13, #32 + adcs x0, x0, x11 + mul x11, x13, x10 + umulh x13, x13, x10 + adcs x12, x12, x11 + adc x13, x13, xzr + adds x0, x0, x14, lsl #32 + lsr x11, x14, #32 + adcs x12, x12, x11 + mul x11, x14, x10 + umulh x14, x14, x10 + adcs x13, x13, x11 + adc x14, x14, xzr + adds x12, x12, x0, lsl #32 + lsr x11, x0, #32 + adcs x13, x13, x11 + mul x11, x0, x10 + umulh x0, x0, x10 + adcs x14, x14, x11 + adc x0, x0, xzr + adds x12, x12, x1 + adcs x13, x13, x3 + adcs x14, x14, x4 + adcs x0, x0, x5 + cset x8, cs + mov x11, #0xffffffff + adds x1, x12, #0x1 + sbcs x3, x13, x11 + sbcs x4, x14, xzr + sbcs x5, x0, x10 + sbcs xzr, x8, xzr + csel x12, x12, x1, cc + csel x13, x13, x3, cc + csel x14, x14, x4, cc + csel x0, x0, x5, cc + stp x12, x13, [sp, #96] + stp x14, x0, [sp, #112] + ldp x5, x6, [sp, #64] + ldp x4, x3, [sp, #32] + subs x5, x5, x4 + sbcs x6, x6, x3 + ldp x7, x8, [sp, #80] + ldp x4, x3, [sp, #48] + sbcs x7, x7, x4 + sbcs x8, x8, x3 + csetm x3, cc + adds x5, x5, x3 + and x4, x3, #0xffffffff + adcs x6, x6, x4 + adcs x7, x7, xzr + and x4, x3, #0xffffffff00000001 + adc x8, x8, x4 + stp x5, x6, [x15, #64] + stp x7, x8, [x15, #80] + ldp x1, x2, [sp, #128] + lsl x0, x1, #2 + ldp x6, x7, [sp, #160] + subs x0, x0, x6 + extr x1, x2, x1, #62 + sbcs x1, x1, x7 + ldp x3, x4, [sp, #144] + extr x2, x3, x2, #62 + ldp x6, x7, [sp, #176] + sbcs x2, x2, x6 + extr x3, x4, x3, #62 + sbcs x3, x3, x7 + lsr x4, x4, #62 + sbc x4, x4, xzr + add x5, x4, #0x1 + lsl x8, x5, #32 + negs x6, x8 + ngcs x7, xzr + sbc x8, x8, x5 + adds x0, x0, x5 + adcs x1, x1, x6 + adcs x2, x2, x7 + adcs x3, x3, x8 + csetm x5, cc + adds x0, x0, x5 + and x6, x5, #0xffffffff + adcs x1, x1, x6 + adcs x2, x2, xzr + neg x7, x6 + adc x3, x3, x7 + stp x0, x1, [x15] + stp x2, x3, [x15, #16] + mov x1, #0x8 + mov x2, #0xffffffffffffffff + ldp x9, x10, [sp] + subs x9, x2, x9 + mov x2, #0xffffffff + sbcs x10, x2, x10 + ldp x11, x12, [sp, #16] + ngcs x11, x11 + mov x2, #0xffffffff00000001 + sbc x12, x2, x12 + lsl x3, x9, #3 + extr x4, x10, x9, #61 + extr x5, x11, x10, #61 + extr x6, x12, x11, #61 + lsr x7, x12, #61 + mov x1, #0x3 + ldp x9, x10, [sp, #96] + mul x8, x9, x1 + umulh x9, x9, x1 + adds x3, x3, x8 + mul x8, x10, x1 + umulh x10, x10, x1 + adcs x4, x4, x8 + ldp x11, x12, [sp, #112] + mul x8, x11, x1 + umulh x11, x11, x1 + adcs x5, x5, x8 + mul x8, x12, x1 + umulh x12, x12, x1 + adcs x6, x6, x8 + adc x7, x7, xzr + adds x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, 
x7, x12 + add x8, x7, #0x1 + lsl x10, x8, #32 + adds x6, x6, x10 + adc x7, x7, xzr + neg x9, x8 + sub x10, x10, #0x1 + subs x3, x3, x9 + sbcs x4, x4, x10 + sbcs x5, x5, xzr + sbcs x6, x6, x8 + sbc x8, x7, x8 + adds x3, x3, x8 + and x9, x8, #0xffffffff + adcs x4, x4, x9 + adcs x5, x5, xzr + neg x10, x9 + adc x6, x6, x10 + stp x3, x4, [x15, #32] + stp x5, x6, [x15, #48] + add sp, sp, #0xc0 + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h b/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h index 87ac773fa5..7af5249dee 100644 --- a/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h +++ b/third_party/s2n-bignum/include/s2n-bignum_aws-lc.h @@ -50,6 +50,19 @@ static inline uint8_t use_s2n_bignum_alt(void) { } #endif +// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +extern void bignum_montinv_p256(uint64_t z[static 4],const uint64_t x[static 4]); + +// Montgomery-Jacobian form scalar multiplication for P-256 +// Input scalar[4], point[12]; output res[12] +extern void p256_montjscalarmul(uint64_t res[static 12],const uint64_t scalar[static 4],const uint64_t point[static 12]); +extern void p256_montjscalarmul_alt(uint64_t res[static 12],const uint64_t scalar[static 4],const uint64_t point[static 12]); +static inline void p256_montjscalarmul_selector(uint64_t res[static 12], const uint64_t scalar[static 4], const uint64_t point[static 12]) { + if (use_s2n_bignum_alt()) { p256_montjscalarmul_alt(res, scalar, point); } + else { p256_montjscalarmul(res, scalar, point); } +} + // Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced // Inputs x[6], y[6]; output z[6] extern void bignum_add_p384(uint64_t z[static 6], const uint64_t x[static 6], const uint64_t y[static 6]); @@ -63,7 +76,7 @@ static inline void bignum_deamont_p384_selector(uint64_t z[static 6], const uint else { bignum_deamont_p384(z, x); } } -// Montgomery multiply, z := (x * y / 2^384) mod p_384 +// Montgomery multiply, z := (x * y / 2^384) mod p_384 // Inputs x[6], y[6]; output z[6] extern void bignum_montmul_p384(uint64_t z[static 6], const uint64_t x[static 6], const uint64_t y[static 6]); extern void bignum_montmul_p384_alt(uint64_t z[static 6], const uint64_t x[static 6], const uint64_t y[static 6]); @@ -87,7 +100,7 @@ extern void bignum_neg_p384(uint64_t z[static 6], const uint64_t x[static 6]); // Subtract modulo p_384, z := (x - y) mod p_384 // Inputs x[6], y[6]; output z[6] -extern void bignum_sub_p384(uint64_t z[static 6], const uint64_t x[static 6], const uint64_t y[static 6]); +extern void bignum_sub_p384(uint64_t z[static 6], const uint64_t x[static 6], const uint64_t y[static 6]); // Convert to Montgomery form z := (2^384 * x) mod p_384 */ // Input x[6]; output z[6] */ diff --git a/third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S b/third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S new file mode 100644 index 0000000000..1ae2eabe65 --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S @@ -0,0 +1,1633 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery inverse modulo p_256 = 2^256 - 2^224 + 2^192 + 2^96 - 1 +// Input x[4]; output z[4] +// +// extern void bignum_montinv_p256(uint64_t z[static 4],uint64_t x[static 4]); +// +// If the 4-digit input x is coprime to p_256, i.e. is not divisible +// by it, returns z < p_256 such that x * z == 2^512 (mod p_256). This +// is effectively "Montgomery inverse" because if we consider x and z as +// Montgomery forms of X and Z, i.e. x == 2^256 * X and z == 2^256 * Z +// (both mod p_256) then X * Z == 1 (mod p_256). That is, this function +// gives the analog of the modular inverse bignum_inv_p256 but with both +// input and output in the Montgomery domain. Note that x does not need +// to be reduced modulo p_256, but the output always is. If the input +// is divisible (i.e. is 0 or p_256), then there can be no solution to +// the congruence x * z == 2^512 (mod p_256), and z = 0 is returned. +// +// Standard x86-64 ABI: RDI = z, RSI = x +// Microsoft x64 ABI: RCX = z, RDX = x +// ---------------------------------------------------------------------------- +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_montinv_p256) + S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_montinv_p256) + .text + +// Size in bytes of a 64-bit word + +#define N 8 + +// Pointer-offset pairs for temporaries on stack + +#define f 0(%rsp) +#define g (5*N)(%rsp) +#define u (10*N)(%rsp) +#define v (15*N)(%rsp) +#define tmp (20*N)(%rsp) +#define tmp2 (21*N)(%rsp) +#define i (22*N)(%rsp) +#define d (23*N)(%rsp) + +#define mat (24*N)(%rsp) + +// Backup for the input pointer + +#define res (28*N)(%rsp) + +// Total size to reserve on the stack + +#define NSPACE (30*N) + +// Syntactic variants to make x86_att version simpler to generate + +#define F 0 +#define G (5*N) +#define U (10*N) +#define V (15*N) +#define MAT (24*N) + +#define ff (%rsp) +#define gg (5*N)(%rsp) + +// --------------------------------------------------------------------------- +// Core signed almost-Montgomery reduction macro from u[4..0] to u[3..0]. +// --------------------------------------------------------------------------- + +#define amontred(P) \ +/* We only know the input is -2^316 < x < 2^316. To do traditional */ \ +/* unsigned Montgomery reduction, start by adding 2^61 * p_256. */ \ + movq $0xe000000000000000, %r8 ; \ + addq P, %r8 ; \ + movq $0xffffffffffffffff, %r9 ; \ + adcq 8+P, %r9 ; \ + movq $0x000000001fffffff, %r10 ; \ + adcq 16+P, %r10 ; \ + movq $0x2000000000000000, %r11 ; \ + adcq 24+P, %r11 ; \ + movq $0x1fffffffe0000000, %r12 ; \ + adcq 32+P, %r12 ; \ +/* Let [%r8;%rbx] = 2^32 * w and [%rdx;%rax] = (2^64 - 2^32 + 1) * w */ \ +/* where w is the lowest word */ \ + movq %r8, %rbx ; \ + shlq $32, %rbx ; \ + movq $0xffffffff00000001, %rax ; \ + mulq %r8; \ + shrq $32, %r8 ; \ +/* Hence basic addition of (2^256 - 2^224 + 2^192 + 2^96) * w */ \ + addq %rbx, %r9 ; \ + adcq %r8, %r10 ; \ + adcq %rax, %r11 ; \ + adcq %rdx, %r12 ; \ +/* Now capture carry and subtract p_256 if set (almost-Montgomery) */ \ + sbbq %rax, %rax ; \ + movl $0x00000000ffffffff, %ebx ; \ + andq %rax, %rbx ; \ + movq $0xffffffff00000001, %rdx ; \ + andq %rax, %rdx ; \ + subq %rax, %r9 ; \ + movq %r9, P ; \ + sbbq %rbx, %r10 ; \ + movq %r10, 8+P ; \ + sbbq $0, %r11 ; \ + movq %r11, 16+P ; \ + sbbq %rdx, %r12 ; \ + movq %r12, 24+P + +// Very similar to a subroutine call to the s2n-bignum word_divstep59. 
+// But different in register usage and returning the final matrix as +// +// [ %r8 %r10] +// [ %r12 %r14] +// +// and also returning the matrix still negated (which doesn't matter) + +#define divstep59(din,fin,gin) \ + movq din, %rsi ; \ + movq fin, %rdx ; \ + movq gin, %rcx ; \ + movq %rdx, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + xorl %ebp, %ebp ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; 
\ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %rdx ; \ + leaq (%rcx,%rax), %rdi ; \ + shlq $0x16, %rdx ; \ + shlq $0x16, %rdi ; \ + sarq $0x2b, %rdx ; \ + sarq $0x2b, %rdi ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %rbx ; \ + leaq (%rcx,%rax), %rcx ; \ + sarq $0x2a, %rbx ; \ + sarq $0x2a, %rcx ; \ + movq %rdx, MAT(%rsp) ; \ + movq %rbx, MAT+0x8(%rsp) ; \ + movq %rdi, MAT+0x10(%rsp) ; \ + movq %rcx, MAT+0x18(%rsp) ; \ + movq fin, %r12 ; \ + imulq %r12, %rdi ; \ + imulq %rdx, %r12 ; \ + movq gin, %r13 ; \ + imulq %r13, %rbx ; \ + imulq %rcx, %r13 ; \ + addq %rbx, %r12 ; \ + addq %rdi, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq 
$0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, 
%r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r10 ; \ + shlq $0x16, %r8 ; \ + shlq $0x16, %r10 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r10 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r15 ; \ + leaq (%rcx,%rax), %r11 ; \ + sarq $0x2a, %r15 ; \ + sarq $0x2a, %r11 ; \ + movq %r13, %rbx ; \ + movq %r12, %rcx ; \ + imulq %r8, %r12 ; \ + imulq %r15, %rbx ; \ + addq %rbx, %r12 ; \ + imulq %r11, %r13 ; \ + imulq %r10, %rcx ; \ + addq %rcx, %r13 ; \ + sarq $0x14, %r12 ; \ + sarq $0x14, %r13 ; \ + movq %r12, %rbx ; \ + andq $0xfffff, %rbx ; \ + movabsq $0xfffffe0000000000, %rax ; \ + orq %rax, %rbx ; \ + movq %r13, %rcx ; \ + andq $0xfffff, %rcx ; \ + movabsq $0xc000000000000000, %rax ; \ + orq %rax, %rcx ; \ + movq MAT(%rsp), %rax ; \ + imulq %r8, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r15, %rdx ; \ + imulq MAT+0x8(%rsp), %r8 ; \ + imulq 
MAT+0x18(%rsp), %r15 ; \ + addq %r8, %r15 ; \ + leaq (%rax,%rdx), %r9 ; \ + movq MAT(%rsp), %rax ; \ + imulq %r10, %rax ; \ + movq MAT+0x10(%rsp), %rdx ; \ + imulq %r11, %rdx ; \ + imulq MAT+0x8(%rsp), %r10 ; \ + imulq MAT+0x18(%rsp), %r11 ; \ + addq %r10, %r11 ; \ + leaq (%rax,%rdx), %r13 ; \ + movq $0xfffffffffffffffe, %rax ; \ + movl $0x2, %edx ; \ + movq %rbx, %rdi ; \ + movq %rax, %r8 ; \ + testq %rsi, %rsi ; \ + cmovs %rbp, %r8 ; \ + testq $0x1, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + 
subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + cmovs %rbp, %r8 ; \ + movq %rbx, %rdi ; \ + testq %rdx, %rcx ; \ + cmoveq %rbp, %r8 ; \ + cmoveq %rbp, %rdi ; \ + sarq $1, %rcx ; \ + xorq %r8, %rdi ; \ + xorq %r8, %rsi ; \ + btq $0x3f, %r8 ; \ + cmovbq %rcx, %rbx ; \ + movq %rax, %r8 ; \ + subq %rax, %rsi ; \ + leaq (%rcx,%rdi), %rcx ; \ + sarq $1, %rcx ; \ + movl $0x100000, %eax ; \ + leaq (%rbx,%rax), %r8 ; \ + leaq (%rcx,%rax), %r12 ; \ + shlq $0x15, %r8 ; \ + shlq $0x15, %r12 ; \ + sarq $0x2b, %r8 ; \ + sarq $0x2b, %r12 ; \ + movabsq $0x20000100000, %rax ; \ + leaq (%rbx,%rax), %r10 ; \ + leaq (%rcx,%rax), %r14 ; \ + sarq $0x2b, %r10 ; \ + sarq $0x2b, %r14 ; \ + movq %r9, %rax ; \ + imulq %r8, %rax ; \ + movq %r13, %rdx ; \ + imulq %r10, %rdx ; \ + imulq %r15, %r8 ; \ + imulq %r11, %r10 ; \ + addq %r8, %r10 ; \ + leaq (%rax,%rdx), %r8 ; \ + movq %r9, %rax ; \ + imulq %r12, %rax ; \ + movq %r13, %rdx ; \ + imulq %r14, %rdx ; \ + imulq %r15, %r12 ; \ + imulq %r11, %r14 ; \ + addq %r12, %r14 ; \ + leaq (%rax,%rdx), %r12 + +S2N_BN_SYMBOL(bignum_montinv_p256): + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +// Save registers and make room for temporaries + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $NSPACE, %rsp + +// Save the return pointer for the end so we can overwrite %rdi later + + movq %rdi, res + +// Create constant [%rdx;%rcx;%rbx;%rax] = p_256 and copy it into the variable f +// 
including the 5th zero digit + + xorl %ecx, %ecx + movl $0x00000000ffffffff, %edx + movq %rdx, %rbx + leaq -1(%rcx), %rax + negq %rdx + movq %rax, F(%rsp) + movq %rbx, F+8(%rsp) + movq %rcx, F+16(%rsp) + movq %rdx, F+24(%rsp) + movq %rcx, F+32(%rsp) + +// Now reduce the input modulo p_256, first negating the constant to get +// [%rdx;%rcx;%rbx;%rax] = 2^256 - p_256, adding it to x and hence getting +// the comparison x < p_256 <=> (2^256 - p_256) + x < 2^256 and choosing +// g accordingly. + + movq (%rsi), %r8 + movq 8(%rsi), %r9 + movq 16(%rsi), %r10 + movq 24(%rsi), %r11 + + leaq 1(%rcx), %rax + addq %r8, %rax + leaq -1(%rdx), %rbx + adcq %r9, %rbx + notq %rcx + adcq %r10, %rcx + notq %rdx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq %rax, G(%rsp) + movq %rbx, G+8(%rsp) + movq %rcx, G+16(%rsp) + movq %rdx, G+24(%rsp) + xorl %eax, %eax + movq %rax, G+32(%rsp) + +// Also maintain reduced < 2^256 vector [u,v] such that +// [f,g] == x * 2^{5*i-562} * [u,v] (mod p_256) +// starting with [p_256,x] == x * 2^{5*0-562} * [0,2^562] (mod p_256) +// The weird-looking 5*i modifications come in because we are doing +// 64-bit word-sized Montgomery reductions at each stage, which is +// 5 bits more than the 59-bit requirement to keep things stable. +// After the 10th and last iteration and sign adjustment, when +// f == 1 for in-scope cases, we have x * 2^{50-562} * u == 1, i.e. +// x * u == 2^512 as required. + + xorl %eax, %eax + movq %rax, U(%rsp) + movq %rax, U+8(%rsp) + movq %rax, U+16(%rsp) + movq %rax, U+24(%rsp) + + movq $0x000c000000140000, %rax + movq %rax, V(%rsp) + movq $0xffe8000000000000, %rax + movq %rax, V+8(%rsp) + movq $0xfffbffffffefffff, %rax + movq %rax, V+16(%rsp) + movq $0x000bffffffebffff, %rax + movq %rax, V+24(%rsp) + +// Start of main loop. We jump into the middle so that the divstep +// portion is common to the special tenth iteration after a uniform +// first 9. + + movq $10, i + movq $1, d + jmp midloop + +loop: + +// Separate out the matrix into sign-magnitude pairs + + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + +// Adjust the initial values to allow for complement instead of negation +// This initial offset is the same for [f,g] and [u,v] compositions. +// Save it in temporary storage for the [u,v] part and do [f,g] first. + + movq %r8, %rax + andq %r9, %rax + movq %r10, %rdi + andq %r11, %rdi + addq %rax, %rdi + movq %rdi, tmp + + movq %r12, %rax + andq %r13, %rax + movq %r14, %rsi + andq %r15, %rsi + addq %rax, %rsi + movq %rsi, tmp2 + +// Now the computation of the updated f and g values. This maintains a +// 2-word carry between stages so we can conveniently insert the shift +// right by 59 before storing back, and not overwrite digits we need +// again of the old f and g values. 
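+//
+// Illustrative note (a restatement for the reader, not part of the verified
+// upstream code): writing the transition matrix produced by divstep59 as
+// [[m00,m01],[m10,m11]] (held in %r8, %r10, %r12, %r14; it is returned
+// negated, which only affects an overall sign), the digit stages below
+// effectively compute, over the 5-word signed values,
+//
+//     f' = (m00 * f + m01 * g) / 2^59
+//     g' = (m10 * f + m11 * g) / 2^59
+//
+// with the division by 2^59 realized by the shift right by 59 mentioned
+// above as each digit is stored back.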
+// +// Digit 0 of [f,g] + + xorl %ebx, %ebx + movq F(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rdi + adcq %rdx, %rbx + movq G(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rdi + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq F(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rbp + movq G(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + +// Digit 1 of [f,g] + + xorl %ecx, %ecx + movq F+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq G+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + shrdq $59, %rbx, %rdi + movq %rdi, F(%rsp) + + xorl %edi, %edi + movq F+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rbp + adcq %rdx, %rdi + movq G+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rdi + shrdq $59, %rbp, %rsi + movq %rsi, G(%rsp) + +// Digit 2 of [f,g] + + xorl %esi, %esi + movq F+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rsi + movq G+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rsi + shrdq $59, %rcx, %rbx + movq %rbx, F+N(%rsp) + + xorl %ebx, %ebx + movq F+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + addq %rax, %rdi + adcq %rdx, %rbx + movq G+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rdi + adcq %rdx, %rbx + shrdq $59, %rdi, %rbp + movq %rbp, G+N(%rsp) + +// Digits 3 and 4 of [f,g] + + movq F+3*N(%rsp), %rax + xorq %r9, %rax + movq F+4*N(%rsp), %rbp + xorq %r9, %rbp + andq %r8, %rbp + negq %rbp + mulq %r8 + addq %rax, %rsi + adcq %rdx, %rbp + movq G+3*N(%rsp), %rax + xorq %r11, %rax + movq G+4*N(%rsp), %rdx + xorq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbp + mulq %r10 + addq %rax, %rsi + adcq %rdx, %rbp + shrdq $59, %rsi, %rcx + movq %rcx, F+2*N(%rsp) + shrdq $59, %rbp, %rsi + sarq $59, %rbp + + movq F+3*N(%rsp), %rax + movq %rsi, F+3*N(%rsp) + + movq F+4*N(%rsp), %rsi + movq %rbp, F+4*N(%rsp) + + xorq %r13, %rax + xorq %r13, %rsi + andq %r12, %rsi + negq %rsi + mulq %r12 + addq %rax, %rbx + adcq %rdx, %rsi + movq G+3*N(%rsp), %rax + xorq %r15, %rax + movq G+4*N(%rsp), %rdx + xorq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rsi + mulq %r14 + addq %rax, %rbx + adcq %rdx, %rsi + shrdq $59, %rbx, %rdi + movq %rdi, G+2*N(%rsp) + shrdq $59, %rsi, %rbx + movq %rbx, G+3*N(%rsp) + sarq $59, %rsi + movq %rsi, G+4*N(%rsp) + +// Get the initial carries back from storage and do the [u,v] accumulation + + movq tmp, %rbx + movq tmp2, %rbp + +// Digit 0 of [u,v] + + xorl %ecx, %ecx + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rbx + adcq %rdx, %rcx + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V(%rsp) + +// Digit 1 of [u,v] + + xorl %ebx, %ebx + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rcx + adcq %rdx, %rbx + + xorl %ebp, %ebp + movq U+N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rcx, U+N(%rsp) + addq %rax, %rsi + adcq %rdx, %rbp + movq V+N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rsi + adcq %rdx, %rbp + movq %rsi, V+N(%rsp) + +// Digit 2 of [u,v] + + xorl %ecx, %ecx + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq 
%rax, %rbx + adcq %rdx, %rcx + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %rbx + adcq %rdx, %rcx + + xorl %esi, %esi + movq U+2*N(%rsp), %rax + xorq %r13, %rax + mulq %r12 + movq %rbx, U+2*N(%rsp) + addq %rax, %rbp + adcq %rdx, %rsi + movq V+2*N(%rsp), %rax + xorq %r15, %rax + mulq %r14 + addq %rax, %rbp + adcq %rdx, %rsi + movq %rbp, V+2*N(%rsp) + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + movq %r9, %rbx + andq %r8, %rbx + negq %rbx + mulq %r8 + addq %rax, %rcx + adcq %rdx, %rbx + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %rbx + mulq %r10 + addq %rax, %rcx + adcq %rbx, %rdx + +// Preload for last use of old u digit 3 + + movq U+3*N(%rsp), %rax + movq %rcx, U+3*N(%rsp) + movq %rdx, U+4*N(%rsp) + +// Digits 3 and 4 of v (top is unsigned) + + xorq %r13, %rax + movq %r13, %rcx + andq %r12, %rcx + negq %rcx + mulq %r12 + addq %rax, %rsi + adcq %rdx, %rcx + movq V+3*N(%rsp), %rax + xorq %r15, %rax + movq %r15, %rdx + andq %r14, %rdx + subq %rdx, %rcx + mulq %r14 + addq %rax, %rsi + adcq %rcx, %rdx + movq %rsi, V+3*N(%rsp) + movq %rdx, V+4*N(%rsp) + +// Montgomery reduction of u + + amontred(u) + +// Montgomery reduction of v + + amontred(v) + +midloop: + + divstep59(d,ff,gg) + movq %rsi, d + +// Next iteration + + decq i + jnz loop + +// The 10th and last iteration does not need anything except the +// u value and the sign of f; the latter can be obtained from the +// lowest word of f. So it's done differently from the main loop. +// Find the sign of the new f. For this we just need one digit +// since we know (for in-scope cases) that f is either +1 or -1. +// We don't explicitly shift right by 59 either, but looking at +// bit 63 (or any bit >= 60) of the unshifted result is enough +// to distinguish -1 from +1; this is then made into a mask. + + movq F(%rsp), %rax + movq G(%rsp), %rcx + imulq %r8, %rax + imulq %r10, %rcx + addq %rcx, %rax + sarq $63, %rax + +// Now separate out the matrix into sign-magnitude pairs +// and adjust each one based on the sign of f. +// +// Note that at this point we expect |f|=1 and we got its +// sign above, so then since [f,0] == x * 2^{-512} [u,v] (mod p_256) +// we want to flip the sign of u according to that of f. 
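+//
+// (Editorial note: since f == x * 2^{-512} * u (mod p_256) at this point,
+// f == +1 means u is already the required value, while f == -1 means the
+// required value is -u; xoring the sign of f into the masks below folds
+// that optional negation into the final application of the matrix row
+// to [u,v].)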
+ + movq %r8, %r9 + sarq $63, %r9 + xorq %r9, %r8 + subq %r9, %r8 + xorq %rax, %r9 + + movq %r10, %r11 + sarq $63, %r11 + xorq %r11, %r10 + subq %r11, %r10 + xorq %rax, %r11 + + movq %r12, %r13 + sarq $63, %r13 + xorq %r13, %r12 + subq %r13, %r12 + xorq %rax, %r13 + + movq %r14, %r15 + sarq $63, %r15 + xorq %r15, %r14 + subq %r15, %r14 + xorq %rax, %r15 + +// Adjust the initial value to allow for complement instead of negation + + movq %r8, %rax + andq %r9, %rax + movq %r10, %r12 + andq %r11, %r12 + addq %rax, %r12 + +// Digit 0 of [u] + + xorl %r13d, %r13d + movq U(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r12 + adcq %rdx, %r13 + movq V(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r12 + adcq %rdx, %r13 + +// Digit 1 of [u] + + xorl %r14d, %r14d + movq U+N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r13 + adcq %rdx, %r14 + movq V+N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r13 + adcq %rdx, %r14 + +// Digit 2 of [u] + + xorl %r15d, %r15d + movq U+2*N(%rsp), %rax + xorq %r9, %rax + mulq %r8 + addq %rax, %r14 + adcq %rdx, %r15 + movq V+2*N(%rsp), %rax + xorq %r11, %rax + mulq %r10 + addq %rax, %r14 + adcq %rdx, %r15 + +// Digits 3 and 4 of u (top is unsigned) + + movq U+3*N(%rsp), %rax + xorq %r9, %rax + andq %r8, %r9 + negq %r9 + mulq %r8 + addq %rax, %r15 + adcq %rdx, %r9 + movq V+3*N(%rsp), %rax + xorq %r11, %rax + movq %r11, %rdx + andq %r10, %rdx + subq %rdx, %r9 + mulq %r10 + addq %rax, %r15 + adcq %rdx, %r9 + +// Store back and Montgomery reduce u + + movq %r12, U(%rsp) + movq %r13, U+N(%rsp) + movq %r14, U+2*N(%rsp) + movq %r15, U+3*N(%rsp) + movq %r9, U+4*N(%rsp) + + amontred(u) + +// Perform final strict reduction mod p_256 and copy to output + + movq U(%rsp), %r8 + movq U+N(%rsp), %r9 + movq U+2*N(%rsp), %r10 + movq U+3*N(%rsp), %r11 + + movl $1, %eax + movl $0xffffffff, %ebx + leaq -2(%rax), %rcx + leaq -1(%rbx), %rdx + notq %rbx + + addq %r8, %rax + adcq %r9, %rbx + adcq %r10, %rcx + adcq %r11, %rdx + + cmovncq %r8, %rax + cmovncq %r9, %rbx + cmovncq %r10, %rcx + cmovncq %r11, %rdx + + movq res, %rdi + movq %rax, (%rdi) + movq %rbx, N(%rdi) + movq %rcx, 2*N(%rdi) + movq %rdx, 3*N(%rdi) + +// Restore stack and registers + + addq $NSPACE, %rsp + + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + +#if WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S b/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S new file mode 100644 index 0000000000..1a36a4c784 --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S @@ -0,0 +1,3550 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for P-256 +// Input scalar[4], point[12]; output res[12] +// +// extern void p256_montjscalarmul +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version p256_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. 
Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_256 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. +// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_256) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjscalarmul) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjscalarmul) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. Uppercase syntactic variants +// make x86_att version simpler to generate. + +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define res (31*NUMSIZE)(%rsp) + +#define NSPACE (32*NUMSIZE) + +S2N_BN_SYMBOL(p256_montjscalarmul): + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq p256_montjscalarmul_standard + popq %rsi + popq %rdi + ret + +p256_montjscalarmul_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + movq %rdx, %rbx + movq %rdi, res + +// Load the digits of group order n_256 = [%r15;%r14;%r13;%r12] + + movq $0xf3b9cac2fc632551, %r12 + movq $0xbce6faada7179e84, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xffffffff00000000, %r15 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + +// Now if the top bit of the reduced scalar is set, negate it mod n_256, +// i.e. do n |-> n_256 - n. Remember the sign in %rbp so we can +// correspondingly negate the point below. + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %r11, %rbp + shrq $63, %rbp + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. 
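+//
+// (Editorial note: writing the recoded value in radix 16, each lower 4-bit
+// window w stands for the signed digit w - 8 in the range -8..+7. The main
+// loop recovers it by subtracting 8 and uses the absolute value, in 0..8,
+// to select among the precomputed multiples built below (0 keeps the
+// all-zero point-at-infinity encoding) and the sign to negate the
+// selection; for example a window of 0x3 stands for the digit -5, i.e.
+// 5*P negated. The topmost window is consumed directly as an unsigned
+// index just after the table is built.)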
+ + movq $0x8888888888888888, %rax + addq %rax, %r8 + adcq %rax, %r9 + adcq %rax, %r10 + adcq %rax, %r11 + btc $63, %r11 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_256 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. + + movq (%rbx), %rax + movq %rax, TAB(%rsp) + movq 8(%rbx), %rax + movq %rax, TAB+8(%rsp) + movq 16(%rbx), %rax + movq %rax, TAB+16(%rsp) + movq 24(%rbx), %rax + movq %rax, TAB+24(%rsp) + + movq 32(%rbx), %r12 + movq %r12, %rax + movq 40(%rbx), %r13 + orq %r13, %rax + movq 48(%rbx), %r14 + movq %r14, %rcx + movq 56(%rbx), %r15 + orq %r15, %rcx + orq %rcx, %rax + cmovzq %rax, %rbp + + xorl %r10d, %r10d + leaq -1(%r10), %r8 + movq $0x00000000ffffffff, %r11 + movq %r11, %r9 + negq %r11 + subq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + sbbq %r15, %r11 + testq %rbp, %rbp + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + movq %r8, TAB+32(%rsp) + movq %r9, TAB+40(%rsp) + movq %r10, TAB+48(%rsp) + movq %r11, TAB+56(%rsp) + + movq 64(%rbx), %rax + movq %rax, TAB+64(%rsp) + movq 72(%rbx), %rax + movq %rax, TAB+72(%rsp) + movq 80(%rbx), %rax + movq %rax, TAB+80(%rsp) + movq 88(%rbx), %rax + movq %rax, TAB+88(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + leaq TAB+96*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq local_p256_montjdouble + + leaq TAB+96*2(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq local_p256_montjadd + + leaq TAB+96*3(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + callq local_p256_montjdouble + + leaq TAB+96*4(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq local_p256_montjadd + + leaq TAB+96*5(%rsp), %rdi + leaq TAB+96*2(%rsp), %rsi + callq local_p256_montjdouble + + leaq TAB+96*6(%rsp), %rdi + leaq TAB+96*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq local_p256_montjadd + + leaq TAB+96*7(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + callq local_p256_montjdouble + +// Set up accumulator as table entry for top 4 bits (constant-time indexing) + + movq SCALARB+24(%rsp), %rdi + shrq $60, %rdi + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + movq %rax, ACC(%rsp) + movq %rbx, ACC+8(%rsp) + movq %rcx, ACC+16(%rsp) + movq %rdx, ACC+24(%rsp) + movq %r8, ACC+32(%rsp) + movq %r9, ACC+40(%rsp) + movq %r10, ACC+48(%rsp) + movq %r11, ACC+56(%rsp) + movq %r12, ACC+64(%rsp) + movq %r13, ACC+72(%rsp) + movq %r14, ACC+80(%rsp) + movq %r15, ACC+88(%rsp) + +// Main loop over size-4 bitfield + + movl $252, %ebp + +loop: 
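+// (Editorial note: each iteration below multiplies the accumulator by 16
+// with four Jacobian doublings and then adds the selected table multiple,
+// or the point at infinity for a zero digit. The table entry is gathered
+// with a cmov scan over all eight entries and the optional negation of
+// the y coordinate is done with masking, so the memory access pattern
+// does not depend on the secret scalar.)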
+ subq $4, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq local_p256_montjdouble + + movq %rbp, %rax + shrq $6, %rax + movq (%rsp,%rax,8), %rdi + movq %rbp, %rcx + shrq %cl, %rdi + andq $15, %rdi + + subq $8, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + +// Store it to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_256 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq %r12, TABENT+64(%rsp) + movq %r13, TABENT+72(%rsp) + movq %r14, TABENT+80(%rsp) + movq %r15, TABENT+88(%rsp) + + movq %r8, %rax + xorl %r14d, %r14d + orq %r9, %rax + leaq -1(%r14), %r12 + movq %r10, %rcx + movq $0x00000000ffffffff, %r15 + orq %r11, %rcx + movq %r15, %r13 + negq %r15 + orq %rcx, %rax + cmovzq %rax, %rsi + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + testq %rsi, %rsi + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq local_p256_montjadd + + testq %rbp, %rbp + jne loop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. 
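+//
+// (The accumulator is copied out verbatim, so the result is returned as
+// a Jacobian triple with coordinates still in the Montgomery domain,
+// matching the convention described in the header comment; no affine
+// conversion or final normalization is performed here.)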
+ + movq res, %rdi + movq ACC(%rsp), %rax + movq %rax, (%rdi) + movq ACC+8(%rsp), %rax + movq %rax, 8(%rdi) + movq ACC+16(%rsp), %rax + movq %rax, 16(%rdi) + movq ACC+24(%rsp), %rax + movq %rax, 24(%rdi) + + movq ACC+32(%rsp), %rax + movq %rax, 32(%rdi) + movq ACC+40(%rsp), %rax + movq %rax, 40(%rdi) + movq ACC+48(%rsp), %rax + movq %rax, 48(%rdi) + movq ACC+56(%rsp), %rax + movq %rax, 56(%rdi) + + movq ACC+64(%rsp), %rax + movq %rax, 64(%rdi) + movq ACC+72(%rsp), %rax + movq %rax, 72(%rdi) + movq ACC+80(%rsp), %rax + movq %rax, 80(%rdi) + movq ACC+88(%rsp), %rax + movq %rax, 88(%rdi) + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +local_p256_montjadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xe0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x40(%rbp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rbp), %r9, %r10 + mulxq 0x58(%rbp), %r11, %r12 + movq 0x50(%rbp), %rdx + mulxq 0x58(%rbp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rbp), %rdx + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 
0x48(%rbp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rbp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rbp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsi), %rdx + mulxq 0x40(%rbp), %r8, %r9 + mulxq 0x48(%rbp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rbp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rbp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rbp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rbp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rbp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rbp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rbp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq 
%r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + xorl %r13d, %r13d + movq 0x20(%rbp), %rdx + mulxq 0x40(%rsi), %r8, %r9 + mulxq 0x48(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rbp), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rbp), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rbp), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %r13d, %r13d + movq 0x0(%rbp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rbp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq 
%r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rbp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rbp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + xorl %r13d, %r13d + movq (%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x8(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x10(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x18(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, 
%r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsp), %rdx + mulxq (%rsp), %r8, %r9 + mulxq 0x8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsp), %rdx + xorl %r14d, %r14d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsp), %rdx + xorl %r8d, %r8d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsp), %rdx + xorl %r9d, %r9d + mulxq (%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + xorl %r13d, %r13d + movq 0xc0(%rsp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0xc8(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 
0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0xd0(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0xd8(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0xc0(%rsp), %rax + movq 0x28(%rsp), %rcx + sbbq 0xc8(%rsp), %rcx + movq 0x30(%rsp), %r8 + sbbq 0xd0(%rsp), %r8 + movq 0x38(%rsp), %r9 + sbbq 0xd8(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0xa8(%rsp), %r9, %r10 + mulxq 0xb8(%rsp), %r11, %r12 + movq 0xb0(%rsp), %rdx + mulxq 0xb8(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0xb8(%rsp), %rdx + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0xa8(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0xb0(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0xb8(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + 
adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %r8d + leaq -0x1(%rdx), %rdx + leaq -0x1(%rcx), %rax + movl $0xfffffffe, %r11d + cmoveq %rcx, %r8 + cmoveq %rcx, %rdx + cmoveq %rcx, %rax + cmoveq %rcx, %r11 + addq %r8, %r12 + adcq %rdx, %r13 + adcq %rax, %r14 + adcq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ecx, %ecx + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + adoxq %rcx, %r14 + adcq %rcx, %r14 + xorl %ecx, %ecx + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rcx, %r15 + adoxq %rcx, %r15 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rcx, %r13 + movl %ecx, %r9d + adoxq %rcx, %r9 + adcxq %rcx, %r9 + addq %r9, %r14 + adcq %rcx, %r15 + movl %ecx, %r8d + adcq %rcx, %r8 + xorl %ecx, %ecx + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rcx, %r15 + adoxq %rcx, %r8 + adcq %rcx, %r8 + movl $0x1, %ebx + addq %r12, %rbx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rcx), %rcx + movq %rcx, %rax + adcq %r14, %rcx + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rbx, %r12 + cmovbq %rdx, %r13 + cmovbq %rcx, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq 0x80(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x88(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx 
+ adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x90(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x98(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq 
%rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x40(%rsi), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rsi), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rsi), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rsi), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, 
%r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + xorl %r13d, %r13d + movq 0xc0(%rsp), %rdx + mulxq 0x60(%rsp), %r8, %r9 + mulxq 0x68(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x70(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x78(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0xc8(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x78(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0xd0(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0xd8(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x70(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x78(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, 
%rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorl %r13d, %r13d + movq 0x40(%rbp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x48(%rbp), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x50(%rbp), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x58(%rbp), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + xorl %r13d, %r13d + movq 0x80(%rsp), %rdx + mulxq 0x20(%rsp), %r8, %r9 + mulxq 0x28(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x30(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x38(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x88(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x38(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, 
%r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x90(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x98(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x30(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x38(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %r8 + movq 0x48(%rsi), %r9 + movq 0x50(%rsi), %r10 + movq 0x58(%rsi), %r11 + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + movq 0x40(%rbp), %r12 + movq 0x48(%rbp), %r13 + movq 0x50(%rbp), %r14 + movq 0x58(%rbp), %r15 + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + cmpq %rax, %rbx + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + cmoveq 0xa0(%rsp), %r12 + cmoveq 0xa8(%rsp), %r13 + cmoveq 0xb0(%rsp), %r14 + cmoveq 0xb8(%rsp), %r15 + movq (%rsp), %rax + cmovbq (%rsi), %rax + cmova 0x0(%rbp), %rax + movq 0x8(%rsp), %rbx + cmovbq 0x8(%rsi), %rbx + cmova 0x8(%rbp), %rbx + movq 0x10(%rsp), %rcx + cmovbq 0x10(%rsi), %rcx + cmova 0x10(%rbp), %rcx + movq 0x18(%rsp), %rdx + cmovbq 0x18(%rsi), %rdx + cmova 0x18(%rbp), %rdx + movq 0x80(%rsp), %r8 + cmovbq 0x20(%rsi), %r8 + cmova 0x20(%rbp), %r8 + movq 0x88(%rsp), %r9 + cmovbq 0x28(%rsi), %r9 + cmova 0x28(%rbp), %r9 + movq 0x90(%rsp), %r10 + cmovbq 0x30(%rsi), %r10 + cmova 0x30(%rbp), %r10 + movq 0x98(%rsp), %r11 + cmovbq 0x38(%rsi), %r11 + cmova 0x38(%rbp), %r11 + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + movq %r8, 0x20(%rdi) + movq %r9, 0x28(%rdi) + movq %r10, 0x30(%rdi) + movq %r11, 0x38(%rdi) + movq %r12, 0x40(%rdi) + movq %r13, 0x48(%rdi) + movq %r14, 0x50(%rdi) + movq %r15, 0x58(%rdi) + addq $0xe0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + 
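+// Local copy of the P-256 point doubling routine on Montgomery-Jacobian
+// coordinates (corresponding to p256_montjdouble), called from the
+// scalar multiplication code above.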
+local_p256_montjdouble: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq 0x40(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsi), %r9, %r10 + mulxq 0x58(%rsi), %r11, %r12 + movq 0x50(%rsi), %rdx + mulxq 0x58(%rsi), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x40(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsi), %rdx + mulxq 0x48(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rsi), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsi), %r9, %r10 + mulxq 0x38(%rsi), %r11, %r12 + movq 0x30(%rsi), %rdx + mulxq 0x38(%rsi), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x20(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsi), %rdx + mulxq 0x28(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsi), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsi), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + 
adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsi), %rax + subq (%rsp), %rax + movq 0x8(%rsi), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq (%rsi), %rax + addq (%rsp), %rax + movq 0x8(%rsi), %rcx + adcq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + adcq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + subq %r11, %rax + movq %rax, 0x40(%rsp) + sbbq %r10, %rcx + movq %rcx, 0x48(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x50(%rsp) + sbbq %rdx, %r9 + movq %r9, 0x58(%rsp) + xorl %r13d, %r13d + movq 0x60(%rsp), %rdx + mulxq 0x40(%rsp), %r8, %r9 + mulxq 0x48(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x50(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x58(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x68(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x58(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x70(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x58(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x78(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x50(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x58(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + 
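+// (the not/lea pair above turns 0x100000000 into 0xffffffff00000001,
+// the top word of p_256, for the next Montgomery reduction step)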
mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorq %r11, %r11 + movq 0x20(%rsi), %rax + addq 0x40(%rsi), %rax + movq 0x28(%rsi), %rcx + adcq 0x48(%rsi), %rcx + movq 0x30(%rsi), %r8 + adcq 0x50(%rsi), %r8 + movq 0x38(%rsi), %r9 + adcq 0x58(%rsi), %r9 + adcq %r11, %r11 + subq $0xffffffffffffffff, %rax + movl $0xffffffff, %r10d + sbbq %r10, %rcx + sbbq $0x0, %r8 + movq $0xffffffff00000001, %rdx + sbbq %rdx, %r9 + sbbq $0x0, %r11 + andq %r11, %r10 + andq %r11, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + xorl %r13d, %r13d + movq 0x20(%rsp), %rdx + mulxq (%rsi), %r8, %r9 + mulxq 0x8(%rsi), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0x10(%rsi), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0x18(%rsi), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x28(%rsp), %rdx + xorl %r14d, %r14d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x18(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x30(%rsp), %rdx + xorl %r8d, %r8d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x38(%rsp), %rdx + xorl %r9d, %r9d + mulxq (%rsi), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0x8(%rsi), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0x10(%rsi), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0x18(%rsi), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x68(%rsp), %r9, %r10 
+ mulxq 0x78(%rsp), %r11, %r12 + movq 0x70(%rsp), %rdx + mulxq 0x78(%rsp), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x60(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x78(%rsp), %rdx + mulxq 0x68(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x68(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x70(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x78(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x48(%rsp), %r9, %r10 + mulxq 0x58(%rsp), %r11, %r12 + movq 0x50(%rsp), %rdx + mulxq 0x58(%rsp), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x40(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x58(%rsp), %rdx + mulxq 0x48(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x48(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x50(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x58(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, %r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, 
%r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq $0xffffffffffffffff, %r8 + xorl %r10d, %r10d + subq 0xa0(%rsp), %r8 + movq $0xffffffff, %r9 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + movq $0xffffffff00000001, %r11 + sbbq 0xb8(%rsp), %r11 + xorl %r12d, %r12d + movq $0x9, %rdx + mulxq %r8, %r8, %rax + mulxq %r9, %r9, %rcx + addq %rax, %r9 + mulxq %r10, %r10, %rax + adcq %rcx, %r10 + mulxq %r11, %r11, %rcx + adcq %rax, %r11 + adcq %rcx, %r12 + movq $0xc, %rdx + xorl %eax, %eax + mulxq 0x80(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x88(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x90(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x98(%rsp), %rax, %rdx + adcxq %rax, %r11 + adoxq %r12, %rdx + adcq $0x1, %rdx + addq %rdx, %r8 + movq $0x100000000, %rax + mulxq %rax, %rax, %rcx + sbbq $0x0, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + movq $0xffffffff00000001, %rax + mulxq %rax, %rax, %rcx + sbbq %rax, %r11 + sbbq %rcx, %rdx + decq %rdx + movl $0xffffffff, %eax + andq %rdx, %rax + xorl %ecx, %ecx + subq %rax, %rcx + addq %rdx, %r8 + movq %r8, 0xa0(%rsp) + adcq %rax, %r9 + movq %r9, 0xa8(%rsp) + adcq $0x0, %r10 + movq %r10, 0xb0(%rsp) + adcq %rcx, %r11 + movq %r11, 0xb8(%rsp) + movq 0x40(%rsp), %rax + subq (%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rdx + mulxq %rdx, %r8, %r15 + mulxq 0x28(%rsp), %r9, %r10 + mulxq 0x38(%rsp), %r11, %r12 + movq 0x30(%rsp), %rdx + mulxq 0x38(%rsp), %r13, %r14 + xorl %ebp, %ebp + mulxq 0x20(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + movq 0x38(%rsp), %rdx + mulxq 0x28(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + adoxq %rbp, %r14 + adcq %rbp, %r14 + xorl %ebp, %ebp + adcxq %r9, %r9 + adoxq %r15, %r9 + movq 0x28(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r10, %r10 + adoxq %rax, %r10 + adcxq %r11, %r11 + adoxq %rdx, %r11 + movq 0x30(%rsp), %rdx + mulxq %rdx, %rax, %rdx + adcxq %r12, %r12 + adoxq %rax, %r12 + adcxq %r13, %r13 + adoxq %rdx, %r13 + movq 0x38(%rsp), %rdx + mulxq %rdx, %rax, %r15 + adcxq %r14, %r14 + adoxq %rax, %r14 + adcxq %rbp, %r15 + adoxq %rbp, %r15 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + movq $0xffffffff00000001, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %rbp, %r13 + movl %ebp, %r9d + adoxq %rbp, 
%r9 + adcxq %rbp, %r9 + addq %r9, %r14 + adcq %rbp, %r15 + movl %ebp, %r8d + adcq %rbp, %r8 + xorl %ebp, %ebp + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + movq $0xffffffff00000001, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %rbp, %r15 + adoxq %rbp, %r8 + adcq %rbp, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rdx), %rdx + adcq %r13, %rdx + leaq -0x1(%rbp), %rbp + movq %rbp, %rax + adcq %r14, %rbp + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %rbp, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + xorl %r13d, %r13d + movq 0x60(%rsp), %rdx + mulxq 0xa0(%rsp), %r8, %r9 + mulxq 0xa8(%rsp), %rbx, %r10 + adcq %rbx, %r9 + mulxq 0xb0(%rsp), %rbx, %r11 + adcq %rbx, %r10 + mulxq 0xb8(%rsp), %rbx, %r12 + adcq %rbx, %r11 + adcq %r13, %r12 + movq 0x68(%rsp), %rdx + xorl %r14d, %r14d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcq %r14, %r13 + xorl %r15d, %r15d + movq $0x100000000, %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r9 + adoxq %rbx, %r10 + mulxq %r9, %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r8, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r9, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adcxq %r15, %r13 + adoxq %r15, %r14 + adcq %r15, %r14 + movq 0x70(%rsp), %rdx + xorl %r8d, %r8d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r10 + adoxq %rbx, %r11 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + adoxq %r8, %r14 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r13 + adcq %rbx, %r14 + adcq %r8, %r15 + movq 0x78(%rsp), %rdx + xorl %r9d, %r9d + mulxq 0xa0(%rsp), %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq 0xa8(%rsp), %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + mulxq 0xb0(%rsp), %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + adoxq %r9, %r15 + mulxq 0xb8(%rsp), %rax, %rbx + adcq %rax, %r14 + adcq %rbx, %r15 + adcq %r9, %r8 + xorl %r9d, %r9d + movq $0x100000000, %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r11 + adoxq %rbx, %r12 + mulxq %r11, %rax, %rbx + adcxq %rax, %r12 + adoxq %rbx, %r13 + notq %rdx + leaq 0x2(%rdx), %rdx + mulxq %r10, %rax, %rbx + adcxq %rax, %r13 + adoxq %rbx, %r14 + mulxq %r11, %rax, %rbx + adcxq %rax, %r14 + adoxq %rbx, %r15 + adcxq %r9, %r15 + adoxq %r9, %r8 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rdx + adcq %r13, %rdx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rdx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rsp), %rax + subq 0x20(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x28(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x30(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x38(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 
0x40(%rdi) + adcq %r10, %rcx + movq %rcx, 0x48(%rdi) + adcq $0x0, %r8 + movq %r8, 0x50(%rdi) + adcq %rdx, %r9 + movq %r9, 0x58(%rdi) + movq 0x98(%rsp), %r11 + movq %r11, %rdx + movq 0x90(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0x88(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0x80(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + shrq $0x3e, %rdx + addq $0x1, %rdx + subq 0xa0(%rsp), %r8 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + sbbq 0xb8(%rsp), %r11 + sbbq $0x0, %rdx + addq %rdx, %r8 + movq $0x100000000, %rax + mulxq %rax, %rax, %rcx + sbbq $0x0, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + movq $0xffffffff00000001, %rax + mulxq %rax, %rax, %rcx + sbbq %rax, %r11 + sbbq %rcx, %rdx + decq %rdx + movl $0xffffffff, %eax + andq %rdx, %rax + xorl %ecx, %ecx + subq %rax, %rcx + addq %rdx, %r8 + movq %r8, (%rdi) + adcq %rax, %r9 + movq %r9, 0x8(%rdi) + adcq $0x0, %r10 + movq %r10, 0x10(%rdi) + adcq %rcx, %r11 + movq %r11, 0x18(%rdi) + movq $0xffffffffffffffff, %r8 + xorl %r10d, %r10d + subq (%rsp), %r8 + movq $0xffffffff, %r9 + sbbq 0x8(%rsp), %r9 + sbbq 0x10(%rsp), %r10 + movq $0xffffffff00000001, %r11 + sbbq 0x18(%rsp), %r11 + movq %r11, %r12 + shldq $0x3, %r10, %r11 + shldq $0x3, %r9, %r10 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + shrq $0x3d, %r12 + movq $0x3, %rdx + xorl %eax, %eax + mulxq 0x60(%rsp), %rax, %rcx + adcxq %rax, %r8 + adoxq %rcx, %r9 + mulxq 0x68(%rsp), %rax, %rcx + adcxq %rax, %r9 + adoxq %rcx, %r10 + mulxq 0x70(%rsp), %rax, %rcx + adcxq %rax, %r10 + adoxq %rcx, %r11 + mulxq 0x78(%rsp), %rax, %rdx + adcxq %rax, %r11 + adoxq %r12, %rdx + adcq $0x1, %rdx + addq %rdx, %r8 + movq $0x100000000, %rax + mulxq %rax, %rax, %rcx + sbbq $0x0, %rax + sbbq $0x0, %rcx + subq %rax, %r9 + sbbq %rcx, %r10 + movq $0xffffffff00000001, %rax + mulxq %rax, %rax, %rcx + sbbq %rax, %r11 + sbbq %rcx, %rdx + decq %rdx + movl $0xffffffff, %eax + andq %rdx, %rax + xorl %ecx, %ecx + subq %rax, %rcx + addq %rdx, %r8 + movq %r8, 0x20(%rdi) + adcq %rax, %r9 + movq %r9, 0x28(%rdi) + adcq $0x0, %r10 + movq %r10, 0x30(%rdi) + adcq %rcx, %r11 + movq %r11, 0x38(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif diff --git a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S b/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S new file mode 100644 index 0000000000..51d55eee86 --- /dev/null +++ b/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S @@ -0,0 +1,4707 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + +// ---------------------------------------------------------------------------- +// Montgomery-Jacobian form scalar multiplication for P-256 +// Input scalar[4], point[12]; output res[12] +// +// extern void p256_montjscalarmul_alt +// (uint64_t res[static 12], +// uint64_t scalar[static 4], +// uint64_t point[static 12]); +// +// This function is a variant of its affine point version p256_scalarmul. +// Here, input and output points are assumed to be in Jacobian form with +// their coordinates in the Montgomery domain. Thus, if priming indicates +// Montgomery form, x' = (2^256 * x) mod p_256 etc., each point argument +// is a triple (x',y',z') representing the affine point (x/z^2,y/z^3) when +// z' is nonzero or the point at infinity (group identity) if z' = 0. 
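+// For example, an affine point (x,y) taken with z = 1 is represented by
+// the triple (2^256 * x mod p_256, 2^256 * y mod p_256, 2^256 mod p_256).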
+// +// Given scalar = n and point = P, assumed to be on the NIST elliptic +// curve P-256, returns a representation of n * P. If the result is the +// point at infinity (either because the input point was or because the +// scalar was a multiple of p_256) then the output is guaranteed to +// represent the point at infinity, i.e. to have its z coordinate zero. +// +// Standard x86-64 ABI: RDI = res, RSI = scalar, RDX = point +// Microsoft x64 ABI: RCX = res, RDX = scalar, R8 = point +// ---------------------------------------------------------------------------- + +#include "_internal_s2n_bignum.h" + + + S2N_BN_SYM_VISIBILITY_DIRECTIVE(p256_montjscalarmul_alt) + S2N_BN_SYM_PRIVACY_DIRECTIVE(p256_montjscalarmul_alt) + + .text + .balign 4 + +// Size of individual field elements + +#define NUMSIZE 32 + +// Intermediate variables on the stack. Uppercase syntactic variants +// make x86_att version simpler to generate. + +#define SCALARB (0*NUMSIZE) +#define scalarb (0*NUMSIZE)(%rsp) +#define ACC (1*NUMSIZE) +#define acc (1*NUMSIZE)(%rsp) +#define TABENT (4*NUMSIZE) +#define tabent (4*NUMSIZE)(%rsp) + +#define TAB (7*NUMSIZE) +#define tab (7*NUMSIZE)(%rsp) + +#define res (31*NUMSIZE)(%rsp) + +#define NSPACE (32*NUMSIZE) + +S2N_BN_SYMBOL(p256_montjscalarmul_alt): + +// The Windows version literally calls the standard ABI version. +// This simplifies the proofs since subroutine offsets are fixed. + +#if WINDOWS_ABI + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + callq p256_montjscalarmul_alt_standard + popq %rsi + popq %rdi + ret + +p256_montjscalarmul_alt_standard: +#endif + +// Real start of the standard ABI code. + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + + subq $NSPACE, %rsp + +// Preserve the "res" and "point" input arguments. We load and process the +// scalar immediately so we don't bother preserving that input argument. +// Also, "point" is only needed early on and so its register gets re-used. + + movq %rdx, %rbx + movq %rdi, res + +// Load the digits of group order n_256 = [%r15;%r14;%r13;%r12] + + movq $0xf3b9cac2fc632551, %r12 + movq $0xbce6faada7179e84, %r13 + movq $0xffffffffffffffff, %r14 + movq $0xffffffff00000000, %r15 + +// First, reduce the input scalar mod n_256, i.e. conditionally subtract n_256 + + movq (%rsi), %r8 + subq %r12, %r8 + movq 8(%rsi), %r9 + sbbq %r13, %r9 + movq 16(%rsi), %r10 + sbbq %r14, %r10 + movq 24(%rsi), %r11 + sbbq %r15, %r11 + + cmovcq (%rsi), %r8 + cmovcq 8(%rsi), %r9 + cmovcq 16(%rsi), %r10 + cmovcq 24(%rsi), %r11 + +// Now if the top bit of the reduced scalar is set, negate it mod n_256, +// i.e. do n |-> n_256 - n. Remember the sign in %rbp so we can +// correspondingly negate the point below. + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + movq %r11, %rbp + shrq $63, %rbp + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + +// In either case then add the recoding constant 0x08888...888 to allow +// signed digits. + + movq $0x8888888888888888, %rax + addq %rax, %r8 + adcq %rax, %r9 + adcq %rax, %r10 + adcq %rax, %r11 + btc $63, %r11 + + movq %r8, SCALARB(%rsp) + movq %r9, SCALARB+8(%rsp) + movq %r10, SCALARB+16(%rsp) + movq %r11, SCALARB+24(%rsp) + +// Set the tab[0] table entry to the input point = 1 * P, except +// that we negate it if the top bit of the scalar was set. 
This +// negation takes care over the y = 0 case to maintain all the +// coordinates < p_256 throughout, even though triples (x,y,z) +// with y = 0 can only represent a point on the curve when z = 0 +// and it represents the point at infinity regardless of x and y. + + movq (%rbx), %rax + movq %rax, TAB(%rsp) + movq 8(%rbx), %rax + movq %rax, TAB+8(%rsp) + movq 16(%rbx), %rax + movq %rax, TAB+16(%rsp) + movq 24(%rbx), %rax + movq %rax, TAB+24(%rsp) + + movq 32(%rbx), %r12 + movq %r12, %rax + movq 40(%rbx), %r13 + orq %r13, %rax + movq 48(%rbx), %r14 + movq %r14, %rcx + movq 56(%rbx), %r15 + orq %r15, %rcx + orq %rcx, %rax + cmovzq %rax, %rbp + + xorl %r10d, %r10d + leaq -1(%r10), %r8 + movq $0x00000000ffffffff, %r11 + movq %r11, %r9 + negq %r11 + subq %r12, %r8 + sbbq %r13, %r9 + sbbq %r14, %r10 + sbbq %r15, %r11 + testq %rbp, %rbp + cmovzq %r12, %r8 + cmovzq %r13, %r9 + cmovzq %r14, %r10 + cmovzq %r15, %r11 + movq %r8, TAB+32(%rsp) + movq %r9, TAB+40(%rsp) + movq %r10, TAB+48(%rsp) + movq %r11, TAB+56(%rsp) + + movq 64(%rbx), %rax + movq %rax, TAB+64(%rsp) + movq 72(%rbx), %rax + movq %rax, TAB+72(%rsp) + movq 80(%rbx), %rax + movq %rax, TAB+80(%rsp) + movq 88(%rbx), %rax + movq %rax, TAB+88(%rsp) + +// Compute and record tab[1] = 2 * p, ..., tab[7] = 8 * P + + leaq TAB+96*1(%rsp), %rdi + leaq TAB(%rsp), %rsi + callq local_p256_montjdouble + + leaq TAB+96*2(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq local_p256_montjadd + + leaq TAB+96*3(%rsp), %rdi + leaq TAB+96*1(%rsp), %rsi + callq local_p256_montjdouble + + leaq TAB+96*4(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq local_p256_montjadd + + leaq TAB+96*5(%rsp), %rdi + leaq TAB+96*2(%rsp), %rsi + callq local_p256_montjdouble + + leaq TAB+96*6(%rsp), %rdi + leaq TAB+96*5(%rsp), %rsi + leaq TAB(%rsp), %rdx + callq local_p256_montjadd + + leaq TAB+96*7(%rsp), %rdi + leaq TAB+96*3(%rsp), %rsi + callq local_p256_montjdouble + +// Set up accumulator as table entry for top 4 bits (constant-time indexing) + + movq SCALARB+24(%rsp), %rdi + shrq $60, %rdi + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + movq %rax, ACC(%rsp) + movq %rbx, ACC+8(%rsp) + movq %rcx, ACC+16(%rsp) + movq %rdx, ACC+24(%rsp) + movq %r8, ACC+32(%rsp) + movq %r9, ACC+40(%rsp) + movq %r10, ACC+48(%rsp) + movq %r11, ACC+56(%rsp) + movq %r12, ACC+64(%rsp) + movq %r13, ACC+72(%rsp) + movq %r14, ACC+80(%rsp) + movq %r15, ACC+88(%rsp) + +// Main loop over size-4 bitfield + + movl $252, %ebp + +loop: + subq $4, %rbp + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq local_p256_montjdouble + + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq local_p256_montjdouble + + movq %rbp, %rax + shrq $6, 
%rax + movq (%rsp,%rax,8), %rdi + movq %rbp, %rcx + shrq %cl, %rdi + andq $15, %rdi + + subq $8, %rdi + sbbq %rsi, %rsi // %rsi = sign of digit (-1 = negative) + xorq %rsi, %rdi + subq %rsi, %rdi // %rdi = absolute value of digit + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %r8d, %r8d + xorl %r9d, %r9d + xorl %r10d, %r10d + xorl %r11d, %r11d + xorl %r12d, %r12d + xorl %r13d, %r13d + xorl %r14d, %r14d + xorl %r15d, %r15d + + .set I, 1 +.rep 8 + cmpq $I, %rdi + + cmovzq TAB+96*(I-1)(%rsp), %rax + cmovzq TAB+96*(I-1)+8(%rsp), %rbx + cmovzq TAB+96*(I-1)+16(%rsp), %rcx + cmovzq TAB+96*(I-1)+24(%rsp), %rdx + cmovzq TAB+96*(I-1)+32(%rsp), %r8 + cmovzq TAB+96*(I-1)+40(%rsp), %r9 + cmovzq TAB+96*(I-1)+48(%rsp), %r10 + cmovzq TAB+96*(I-1)+56(%rsp), %r11 + cmovzq TAB+96*(I-1)+64(%rsp), %r12 + cmovzq TAB+96*(I-1)+72(%rsp), %r13 + cmovzq TAB+96*(I-1)+80(%rsp), %r14 + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + .set I, (I+1) +.endr + +// Store it to "tabent" with the y coordinate optionally negated +// Again, do it carefully to give coordinates < p_256 even in +// the degenerate case y = 0 (when z = 0 for points on the curve). + + movq %rax, TABENT(%rsp) + movq %rbx, TABENT+8(%rsp) + movq %rcx, TABENT+16(%rsp) + movq %rdx, TABENT+24(%rsp) + + movq %r12, TABENT+64(%rsp) + movq %r13, TABENT+72(%rsp) + movq %r14, TABENT+80(%rsp) + movq %r15, TABENT+88(%rsp) + + movq %r8, %rax + xorl %r14d, %r14d + orq %r9, %rax + leaq -1(%r14), %r12 + movq %r10, %rcx + movq $0x00000000ffffffff, %r15 + orq %r11, %rcx + movq %r15, %r13 + negq %r15 + orq %rcx, %rax + cmovzq %rax, %rsi + + subq %r8, %r12 + sbbq %r9, %r13 + sbbq %r10, %r14 + sbbq %r11, %r15 + + testq %rsi, %rsi + cmovnzq %r12, %r8 + cmovnzq %r13, %r9 + cmovnzq %r14, %r10 + cmovnzq %r15, %r11 + + movq %r8, TABENT+32(%rsp) + movq %r9, TABENT+40(%rsp) + movq %r10, TABENT+48(%rsp) + movq %r11, TABENT+56(%rsp) + + leaq TABENT(%rsp), %rdx + leaq ACC(%rsp), %rsi + leaq ACC(%rsp), %rdi + callq local_p256_montjadd + + testq %rbp, %rbp + jne loop + +// That's the end of the main loop, and we just need to copy the +// result in "acc" to the output. 
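+// The copy below writes the whole Jacobian triple (x',y',z') as twelve
+// consecutive 64-bit words at res.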
+ + movq res, %rdi + movq ACC(%rsp), %rax + movq %rax, (%rdi) + movq ACC+8(%rsp), %rax + movq %rax, 8(%rdi) + movq ACC+16(%rsp), %rax + movq %rax, 16(%rdi) + movq ACC+24(%rsp), %rax + movq %rax, 24(%rdi) + + movq ACC+32(%rsp), %rax + movq %rax, 32(%rdi) + movq ACC+40(%rsp), %rax + movq %rax, 40(%rdi) + movq ACC+48(%rsp), %rax + movq %rax, 48(%rdi) + movq ACC+56(%rsp), %rax + movq %rax, 56(%rdi) + + movq ACC+64(%rsp), %rax + movq %rax, 64(%rdi) + movq ACC+72(%rsp), %rax + movq %rax, 72(%rdi) + movq ACC+80(%rsp), %rax + movq %rax, 80(%rdi) + movq ACC+88(%rsp), %rax + movq %rax, 88(%rdi) + +// Restore stack and registers and return + + addq $NSPACE, %rsp + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + ret + +// Local copies of subroutines, complete clones at the moment + +local_p256_montjadd: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xe0, %rsp + movq %rdx, %rbp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x40(%rbp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rbp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, 
%r10 + movq 0x58(%rbp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rbp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rbp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rbp), %rbx + movq 0x48(%rbp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rbp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rbp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rbp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x20(%rsi), %rbx + movq 0x40(%rbp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rbp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rbp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rbp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsi), %rbx + xorl %r13d, %r13d + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x48(%rbp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rbp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rbp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq 
%rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsi), %rbx + xorl %r15d, %r15d + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rbp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rbp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rbp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsi), %rbx + xorl %r8d, %r8d + movq 0x40(%rbp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rbp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rbp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rbp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x20(%rbp), %rbx + movq 0x40(%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rbp), %rbx + xorl %r13d, %r13d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rbp), %rbx + xorl %r15d, %r15d + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rbp), %rbx + xorl %r8d, %r8d + movq 0x40(%rsi), %rax + mulq %rbx + addq 
%rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0x0(%rbp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rbp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rbp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rbp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, 
%rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq (%rsi), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x8(%rsi), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x10(%rsi), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x18(%rsi), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq 
%r15, 0x98(%rsp) + movq 0x20(%rsp), %rbx + movq (%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsp), %rbx + xorl %r13d, %r13d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rbx + xorl %r15d, %r15d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rbx + xorl %r8d, %r8d + movq (%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq 0xc0(%rsp), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0xc8(%rsp), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, 
%r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xd0(%rsp), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xd8(%rsp), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xc0(%rsp) + movq %r13, 0xc8(%rsp) + movq %r14, 0xd0(%rsp) + movq %r15, 0xd8(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0xa0(%rsp) + adcq %r10, %rcx + movq %rcx, 0xa8(%rsp) + adcq $0x0, %r8 + movq %r8, 0xb0(%rsp) + adcq %rdx, %r9 + movq %r9, 0xb8(%rsp) + movq 0x20(%rsp), %rax + subq 0xc0(%rsp), %rax + movq 0x28(%rsp), %rcx + sbbq 0xc8(%rsp), %rcx + movq 0x30(%rsp), %r8 + sbbq 0xd0(%rsp), %r8 + movq 0x38(%rsp), %r9 + sbbq 0xd8(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x20(%rsp) + adcq %r10, %rcx + movq %rcx, 0x28(%rsp) + adcq $0x0, %r8 + movq %r8, 0x30(%rsp) + adcq %rdx, %r9 + movq %r9, 0x38(%rsp) + movq 0xa0(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0xa8(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0xb8(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + 
movq %rdx, %r12 + movq 0xb0(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0xb8(%rsp), %rbx + movq 0xa8(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0xa8(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0xb0(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0xb8(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq 
%rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x80(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x88(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x90(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x98(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + 
leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x40(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 
0x58(%rsp) + movq (%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x40(%rsp), %rax + subq 0x80(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x88(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x90(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x98(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq 0x40(%rsi), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rsi), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rsi), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rsi), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, 
%ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq (%rsp), %rax + subq 0x40(%rsp), %rax + movq 0x8(%rsp), %rcx + sbbq 0x48(%rsp), %rcx + movq 0x10(%rsp), %r8 + sbbq 0x50(%rsp), %r8 + movq 0x18(%rsp), %r9 + sbbq 0x58(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, (%rsp) + adcq %r10, %rcx + movq %rcx, 0x8(%rsp) + adcq $0x0, %r8 + movq %r8, 0x10(%rsp) + adcq %rdx, %r9 + movq %r9, 0x18(%rsp) + movq 0x80(%rsp), %rax + subq (%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0xc0(%rsp), %rbx + movq 0x60(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x70(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x78(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0xc8(%rsp), %rbx + xorl %r13d, %r13d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0xd0(%rsp), %rbx + xorl %r15d, %r15d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0xd8(%rsp), %rbx + xorl %r8d, %r8d + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x68(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x70(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x78(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + 
subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rbp), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x48(%rbp), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x50(%rbp), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x58(%rbp), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq 
%r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x80(%rsp), %rbx + movq 0x20(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x30(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x38(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x88(%rsp), %rbx + xorl %r13d, %r13d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x90(%rsp), %rbx + xorl %r15d, %r15d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x98(%rsp), %rbx + xorl %r8d, %r8d + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x28(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x30(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x38(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x80(%rsp), %rax + subq 0x60(%rsp), %rax + movq 0x88(%rsp), %rcx + sbbq 0x68(%rsp), %rcx + movq 0x90(%rsp), %r8 + sbbq 0x70(%rsp), %r8 + movq 0x98(%rsp), %r9 + sbbq 0x78(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x80(%rsp) + adcq %r10, %rcx + movq %rcx, 0x88(%rsp) + adcq $0x0, %r8 + movq %r8, 0x90(%rsp) + adcq %rdx, %r9 + movq %r9, 0x98(%rsp) + movq 0x40(%rsi), %r8 + 
movq 0x48(%rsi), %r9 + movq 0x50(%rsi), %r10 + movq 0x58(%rsi), %r11 + movq %r8, %rax + movq %r9, %rdx + orq %r10, %rax + orq %r11, %rdx + orq %rdx, %rax + negq %rax + sbbq %rax, %rax + movq 0x40(%rbp), %r12 + movq 0x48(%rbp), %r13 + movq 0x50(%rbp), %r14 + movq 0x58(%rbp), %r15 + movq %r12, %rbx + movq %r13, %rdx + orq %r14, %rbx + orq %r15, %rdx + orq %rdx, %rbx + negq %rbx + sbbq %rbx, %rbx + cmpq %rax, %rbx + cmovbq %r8, %r12 + cmovbq %r9, %r13 + cmovbq %r10, %r14 + cmovbq %r11, %r15 + cmoveq 0xa0(%rsp), %r12 + cmoveq 0xa8(%rsp), %r13 + cmoveq 0xb0(%rsp), %r14 + cmoveq 0xb8(%rsp), %r15 + movq (%rsp), %rax + cmovbq (%rsi), %rax + cmova 0x0(%rbp), %rax + movq 0x8(%rsp), %rbx + cmovbq 0x8(%rsi), %rbx + cmova 0x8(%rbp), %rbx + movq 0x10(%rsp), %rcx + cmovbq 0x10(%rsi), %rcx + cmova 0x10(%rbp), %rcx + movq 0x18(%rsp), %rdx + cmovbq 0x18(%rsi), %rdx + cmova 0x18(%rbp), %rdx + movq 0x80(%rsp), %r8 + cmovbq 0x20(%rsi), %r8 + cmova 0x20(%rbp), %r8 + movq 0x88(%rsp), %r9 + cmovbq 0x28(%rsi), %r9 + cmova 0x28(%rbp), %r9 + movq 0x90(%rsp), %r10 + cmovbq 0x30(%rsi), %r10 + cmova 0x30(%rbp), %r10 + movq 0x98(%rsp), %r11 + cmovbq 0x38(%rsi), %r11 + cmova 0x38(%rbp), %r11 + movq %rax, (%rdi) + movq %rbx, 0x8(%rdi) + movq %rcx, 0x10(%rdi) + movq %rdx, 0x18(%rdi) + movq %r8, 0x20(%rdi) + movq %r9, 0x28(%rdi) + movq %r10, 0x30(%rdi) + movq %r11, 0x38(%rdi) + movq %r12, 0x40(%rdi) + movq %r13, 0x48(%rdi) + movq %r14, 0x50(%rdi) + movq %r15, 0x58(%rdi) + addq $0xe0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + ret + +local_p256_montjdouble: + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $0xc0, %rsp + movq 0x40(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsi), %rbx + movq 0x48(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, 
%rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x20(%rsi), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsi), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsi), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsi), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsi), %rbx + movq 0x28(%rsi), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsi), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsi), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x20(%rsp) + movq %r13, 0x28(%rsp) + movq %r14, 0x30(%rsp) + movq %r15, 0x38(%rsp) + movq (%rsi), %rax + subq (%rsp), %rax + movq 0x8(%rsi), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x10(%rsi), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x60(%rsp) + adcq %r10, %rcx + movq %rcx, 0x68(%rsp) + adcq $0x0, %r8 + movq %r8, 0x70(%rsp) + adcq %rdx, %r9 + movq %r9, 0x78(%rsp) + movq (%rsi), %rax + addq (%rsp), %rax + movq 0x8(%rsi), %rcx + adcq 0x8(%rsp), 
%rcx + movq 0x10(%rsi), %r8 + adcq 0x10(%rsp), %r8 + movq 0x18(%rsi), %r9 + adcq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + subq %r11, %rax + movq %rax, 0x40(%rsp) + sbbq %r10, %rcx + movq %rcx, 0x48(%rsp) + sbbq $0x0, %r8 + movq %r8, 0x50(%rsp) + sbbq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x60(%rsp), %rbx + movq 0x40(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x48(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x50(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x58(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x68(%rsp), %rbx + xorl %r13d, %r13d + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x48(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x50(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x58(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rbx + xorl %r15d, %r15d + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x48(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x50(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x58(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rbx + xorl %r8d, %r8d + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x48(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x50(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x58(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + xorq %r11, %r11 + movq 0x20(%rsi), %rax + addq 0x40(%rsi), %rax + movq 0x28(%rsi), %rcx + adcq 0x48(%rsi), %rcx + movq 0x30(%rsi), %r8 + adcq 0x50(%rsi), %r8 + movq 0x38(%rsi), %r9 + adcq 0x58(%rsi), %r9 + 
adcq %r11, %r11 + subq $0xffffffffffffffff, %rax + movl $0xffffffff, %r10d + sbbq %r10, %rcx + sbbq $0x0, %r8 + movabsq $0xffffffff00000001, %rdx + sbbq %rdx, %r9 + sbbq $0x0, %r11 + andq %r11, %r10 + andq %r11, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rbx + movq (%rsi), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0x10(%rsi), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0x18(%rsi), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x28(%rsp), %rbx + xorl %r13d, %r13d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x30(%rsp), %rbx + xorl %r15d, %r15d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r8, %r8 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x38(%rsp), %rbx + xorl %r8d, %r8d + movq (%rsi), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0x8(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0x10(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0x18(%rsi), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x80(%rsp) + movq %r13, 0x88(%rsp) + movq %r14, 0x90(%rsp) + movq %r15, 0x98(%rsp) + movq 0x60(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x68(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x78(%rsp), %rax + movq %rax, %r13 + 
mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x70(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x60(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x68(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x78(%rsp), %rbx + movq 0x68(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x68(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x70(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x78(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0xa0(%rsp) + movq %r13, 0xa8(%rsp) + movq %r14, 0xb0(%rsp) + movq %r15, 0xb8(%rsp) + movq 0x40(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x48(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x58(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x50(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x40(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x48(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq 0x58(%rsp), %rbx + movq 0x48(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x48(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x50(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x58(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, 
%r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x40(%rsp) + movq %r13, 0x48(%rsp) + movq %r14, 0x50(%rsp) + movq %r15, 0x58(%rsp) + movq $0xffffffffffffffff, %r9 + xorl %r11d, %r11d + subq 0xa0(%rsp), %r9 + movabsq $0xffffffff, %r10 + sbbq 0xa8(%rsp), %r10 + sbbq 0xb0(%rsp), %r11 + movabsq $0xffffffff00000001, %r12 + sbbq 0xb8(%rsp), %r12 + movq $0x9, %rcx + movq %r9, %rax + mulq %rcx + movq %rax, %r8 + movq %rdx, %r9 + movq %r10, %rax + xorl %r10d, %r10d + mulq %rcx + addq %rax, %r9 + adcq %rdx, %r10 + movq %r11, %rax + xorl %r11d, %r11d + mulq %rcx + addq %rax, %r10 + adcq %rdx, %r11 + movq %r12, %rax + xorl %r12d, %r12d + mulq %rcx + addq %rax, %r11 + adcq %rdx, %r12 + movl $0xc, %ecx + movq 0x80(%rsp), %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbx, %rbx + movq 0x88(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbx, %rbx + movq 0x90(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbx, %rbx + movq 0x98(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + leaq 0x1(%r12), %rcx + movabsq $0xffffffff00000001, %rax + mulq %rcx + movq %rcx, %rbx + shlq $0x20, %rbx + addq %rcx, %r8 + sbbq $0x0, %rbx + subq %rbx, %r9 + sbbq $0x0, %r10 + sbbq %rax, %r11 + sbbq %rdx, %rcx + decq %rcx + movl $0xffffffff, %eax + andq %rcx, %rax + xorl %edx, %edx + subq %rax, %rdx + addq %rcx, %r8 + movq %r8, 0xa0(%rsp) + adcq %rax, %r9 + movq %r9, 0xa8(%rsp) + adcq $0x0, %r10 + movq %r10, 0xb0(%rsp) + adcq %rdx, %r11 + movq %r11, 0xb8(%rsp) + movq 0x40(%rsp), %rax + subq (%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x8(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x10(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x18(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x40(%rsp) + adcq %r10, %rcx + movq %rcx, 0x48(%rsp) + adcq $0x0, %r8 + movq %r8, 0x50(%rsp) + adcq %rdx, %r9 + movq %r9, 0x58(%rsp) + movq 0x20(%rsp), %rax + movq %rax, %rbx + mulq %rax + movq %rax, %r8 + movq %rdx, %r15 + movq 0x28(%rsp), %rax + mulq %rbx + movq %rax, %r9 + movq %rdx, %r10 + movq 0x38(%rsp), %rax + movq %rax, %r13 + mulq %rbx + movq %rax, %r11 + movq %rdx, %r12 + movq 0x30(%rsp), %rax + movq %rax, %rbx + mulq %r13 + movq %rax, %r13 + movq %rdx, %r14 + movq 0x20(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq 
%rdx, %r12 + sbbq %rcx, %rcx + movq 0x38(%rsp), %rbx + movq 0x28(%rsp), %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq $0x0, %r14 + xorl %ecx, %ecx + addq %r9, %r9 + adcq %r10, %r10 + adcq %r11, %r11 + adcq %r12, %r12 + adcq %r13, %r13 + adcq %r14, %r14 + adcq %rcx, %rcx + movq 0x28(%rsp), %rax + mulq %rax + addq %r15, %r9 + adcq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + movq 0x30(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r12 + adcq %rdx, %r13 + sbbq %r15, %r15 + movq 0x38(%rsp), %rax + mulq %rax + negq %r15 + adcq %rax, %r14 + adcq %rcx, %rdx + movq %rdx, %r15 + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rcx, %rcx + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + xorl %r8d, %r8d + movq %r9, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r8, %r14 + adcq %r8, %r15 + adcq %r8, %r8 + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + xorl %r9d, %r9d + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + leaq -0x1(%rbx), %rbx + adcq %r13, %rbx + leaq -0x1(%r9), %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, (%rsp) + movq %r13, 0x8(%rsp) + movq %r14, 0x10(%rsp) + movq %r15, 0x18(%rsp) + movq 0x60(%rsp), %rbx + movq 0xa0(%rsp), %rax + mulq %rbx + movq %rax, %r8 + movq %rdx, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + xorl %r10d, %r10d + addq %rax, %r9 + adcq %rdx, %r10 + movq 0xb0(%rsp), %rax + mulq %rbx + xorl %r11d, %r11d + addq %rax, %r10 + adcq %rdx, %r11 + movq 0xb8(%rsp), %rax + mulq %rbx + xorl %r12d, %r12d + addq %rax, %r11 + adcq %rdx, %r12 + movq 0x68(%rsp), %rbx + xorl %r13d, %r13d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r14, %r14 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r14, %r14 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r14, %r14 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r14, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + xorl %r14d, %r14d + movabsq $0x100000000, %rbx + movq %r8, %rax + mulq %rbx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r15, %r15 + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r8, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r15, %r15 + movq %r9, %rax + mulq %rbx + subq %r15, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + adcq %r14, %r14 + movq 0x70(%rsp), %rbx + xorl %r15d, %r15d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %r8, %r8 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r8, %r8 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + 
sbbq %r8, %r8 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r8, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + adcq %r15, %r15 + movq 0x78(%rsp), %rbx + xorl %r8d, %r8d + movq 0xa0(%rsp), %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %r9, %r9 + movq 0xa8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %r9, %r9 + movq 0xb0(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %r9, %r9 + movq 0xb8(%rsp), %rax + mulq %rbx + subq %r9, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r8, %r8 + xorl %r9d, %r9d + movabsq $0x100000000, %rbx + movq %r10, %rax + mulq %rbx + addq %rax, %r11 + adcq %rdx, %r12 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r12 + adcq %rdx, %r13 + sbbq %rcx, %rcx + notq %rbx + leaq 0x2(%rbx), %rbx + movq %r10, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r13 + adcq %rdx, %r14 + sbbq %rcx, %rcx + movq %r11, %rax + mulq %rbx + subq %rcx, %rdx + addq %rax, %r14 + adcq %rdx, %r15 + adcq %r9, %r8 + movl $0x1, %ecx + addq %r12, %rcx + decq %rbx + adcq %r13, %rbx + decq %r9 + movq %r9, %rax + adcq %r14, %r9 + movl $0xfffffffe, %r11d + adcq %r15, %r11 + adcq %r8, %rax + cmovbq %rcx, %r12 + cmovbq %rbx, %r13 + cmovbq %r9, %r14 + cmovbq %r11, %r15 + movq %r12, 0x60(%rsp) + movq %r13, 0x68(%rsp) + movq %r14, 0x70(%rsp) + movq %r15, 0x78(%rsp) + movq 0x40(%rsp), %rax + subq 0x20(%rsp), %rax + movq 0x48(%rsp), %rcx + sbbq 0x28(%rsp), %rcx + movq 0x50(%rsp), %r8 + sbbq 0x30(%rsp), %r8 + movq 0x58(%rsp), %r9 + sbbq 0x38(%rsp), %r9 + movl $0xffffffff, %r10d + sbbq %r11, %r11 + xorq %rdx, %rdx + andq %r11, %r10 + subq %r10, %rdx + addq %r11, %rax + movq %rax, 0x40(%rdi) + adcq %r10, %rcx + movq %rcx, 0x48(%rdi) + adcq $0x0, %r8 + movq %r8, 0x50(%rdi) + adcq %rdx, %r9 + movq %r9, 0x58(%rdi) + movq 0x98(%rsp), %r11 + movq %r11, %rcx + movq 0x90(%rsp), %r10 + shldq $0x2, %r10, %r11 + movq 0x88(%rsp), %r9 + shldq $0x2, %r9, %r10 + movq 0x80(%rsp), %r8 + shldq $0x2, %r8, %r9 + shlq $0x2, %r8 + shrq $0x3e, %rcx + addq $0x1, %rcx + subq 0xa0(%rsp), %r8 + sbbq 0xa8(%rsp), %r9 + sbbq 0xb0(%rsp), %r10 + sbbq 0xb8(%rsp), %r11 + sbbq $0x0, %rcx + movabsq $0xffffffff00000001, %rax + mulq %rcx + movq %rcx, %rbx + shlq $0x20, %rbx + addq %rcx, %r8 + sbbq $0x0, %rbx + subq %rbx, %r9 + sbbq $0x0, %r10 + sbbq %rax, %r11 + sbbq %rdx, %rcx + decq %rcx + movl $0xffffffff, %eax + andq %rcx, %rax + xorl %edx, %edx + subq %rax, %rdx + addq %rcx, %r8 + movq %r8, (%rdi) + adcq %rax, %r9 + movq %r9, 0x8(%rdi) + adcq $0x0, %r10 + movq %r10, 0x10(%rdi) + adcq %rdx, %r11 + movq %r11, 0x18(%rdi) + movq $0xffffffffffffffff, %r8 + xorl %r10d, %r10d + subq (%rsp), %r8 + movabsq $0xffffffff, %r9 + sbbq 0x8(%rsp), %r9 + sbbq 0x10(%rsp), %r10 + movabsq $0xffffffff00000001, %r11 + sbbq 0x18(%rsp), %r11 + movq %r11, %r12 + shldq $0x3, %r10, %r11 + shldq $0x3, %r9, %r10 + shldq $0x3, %r8, %r9 + shlq $0x3, %r8 + shrq $0x3d, %r12 + movl $0x3, %ecx + movq 0x60(%rsp), %rax + mulq %rcx + addq %rax, %r8 + adcq %rdx, %r9 + sbbq %rbx, %rbx + movq 0x68(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r9 + adcq %rdx, %r10 + sbbq %rbx, %rbx + movq 0x70(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r10 + adcq %rdx, %r11 + sbbq %rbx, %rbx + movq 0x78(%rsp), %rax + mulq %rcx + subq %rbx, %rdx + addq %rax, %r11 + adcq %rdx, %r12 + leaq 0x1(%r12), %rcx + movabsq $0xffffffff00000001, %rax + mulq %rcx + movq %rcx, %rbx + shlq $0x20, %rbx + addq %rcx, %r8 + sbbq $0x0, %rbx + subq %rbx, %r9 + sbbq $0x0, %r10 + sbbq 
%rax, %r11 + sbbq %rdx, %rcx + decq %rcx + movl $0xffffffff, %eax + andq %rcx, %rax + xorl %edx, %edx + subq %rax, %rdx + addq %rcx, %r8 + movq %r8, 0x20(%rdi) + adcq %rax, %r9 + movq %r9, 0x28(%rdi) + adcq $0x0, %r10 + movq %r10, 0x30(%rdi) + adcq %rdx, %r11 + movq %r11, 0x38(%rdi) + addq $0xc0, %rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack, "", %progbits +#endif From 537d8105887a1f91921058db89717838731434a1 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Thu, 22 Aug 2024 22:26:00 -0700 Subject: [PATCH 2/4] Use P-256 scalar multiplication from s2n-bignum where applicable This replaces the general (fresh, not precomputed, point) scalar multiplication with the corresponding function p256_montjscalarmul or p256_montjscalarmul_alt from s2n-bignum. --- crypto/fipsmodule/ec/p256-nistz.c | 56 ++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/crypto/fipsmodule/ec/p256-nistz.c b/crypto/fipsmodule/ec/p256-nistz.c index 2c8a8cb40a..ccb9db9aac 100644 --- a/crypto/fipsmodule/ec/p256-nistz.c +++ b/crypto/fipsmodule/ec/p256-nistz.c @@ -51,19 +51,6 @@ static const BN_ULONG ONE[P256_LIMBS] = { // Precomputed tables for the default generator #include "p256-nistz-table.h" -// Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in -// util.c for details -static crypto_word_t booth_recode_w5(crypto_word_t in) { - crypto_word_t s, d; - - s = ~((in >> 5) - 1); - d = (1 << 6) - in - 1; - d = (d & s) | (in & ~s); - d = (d >> 1) + (d & 1); - - return (d << 1) + (s & 1); -} - static crypto_word_t booth_recode_w7(crypto_word_t in) { crypto_word_t s, d; @@ -203,6 +190,46 @@ static void ecp_nistz256_mod_inverse_sqr_mont(BN_ULONG r[P256_LIMBS], // r = p * p_scalar + +#if defined(EC_P256_USE_S2N_BIGNUM) + +static void ecp_nistz256_windowed_mul(const EC_GROUP *group, P256_POINT *r, + const EC_JACOBIAN *p, + const EC_SCALAR *p_scalar) { + uint64_t s2n_point[12], s2n_result[12]; + + assert(p != NULL); + assert(p_scalar != NULL); + assert(group->field.N.width == P256_LIMBS); + + OPENSSL_memcpy(s2n_point,p->X.words,32); + OPENSSL_memcpy(s2n_point+4,p->Y.words,32); + OPENSSL_memcpy(s2n_point+8,p->Z.words,32); + + p256_montjscalarmul_selector(s2n_result,(uint64_t*)p_scalar,s2n_point); + + OPENSSL_memcpy(r->X,s2n_result,32); + OPENSSL_memcpy(r->Y,s2n_result+4,32); + OPENSSL_memcpy(r->Z,s2n_result+8,32); +} + +#else + +// Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in +// util.c for details +static crypto_word_t booth_recode_w5(crypto_word_t in) { + crypto_word_t s, d; + + s = ~((in >> 5) - 1); + d = (1 << 6) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + return (d << 1) + (s & 1); +} + +// r = p * p_scalar + static void ecp_nistz256_windowed_mul(const EC_GROUP *group, P256_POINT *r, const EC_JACOBIAN *p, const EC_SCALAR *p_scalar) { @@ -296,6 +323,9 @@ static void ecp_nistz256_windowed_mul(const EC_GROUP *group, P256_POINT *r, ecp_nistz256_point_add(r, r, aligned_h); } +#endif + + static crypto_word_t calc_first_wvalue(size_t *index, const uint8_t p_str[33]) { static const size_t kWindowSize = 7; static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; From 5b77e2fb04e07fc052346e5759fce6ac7d576b3d Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 23 Aug 2024 12:52:09 -0700 Subject: [PATCH 3/4] Make the new P-256 functions from s2n-bignum delocator-compatible This should make FIPS builds work. 
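In practice, "delocator-compatible" amounts to two mechanical rewrites, both visible in the hunks below: file-local labels such as loop: and midloop: are given a unique prefix derived from the enclosing global symbol, and assembler-level repetition (.rep/.set) is replaced by a C-preprocessor macro that is written out an explicit number of times, since the delocator does not accept repetitions or assembler macros. The stand-alone C fragment below is only a sketch of that second pattern as it applies to the table-selection step; the macro, table, and index names are illustrative and are not part of the patch, and the C equality test is not guaranteed branch-free by a compiler the way the cmp/csel sequence is in the assembly.

#include <stdint.h>
#include <stdio.h>

/* Pick table[idx-1] without an explicit loop: mask is all-ones exactly when
 * idx == I, so exactly one entry is OR-ed into acc (acc stays 0 otherwise).
 * This loosely mirrors the cmp/csel selection done by selectblock(I). */
#define SELECT_ENTRY(I)                                       \
  do {                                                        \
    uint64_t mask = (uint64_t)0 - (uint64_t)(idx == (I));     \
    acc |= table[(I) - 1] & mask;                             \
  } while (0)

int main(void) {
  uint64_t table[8] = {11, 22, 33, 44, 55, 66, 77, 88};
  uint64_t idx = 5;  /* 1-based index of the entry to select */
  uint64_t acc = 0;

  /* Written out explicitly, as with selectblock(1) ... selectblock(8) in the
   * hunks below, instead of an assembler .rep loop. */
  SELECT_ENTRY(1); SELECT_ENTRY(2); SELECT_ENTRY(3); SELECT_ENTRY(4);
  SELECT_ENTRY(5); SELECT_ENTRY(6); SELECT_ENTRY(7); SELECT_ENTRY(8);

  printf("%llu\n", (unsigned long long)acc);  /* prints 55 */
  return 0;
}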
--- .../s2n-bignum/arm/p256/bignum_montinv_p256.S | 8 +- .../s2n-bignum/arm/p256/p256_montjscalarmul.S | 123 +++++++++--------- .../arm/p256/p256_montjscalarmul_alt.S | 123 +++++++++--------- .../x86_att/p256/bignum_montinv_p256.S | 8 +- .../x86_att/p256/p256_montjscalarmul.S | 103 ++++++++------- .../x86_att/p256/p256_montjscalarmul_alt.S | 101 +++++++------- 6 files changed, 227 insertions(+), 239 deletions(-) diff --git a/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S b/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S index 059f77e9af..1a5a7a0ffc 100644 --- a/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S +++ b/third_party/s2n-bignum/arm/p256/bignum_montinv_p256.S @@ -820,9 +820,9 @@ S2N_BN_SYMBOL(bignum_montinv_p256): mov i, #10 mov d, #1 - b midloop + b bignum_montinv_p256_midloop -loop: +bignum_montinv_p256_loop: // Separate the matrix elements into sign-magnitude pairs @@ -1137,7 +1137,7 @@ loop: stp x1, x3, [v] stp x2, x5, [v+16] -midloop: +bignum_montinv_p256_midloop: mov x1, d ldr x2, [f] @@ -1148,7 +1148,7 @@ midloop: // Next iteration subs i, i, #1 - bne loop + bne bignum_montinv_p256_loop // The 10th and last iteration does not need anything except the // u value and the sign of f; the latter can be obtained from the diff --git a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S index 23bc20971e..4c7f21a6c7 100644 --- a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S +++ b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S @@ -56,6 +56,31 @@ #define NSPACE #(31*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmp x14, #(I); \ + ldp x12, x13, [x15]; \ + csel x0, x12, x0, eq; \ + csel x1, x13, x1, eq; \ + ldp x12, x13, [x15, #16]; \ + csel x2, x12, x2, eq; \ + csel x3, x13, x3, eq; \ + ldp x12, x13, [x15, #32]; \ + csel x4, x12, x4, eq; \ + csel x5, x13, x5, eq; \ + ldp x12, x13, [x15, #48]; \ + csel x6, x12, x6, eq; \ + csel x7, x13, x7, eq; \ + ldp x12, x13, [x15, #64]; \ + csel x8, x12, x8, eq; \ + csel x9, x13, x9, eq; \ + ldp x12, x13, [x15, #80]; \ + csel x10, x12, x10, eq; \ + csel x11, x13, x11, eq; \ + add x15, x15, #96 + // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ @@ -173,34 +198,34 @@ S2N_BN_SYMBOL(p256_montjscalarmul): add x0, tab+96*1 add x1, tab - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, tab+96*2 add x1, tab+96*1 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_p256_montjadd add x0, tab+96*3 add x1, tab+96*1 - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, tab+96*4 add x1, tab+96*3 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_p256_montjadd add x0, tab+96*5 add x1, tab+96*2 - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, tab+96*6 add x1, tab+96*5 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_p256_montjadd add x0, tab+96*7 add x1, tab+96*3 - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble // Initialize the accumulator as a table entry for top 4 bits (unrecoded) @@ -221,30 +246,15 @@ S2N_BN_SYMBOL(p256_montjscalarmul): mov x11, xzr add x15, tab - .set i, 1 -.rep 8 - cmp x14, #i - ldp x12, x13, [x15] - csel x0, x12, x0, eq - csel x1, x13, x1, eq - ldp x12, x13, [x15, #16] - csel x2, x12, x2, eq - csel x3, x13, x3, eq - ldp x12, x13, [x15, #32] - csel x4, x12, x4, eq - csel x5, x13, x5, eq - 
ldp x12, x13, [x15, #48] - csel x6, x12, x6, eq - csel x7, x13, x7, eq - ldp x12, x13, [x15, #64] - csel x8, x12, x8, eq - csel x9, x13, x9, eq - ldp x12, x13, [x15, #80] - csel x10, x12, x10, eq - csel x11, x13, x11, eq - add x15, x15, #96 - .set i, (i+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + stp x0, x1, [acc] stp x2, x3, [acc+16] stp x4, x5, [acc+32] @@ -256,24 +266,24 @@ S2N_BN_SYMBOL(p256_montjscalarmul): // Main loop over size-4 bitfields: double 4 times then add signed digit -loop: +p256_montjscalarmul_mainloop: sub j, j, #4 add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_p256_montjdouble lsr x2, j, #6 ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly @@ -299,30 +309,15 @@ loop: mov x10, xzr mov x11, xzr add x15, tab - .set i, 1 -.rep 8 - cmp x14, #i - ldp x12, x13, [x15] - csel x0, x12, x0, eq - csel x1, x13, x1, eq - ldp x12, x13, [x15, #16] - csel x2, x12, x2, eq - csel x3, x13, x3, eq - ldp x12, x13, [x15, #32] - csel x4, x12, x4, eq - csel x5, x13, x5, eq - ldp x12, x13, [x15, #48] - csel x6, x12, x6, eq - csel x7, x13, x7, eq - ldp x12, x13, [x15, #64] - csel x8, x12, x8, eq - csel x9, x13, x9, eq - ldp x12, x13, [x15, #80] - csel x10, x12, x10, eq - csel x11, x13, x11, eq - add x15, x15, #96 - .set i, (i+1) -.endr + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) // Store it to "tabent" with the y coordinate optionally negated // Again, do it carefully to give coordinates < p_256 even in @@ -357,9 +352,9 @@ loop: add x0, acc add x1, acc add x2, tabent - bl local_p256_montjadd + bl p256_montjscalarmul_p256_montjadd - cbnz j, loop + cbnz j, p256_montjscalarmul_mainloop // That's the end of the main loop, and we just need to copy the // result in "acc" to the output. @@ -386,7 +381,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p256_montjadd: +p256_montjscalarmul_p256_montjadd: stp x19, x20, [sp, #-16]! stp x21, x22, [sp, #-16]! stp x23, x24, [sp, #-16]! @@ -3506,7 +3501,7 @@ local_p256_montjadd: ldp x19, x20, [sp], #16 ret -local_p256_montjdouble: +p256_montjscalarmul_p256_montjdouble: sub sp, sp, #0x110 stp x19, x20, [sp, #192] stp x21, x22, [sp, #208] diff --git a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S index 0e453f5bae..4cf5e375bf 100644 --- a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S +++ b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S @@ -56,6 +56,31 @@ #define NSPACE #(31*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. 
+ +#define selectblock(I) \ + cmp x14, #(I); \ + ldp x12, x13, [x15]; \ + csel x0, x12, x0, eq; \ + csel x1, x13, x1, eq; \ + ldp x12, x13, [x15, #16]; \ + csel x2, x12, x2, eq; \ + csel x3, x13, x3, eq; \ + ldp x12, x13, [x15, #32]; \ + csel x4, x12, x4, eq; \ + csel x5, x13, x5, eq; \ + ldp x12, x13, [x15, #48]; \ + csel x6, x12, x6, eq; \ + csel x7, x13, x7, eq; \ + ldp x12, x13, [x15, #64]; \ + csel x8, x12, x8, eq; \ + csel x9, x13, x9, eq; \ + ldp x12, x13, [x15, #80]; \ + csel x10, x12, x10, eq; \ + csel x11, x13, x11, eq; \ + add x15, x15, #96 + // Loading large constants #define movbig(nn,n3,n2,n1,n0) \ @@ -173,34 +198,34 @@ S2N_BN_SYMBOL(p256_montjscalarmul_alt): add x0, tab+96*1 add x1, tab - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, tab+96*2 add x1, tab+96*1 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_alt_p256_montjadd add x0, tab+96*3 add x1, tab+96*1 - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, tab+96*4 add x1, tab+96*3 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_alt_p256_montjadd add x0, tab+96*5 add x1, tab+96*2 - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, tab+96*6 add x1, tab+96*5 add x2, tab - bl local_p256_montjadd + bl p256_montjscalarmul_alt_p256_montjadd add x0, tab+96*7 add x1, tab+96*3 - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble // Initialize the accumulator as a table entry for top 4 bits (unrecoded) @@ -221,30 +246,15 @@ S2N_BN_SYMBOL(p256_montjscalarmul_alt): mov x11, xzr add x15, tab - .set i, 1 -.rep 8 - cmp x14, #i - ldp x12, x13, [x15] - csel x0, x12, x0, eq - csel x1, x13, x1, eq - ldp x12, x13, [x15, #16] - csel x2, x12, x2, eq - csel x3, x13, x3, eq - ldp x12, x13, [x15, #32] - csel x4, x12, x4, eq - csel x5, x13, x5, eq - ldp x12, x13, [x15, #48] - csel x6, x12, x6, eq - csel x7, x13, x7, eq - ldp x12, x13, [x15, #64] - csel x8, x12, x8, eq - csel x9, x13, x9, eq - ldp x12, x13, [x15, #80] - csel x10, x12, x10, eq - csel x11, x13, x11, eq - add x15, x15, #96 - .set i, (i+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + stp x0, x1, [acc] stp x2, x3, [acc+16] stp x4, x5, [acc+32] @@ -256,24 +266,24 @@ S2N_BN_SYMBOL(p256_montjscalarmul_alt): // Main loop over size-4 bitfields: double 4 times then add signed digit -loop: +p256_montjscalarmul_alt_mainloop: sub j, j, #4 add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble add x0, acc add x1, acc - bl local_p256_montjdouble + bl p256_montjscalarmul_alt_p256_montjdouble lsr x2, j, #6 ldr x14, [sp, x2, lsl #3] // Exploits scalarb = sp exactly @@ -299,30 +309,15 @@ loop: mov x10, xzr mov x11, xzr add x15, tab - .set i, 1 -.rep 8 - cmp x14, #i - ldp x12, x13, [x15] - csel x0, x12, x0, eq - csel x1, x13, x1, eq - ldp x12, x13, [x15, #16] - csel x2, x12, x2, eq - csel x3, x13, x3, eq - ldp x12, x13, [x15, #32] - csel x4, x12, x4, eq - csel x5, x13, x5, eq - ldp x12, x13, [x15, #48] - csel x6, x12, x6, eq - csel x7, x13, x7, eq - ldp x12, x13, [x15, #64] - csel x8, x12, x8, eq - csel x9, x13, x9, eq - ldp x12, x13, [x15, #80] - csel x10, x12, x10, eq - csel x11, x13, x11, eq - add x15, x15, #96 - .set i, (i+1) 
-.endr + + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) // Store it to "tabent" with the y coordinate optionally negated // Again, do it carefully to give coordinates < p_256 even in @@ -357,9 +352,9 @@ loop: add x0, acc add x1, acc add x2, tabent - bl local_p256_montjadd + bl p256_montjscalarmul_alt_p256_montjadd - cbnz j, loop + cbnz j, p256_montjscalarmul_alt_mainloop // That's the end of the main loop, and we just need to copy the // result in "acc" to the output. @@ -386,7 +381,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p256_montjadd: +p256_montjscalarmul_alt_p256_montjadd: sub sp, sp, #0xe0 mov x15, x0 mov x16, x1 @@ -2316,7 +2311,7 @@ local_p256_montjadd: add sp, sp, #0xe0 ret -local_p256_montjdouble: +p256_montjscalarmul_alt_p256_montjdouble: sub sp, sp, #0xc0 mov x15, x0 mov x16, x1 diff --git a/third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S b/third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S index 1ae2eabe65..36f5d376e0 100644 --- a/third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S +++ b/third_party/s2n-bignum/x86_att/p256/bignum_montinv_p256.S @@ -1116,9 +1116,9 @@ S2N_BN_SYMBOL(bignum_montinv_p256): movq $10, i movq $1, d - jmp midloop + jmp bignum_montinv_p256_midloop -loop: +bignum_montinv_p256_loop: // Separate out the matrix into sign-magnitude pairs @@ -1447,7 +1447,7 @@ loop: amontred(v) -midloop: +bignum_montinv_p256_midloop: divstep59(d,ff,gg) movq %rsi, d @@ -1455,7 +1455,7 @@ midloop: // Next iteration decq i - jnz loop + jnz bignum_montinv_p256_loop // The 10th and last iteration does not need anything except the // u value and the sign of f; the latter can be obtained from the diff --git a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S b/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S index 1a36a4c784..4569646cd3 100644 --- a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S +++ b/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul.S @@ -57,6 +57,24 @@ #define NSPACE (32*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+96*(I-1)(%rsp), %rax ; \ + cmovzq TAB+96*(I-1)+8(%rsp), %rbx ; \ + cmovzq TAB+96*(I-1)+16(%rsp), %rcx ; \ + cmovzq TAB+96*(I-1)+24(%rsp), %rdx ; \ + cmovzq TAB+96*(I-1)+32(%rsp), %r8 ; \ + cmovzq TAB+96*(I-1)+40(%rsp), %r9 ; \ + cmovzq TAB+96*(I-1)+48(%rsp), %r10 ; \ + cmovzq TAB+96*(I-1)+56(%rsp), %r11 ; \ + cmovzq TAB+96*(I-1)+64(%rsp), %r12 ; \ + cmovzq TAB+96*(I-1)+72(%rsp), %r13 ; \ + cmovzq TAB+96*(I-1)+80(%rsp), %r14 ; \ + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + S2N_BN_SYMBOL(p256_montjscalarmul): // The Windows version literally calls the standard ABI version. 
@@ -207,34 +225,34 @@ p256_montjscalarmul_standard: leaq TAB+96*1(%rsp), %rdi leaq TAB(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq TAB+96*2(%rsp), %rdi leaq TAB+96*1(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_p256_montjadd leaq TAB+96*3(%rsp), %rdi leaq TAB+96*1(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq TAB+96*4(%rsp), %rdi leaq TAB+96*3(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_p256_montjadd leaq TAB+96*5(%rsp), %rdi leaq TAB+96*2(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq TAB+96*6(%rsp), %rdi leaq TAB+96*5(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_p256_montjadd leaq TAB+96*7(%rsp), %rdi leaq TAB+96*3(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble // Set up accumulator as table entry for top 4 bits (constant-time indexing) @@ -254,24 +272,15 @@ p256_montjscalarmul_standard: xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 8 - cmpq $I, %rdi - - cmovzq TAB+96*(I-1)(%rsp), %rax - cmovzq TAB+96*(I-1)+8(%rsp), %rbx - cmovzq TAB+96*(I-1)+16(%rsp), %rcx - cmovzq TAB+96*(I-1)+24(%rsp), %rdx - cmovzq TAB+96*(I-1)+32(%rsp), %r8 - cmovzq TAB+96*(I-1)+40(%rsp), %r9 - cmovzq TAB+96*(I-1)+48(%rsp), %r10 - cmovzq TAB+96*(I-1)+56(%rsp), %r11 - cmovzq TAB+96*(I-1)+64(%rsp), %r12 - cmovzq TAB+96*(I-1)+72(%rsp), %r13 - cmovzq TAB+96*(I-1)+80(%rsp), %r14 - cmovzq TAB+96*(I-1)+88(%rsp), %r15 - .set I, (I+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) + movq %rax, ACC(%rsp) movq %rbx, ACC+8(%rsp) movq %rcx, ACC+16(%rsp) @@ -289,24 +298,24 @@ p256_montjscalarmul_standard: movl $252, %ebp -loop: +p256_montjscalarmul_mainloop: subq $4, %rbp leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_p256_montjdouble movq %rbp, %rax shrq $6, %rax @@ -333,24 +342,14 @@ loop: xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 8 - cmpq $I, %rdi - - cmovzq TAB+96*(I-1)(%rsp), %rax - cmovzq TAB+96*(I-1)+8(%rsp), %rbx - cmovzq TAB+96*(I-1)+16(%rsp), %rcx - cmovzq TAB+96*(I-1)+24(%rsp), %rdx - cmovzq TAB+96*(I-1)+32(%rsp), %r8 - cmovzq TAB+96*(I-1)+40(%rsp), %r9 - cmovzq TAB+96*(I-1)+48(%rsp), %r10 - cmovzq TAB+96*(I-1)+56(%rsp), %r11 - cmovzq TAB+96*(I-1)+64(%rsp), %r12 - cmovzq TAB+96*(I-1)+72(%rsp), %r13 - cmovzq TAB+96*(I-1)+80(%rsp), %r14 - cmovzq TAB+96*(I-1)+88(%rsp), %r15 - .set I, (I+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) // Store it to "tabent" with the y coordinate optionally negated // Again, do it carefully to give coordinates < p_256 even in @@ -397,10 +396,10 @@ loop: leaq TABENT(%rsp), %rdx leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjadd + callq p256_montjscalarmul_p256_montjadd testq %rbp, %rbp - jne loop + jne p256_montjscalarmul_mainloop // That's the end of 
the main loop, and we just need to copy the // result in "acc" to the output. @@ -446,7 +445,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p256_montjadd: +p256_montjscalarmul_p256_montjadd: pushq %rbx pushq %rbp pushq %r12 @@ -2428,7 +2427,7 @@ local_p256_montjadd: popq %rbx ret -local_p256_montjdouble: +p256_montjscalarmul_p256_montjdouble: pushq %rbx pushq %rbp pushq %r12 diff --git a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S b/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S index 51d55eee86..b68d857e76 100644 --- a/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S +++ b/third_party/s2n-bignum/x86_att/p256/p256_montjscalarmul_alt.S @@ -57,6 +57,24 @@ #define NSPACE (32*NUMSIZE) +// Avoid using .rep for the sake of the BoringSSL/AWS-LC delocator, +// which doesn't accept repetitions, assembler macros etc. + +#define selectblock(I) \ + cmpq $I, %rdi ; \ + cmovzq TAB+96*(I-1)(%rsp), %rax ; \ + cmovzq TAB+96*(I-1)+8(%rsp), %rbx ; \ + cmovzq TAB+96*(I-1)+16(%rsp), %rcx ; \ + cmovzq TAB+96*(I-1)+24(%rsp), %rdx ; \ + cmovzq TAB+96*(I-1)+32(%rsp), %r8 ; \ + cmovzq TAB+96*(I-1)+40(%rsp), %r9 ; \ + cmovzq TAB+96*(I-1)+48(%rsp), %r10 ; \ + cmovzq TAB+96*(I-1)+56(%rsp), %r11 ; \ + cmovzq TAB+96*(I-1)+64(%rsp), %r12 ; \ + cmovzq TAB+96*(I-1)+72(%rsp), %r13 ; \ + cmovzq TAB+96*(I-1)+80(%rsp), %r14 ; \ + cmovzq TAB+96*(I-1)+88(%rsp), %r15 + S2N_BN_SYMBOL(p256_montjscalarmul_alt): // The Windows version literally calls the standard ABI version. @@ -207,34 +225,34 @@ p256_montjscalarmul_alt_standard: leaq TAB+96*1(%rsp), %rdi leaq TAB(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble leaq TAB+96*2(%rsp), %rdi leaq TAB+96*1(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_alt_p256_montjadd leaq TAB+96*3(%rsp), %rdi leaq TAB+96*1(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble leaq TAB+96*4(%rsp), %rdi leaq TAB+96*3(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_alt_p256_montjadd leaq TAB+96*5(%rsp), %rdi leaq TAB+96*2(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble leaq TAB+96*6(%rsp), %rdi leaq TAB+96*5(%rsp), %rsi leaq TAB(%rsp), %rdx - callq local_p256_montjadd + callq p256_montjscalarmul_alt_p256_montjadd leaq TAB+96*7(%rsp), %rdi leaq TAB+96*3(%rsp), %rsi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble // Set up accumulator as table entry for top 4 bits (constant-time indexing) @@ -254,24 +272,15 @@ p256_montjscalarmul_alt_standard: xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 8 - cmpq $I, %rdi + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) - cmovzq TAB+96*(I-1)(%rsp), %rax - cmovzq TAB+96*(I-1)+8(%rsp), %rbx - cmovzq TAB+96*(I-1)+16(%rsp), %rcx - cmovzq TAB+96*(I-1)+24(%rsp), %rdx - cmovzq TAB+96*(I-1)+32(%rsp), %r8 - cmovzq TAB+96*(I-1)+40(%rsp), %r9 - cmovzq TAB+96*(I-1)+48(%rsp), %r10 - cmovzq TAB+96*(I-1)+56(%rsp), %r11 - cmovzq TAB+96*(I-1)+64(%rsp), %r12 - cmovzq TAB+96*(I-1)+72(%rsp), %r13 - cmovzq TAB+96*(I-1)+80(%rsp), %r14 - cmovzq TAB+96*(I-1)+88(%rsp), %r15 - .set I, (I+1) -.endr movq %rax, ACC(%rsp) movq %rbx, ACC+8(%rsp) movq %rcx, ACC+16(%rsp) @@ -289,24 +298,24 @@ p256_montjscalarmul_alt_standard: movl $252, %ebp -loop: +p256_montjscalarmul_alt_mainloop: subq $4, %rbp 
leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjdouble + callq p256_montjscalarmul_alt_p256_montjdouble movq %rbp, %rax shrq $6, %rax @@ -333,24 +342,14 @@ loop: xorl %r14d, %r14d xorl %r15d, %r15d - .set I, 1 -.rep 8 - cmpq $I, %rdi - - cmovzq TAB+96*(I-1)(%rsp), %rax - cmovzq TAB+96*(I-1)+8(%rsp), %rbx - cmovzq TAB+96*(I-1)+16(%rsp), %rcx - cmovzq TAB+96*(I-1)+24(%rsp), %rdx - cmovzq TAB+96*(I-1)+32(%rsp), %r8 - cmovzq TAB+96*(I-1)+40(%rsp), %r9 - cmovzq TAB+96*(I-1)+48(%rsp), %r10 - cmovzq TAB+96*(I-1)+56(%rsp), %r11 - cmovzq TAB+96*(I-1)+64(%rsp), %r12 - cmovzq TAB+96*(I-1)+72(%rsp), %r13 - cmovzq TAB+96*(I-1)+80(%rsp), %r14 - cmovzq TAB+96*(I-1)+88(%rsp), %r15 - .set I, (I+1) -.endr + selectblock(1) + selectblock(2) + selectblock(3) + selectblock(4) + selectblock(5) + selectblock(6) + selectblock(7) + selectblock(8) // Store it to "tabent" with the y coordinate optionally negated // Again, do it carefully to give coordinates < p_256 even in @@ -397,10 +396,10 @@ loop: leaq TABENT(%rsp), %rdx leaq ACC(%rsp), %rsi leaq ACC(%rsp), %rdi - callq local_p256_montjadd + callq p256_montjscalarmul_alt_p256_montjadd testq %rbp, %rbp - jne loop + jne p256_montjscalarmul_alt_mainloop // That's the end of the main loop, and we just need to copy the // result in "acc" to the output. @@ -446,7 +445,7 @@ loop: // Local copies of subroutines, complete clones at the moment -local_p256_montjadd: +p256_montjscalarmul_alt_p256_montjadd: pushq %rbx pushq %rbp pushq %r12 @@ -3218,7 +3217,7 @@ local_p256_montjadd: popq %rbx ret -local_p256_montjdouble: +p256_montjscalarmul_alt_p256_montjdouble: pushq %rbx pushq %r12 pushq %r13 From 5523a26c6831f9c96592316fe657980d34571300 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Fri, 23 Aug 2024 13:10:25 -0700 Subject: [PATCH 4/4] Make two more tiny tweaks to satisfy the delocator Changing #(I) to #(1*I) in the new macro blocks. --- third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S | 2 +- third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S index 4c7f21a6c7..246421ff37 100644 --- a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S +++ b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul.S @@ -60,7 +60,7 @@ // which doesn't accept repetitions, assembler macros etc. #define selectblock(I) \ - cmp x14, #(I); \ + cmp x14, #(1*I); \ ldp x12, x13, [x15]; \ csel x0, x12, x0, eq; \ csel x1, x13, x1, eq; \ diff --git a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S index 4cf5e375bf..8ac5806a72 100644 --- a/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S +++ b/third_party/s2n-bignum/arm/p256/p256_montjscalarmul_alt.S @@ -60,7 +60,7 @@ // which doesn't accept repetitions, assembler macros etc. #define selectblock(I) \ - cmp x14, #(I); \ + cmp x14, #(1*I); \ ldp x12, x13, [x15]; \ csel x0, x12, x0, eq; \ csel x1, x13, x1, eq; \