Files
mars-matrixssl/crypto/math/pstm_sqr_comba.c
Janne Johansson d0a51a7e43 MatrixSSL 4.0.0
2018-09-13 12:17:26 +03:00

1238 lines
51 KiB
C

/**
* @file pstm_sqr_comba.c
* @version $Format:%h%d$
*
* Multiprecision Squaring with Comba technique.
*/
/*
* Copyright (c) 2013-2018 INSIDE Secure Corporation
* Copyright (c) PeerSec Networks, 2002-2011
* All Rights Reserved
*
* The latest version of this code is available at http://www.matrixssl.org
*
* This software is open source; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This General Public License does NOT permit incorporating this software
* into proprietary programs. If you are unable to comply with the GPL, a
* commercial license for this software may be purchased from INSIDE at
* http://www.insidesecure.com/
*
* This program is distributed WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
* See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
* http://www.gnu.org/copyleft/gpl.html
*/
/******************************************************************************/
#include "../cryptoImpl.h"
#if defined(USE_MATRIX_RSA) || defined(USE_MATRIX_ECC) || defined(USE_MATRIX_DH)
/******************************************************************************/
# if defined(PSTM_X86)
/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
# if !defined(__GNUC__) || !defined(__i386__)
# error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
# endif
/* #pragma message ("Using 32 bit x86 Assembly Optimizations") */
/*
x86-32 GCC inline-assembly versions of the Comba squaring macros.
Every macro works on locals named c0, c1, c2 (three-word column accumulator,
c0 lowest) and sc0, sc1, sc2 (secondary accumulator) that the function using
the macros has to declare. Each macro body already ends in ';'.
*/
/* Expands to nothing in this variant */
# define COMBA_START
/* Zero the three-word accumulator */
# define CLEAR_CARRY \
c0 = c1 = c2 = 0;
/* Store the lowest accumulator word (c0) into x */
# define COMBA_STORE(x) \
x = c0;
/* Store the middle accumulator word (c1) into x */
# define COMBA_STORE2(x) \
x = c1;
/* Advance to the next column: shift the accumulator down by one word */
# define CARRY_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* Expands to nothing in this variant */
# define COMBA_FINI
/*
c2:c1:c0 += i * i. Only i is read (as a memory operand); j is not referenced.
mull leaves the 64-bit product in edx:eax, hence both are clobbered.
*/
# define SQRADD(i, j) \
asm ( \
"movl %6,%%eax \n\t" \
"mull %%eax \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "0" (c0), "1" (c1), "2" (c2), "m" (i) : "%eax", "%edx", "cc");
/*
c2:c1:c0 += 2 * (i * j). The product is computed once and the
add/adc/adc sequence is issued twice.
*/
# define SQRADD2(i, j) \
asm ( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "0" (c0), "1" (c1), "2" (c2), "m" (i), "m" (j) : "%eax", "%edx", "cc");
/*
Start a secondary sum: sc1:sc0 = i * j and sc2 = 0 (xorl of sc2 with itself).
NOTE(review): the "g" constraint also admits an immediate operand, which
"mull %7" cannot encode; the call sites visible in this file pass array
elements only -- confirm before using with constants.
*/
# define SQRADDSC(i, j) \
asm ( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"movl %%eax,%0 \n\t" \
"movl %%edx,%1 \n\t" \
"xorl %2,%2 \n\t" \
: "=r" (sc0), "=r" (sc1), "=r" (sc2) : "0" (sc0), "1" (sc1), "2" (sc2), "g" (i), "g" (j) : "%eax", "%edx", "cc");
/* Accumulate into the secondary sum: sc2:sc1:sc0 += i * j */
# define SQRADDAC(i, j) \
asm ( \
"movl %6,%%eax \n\t" \
"mull %7 \n\t" \
"addl %%eax,%0 \n\t" \
"adcl %%edx,%1 \n\t" \
"adcl $0,%2 \n\t" \
: "=r" (sc0), "=r" (sc1), "=r" (sc2) : "0" (sc0), "1" (sc1), "2" (sc2), "g" (i), "g" (j) : "%eax", "%edx", "cc");
/*
Fold the secondary sum into the accumulator twice:
c2:c1:c0 += 2 * sc2:sc1:sc0 (the add/adc/adc sequence is issued twice).
*/
# define SQRADDDB \
asm ( \
"addl %6,%0 \n\t" \
"adcl %7,%1 \n\t" \
"adcl %8,%2 \n\t" \
"addl %6,%0 \n\t" \
"adcl %7,%1 \n\t" \
"adcl %8,%2 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "0" (c0), "1" (c1), "2" (c2), "r" (sc0), "r" (sc1), "r" (sc2) : "cc");
/******************************************************************************/
# elif defined(PSTM_X86_64)
/* x86-64 optimized */
# if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
# error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
# endif
/* #pragma message ("Using 64 bit x86_64 Assembly Optimizations") */
/*
x86-64 GCC inline-assembly versions of the Comba squaring macros.
Every macro works on locals named c0, c1, c2 (three-word column accumulator,
c0 lowest) and sc0, sc1, sc2 (secondary accumulator) that the function using
the macros has to declare. Each macro body already ends in ';'.
*/
/* Expands to nothing in this variant */
# define COMBA_START
/* Zero the three-word accumulator */
# define CLEAR_CARRY \
c0 = c1 = c2 = 0;
/* Store the lowest accumulator word (c0) into x */
# define COMBA_STORE(x) \
x = c0;
/* Store the middle accumulator word (c1) into x */
# define COMBA_STORE2(x) \
x = c1;
/* Advance to the next column: shift the accumulator down by one word */
# define CARRY_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* Expands to nothing in this variant */
# define COMBA_FINI
/*
c2:c1:c0 += i * i. Only i is read; j is not referenced.
mulq leaves the 128-bit product in rdx:rax, hence both are clobbered.
NOTE(review): the "g" constraints used in this section also admit an
immediate operand, which "mulq %7" cannot encode; the call sites visible in
this file pass array elements only -- confirm before using with constants.
*/
# define SQRADD(i, j) \
asm ( \
"movq %6,%%rax \n\t" \
"mulq %%rax \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "0" (c0), "1" (c1), "2" (c2), "g" (i) : "%rax", "%rdx", "cc");
/*
c2:c1:c0 += 2 * (i * j). The product is computed once and the
add/adc/adc sequence is issued twice.
*/
# define SQRADD2(i, j) \
asm ( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "0" (c0), "1" (c1), "2" (c2), "g" (i), "g" (j) : "%rax", "%rdx", "cc");
/* Start a secondary sum: sc1:sc0 = i * j and sc2 = 0 (xorq of sc2 with itself) */
# define SQRADDSC(i, j) \
asm ( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"movq %%rax,%0 \n\t" \
"movq %%rdx,%1 \n\t" \
"xorq %2,%2 \n\t" \
: "=r" (sc0), "=r" (sc1), "=r" (sc2) : "0" (sc0), "1" (sc1), "2" (sc2), "g" (i), "g" (j) : "%rax", "%rdx", "cc");
/* Accumulate into the secondary sum: sc2:sc1:sc0 += i * j */
# define SQRADDAC(i, j) \
asm ( \
"movq %6,%%rax \n\t" \
"mulq %7 \n\t" \
"addq %%rax,%0 \n\t" \
"adcq %%rdx,%1 \n\t" \
"adcq $0,%2 \n\t" \
: "=r" (sc0), "=r" (sc1), "=r" (sc2) : "0" (sc0), "1" (sc1), "2" (sc2), "g" (i), "g" (j) : "%rax", "%rdx", "cc");
/*
Fold the secondary sum into the accumulator twice:
c2:c1:c0 += 2 * sc2:sc1:sc0 (the add/adc/adc sequence is issued twice).
*/
# define SQRADDDB \
asm ( \
"addq %6,%0 \n\t" \
"adcq %7,%1 \n\t" \
"adcq %8,%2 \n\t" \
"addq %6,%0 \n\t" \
"adcq %7,%1 \n\t" \
"adcq %8,%2 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "0" (c0), "1" (c1), "2" (c2), "r" (sc0), "r" (sc1), "r" (sc2) : "cc");
/******************************************************************************/
# elif defined(PSTM_ARM) && !defined __ARMCC5
/* ARM code */
/* #pragma message ("Using 32 bit ARM Assembly Optimizations") */
# if defined __arm__ && defined __thumb__ && !defined __thumb2__
# error "Platform not supported: Thumb1 mode on ARMv4-v6."
# error "Please, turn off thumb mode or enable thumb2."
# endif
# define COMBA_START
# define CLEAR_CARRY \
c0 = c1 = c2 = 0;
# define COMBA_STORE(x) \
x = c0;
# define COMBA_STORE2(x) \
x = c1;
# define CARRY_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
# define COMBA_FINI
/* multiplies point i and j, updates carry "c1" and digit c2 */
# define SQRADD(i, j) \
asm ( \
" UMULL r0,r1,%6,%6 \n\t" \
" ADDS %0,%0,r0 \n\t" \
" ADCS %1,%1,r1 \n\t" \
" ADC %2,%2,#0 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "0" (c0), "1" (c1), "2" (c2), "r" (i) : "r0", "r1", "cc");
/* for squaring some of the terms are doubled... */
# define SQRADD2(i, j) \
asm ( \
" UMULL r0,r1,%6,%7 \n\t" \
" ADDS %0,%0,r0 \n\t" \
" ADCS %1,%1,r1 \n\t" \
" ADC %2,%2,#0 \n\t" \
" ADDS %0,%0,r0 \n\t" \
" ADCS %1,%1,r1 \n\t" \
" ADC %2,%2,#0 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "0" (c0), "1" (c1), "2" (c2), "r" (i), "r" (j) : "r0", "r1", "cc");
# define SQRADDSC(i, j) \
asm ( \
" UMULL %0,%1,%6,%7 \n\t" \
" SUB %2,%2,%2 \n\t" \
: "=r" (sc0), "=r" (sc1), "=r" (sc2) : "0" (sc0), "1" (sc1), "2" (sc2), "r" (i), "r" (j) : "cc");
# define SQRADDAC(i, j) \
asm ( \
" UMULL r0,r1,%6,%7 \n\t" \
" ADDS %0,%0,r0 \n\t" \
" ADCS %1,%1,r1 \n\t" \
" ADC %2,%2,#0 \n\t" \
: "=r" (sc0), "=r" (sc1), "=r" (sc2) : "0" (sc0), "1" (sc1), "2" (sc2), "r" (i), "r" (j) : "r0", "r1", "cc");
# define SQRADDDB \
asm ( \
" ADDS %0,%0,%3 \n\t" \
" ADCS %1,%1,%4 \n\t" \
" ADC %2,%2,%5 \n\t" \
" ADDS %0,%0,%3 \n\t" \
" ADCS %1,%1,%4 \n\t" \
" ADC %2,%2,%5 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "r" (sc0), "r" (sc1), "r" (sc2), "0" (c0), "1" (c1), "2" (c2) : "cc");
/******************************************************************************/
# elif defined(PSTM_ARM) && defined __ARMCC5
/* ARM Compiler 5 support: */
#pragma arm /* ARM code. Switch code generation to the ARM instruction set so
that the inline assembler is available. On platforms with
only Thumb code support, it is necessary to use NO_PSTM_ARM
as ARM Compiler 5 lacks thumb inline assembly support. */
# define COMBA_START
# define CLEAR_CARRY \
c0 = c1 = c2 = 0;
# define COMBA_STORE(x) \
x = c0;
# define COMBA_STORE2(x) \
x = c1;
# define CARRY_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
# define COMBA_FINI
/* multiplies point i and j, updates carry "c1" and digit c2 */
# define SQRADD(i, j) \
do { \
unsigned int reg0; \
unsigned int reg1; \
unsigned int i_in = i; \
/* no j_in; j is the same than i. */ \
__asm { \
UMULL reg0, reg1, i_in, i_in; \
ADDS c0, c0, reg0; \
ADCS c1, c1, reg1; \
ADC c2, c2, 0; \
} \
} while(0);
/* for squaring some of the terms are doubled... */
# define SQRADD2(i, j) \
do { \
unsigned int reg0; \
unsigned int reg1; \
unsigned int i_in = i; \
unsigned int j_in = j; \
__asm { \
UMULL reg0, reg1, i_in, j_in; \
ADDS c0, c0, reg0; \
ADCS c1, c1, reg1; \
ADC c2, c2, 0; \
ADDS c0, c0, reg0; \
ADCS c1, c1, reg1; \
ADC c2, c2, 0; \
} \
} while(0);
# define SQRADDSC(i, j) \
do { \
unsigned int i_in = i; \
unsigned int j_in = j; \
__asm { \
UMULL sc0, sc1, i_in, j_in; \
SUB sc2, sc2, sc2; \
} \
} while(0);
# define SQRADDAC(i, j) \
do { \
unsigned int reg0; \
unsigned int reg1; \
unsigned int i_in = i; \
unsigned int j_in = j; \
__asm { \
UMULL reg0, reg1, i_in, j_in; \
ADDS sc0, sc0, reg0; \
ADCS sc1, sc1, reg1; \
ADC sc2, sc2, 0; \
} \
} while(0);
# define SQRADDDB \
do { \
__asm { \
ADDS c0, c0, sc0; \
ADCS c1, c1, sc1; \
ADC c2, c2, sc2; \
ADDS c0, c0, sc0; \
ADCS c1, c1, sc1; \
ADC c2, c2, sc2; \
} \
} while(0);
/******************************************************************************/
# elif defined(PSTM_MIPS)
/* MIPS32 */
/* #pragma message ("Using 32 bit MIPS Assembly Optimizations") */
# define COMBA_START
# define CLEAR_CARRY \
c0 = c1 = c2 = 0;
# define COMBA_STORE(x) \
x = c0;
# define COMBA_STORE2(x) \
x = c1;
# define CARRY_FORWARD \
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
# define COMBA_FINI
/* multiplies point i and j, updates carry "c1" and digit c2 */
# define SQRADD(i, j) \
asm ( \
" multu %6,%6 \n\t" \
" mflo $12 \n\t" \
" mfhi $13 \n\t" \
" addu %0,%0,$12 \n\t" \
" sltu $12,%0,$12 \n\t" \
" addu %1,%1,$13 \n\t" \
" sltu $13,%1,$13 \n\t" \
" addu %1,%1,$12 \n\t" \
" sltu $12,%1,$12 \n\t" \
" addu %2,%2,$13 \n\t" \
" addu %2,%2,$12 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "0" (c0), "1" (c1), "2" (c2), "r" (i) : "$12", "$13");
/* for squaring some of the terms are doubled... */
# define SQRADD2(i, j) \
asm ( \
" multu %6,%7 \n\t" \
" mflo $12 \n\t" \
" mfhi $13 \n\t" \
\
" addu %0,%0,$12 \n\t" \
" sltu $14,%0,$12 \n\t" \
" addu %1,%1,$13 \n\t" \
" sltu $15,%1,$13 \n\t" \
" addu %1,%1,$14 \n\t" \
" sltu $14,%1,$14 \n\t" \
" addu %2,%2,$15 \n\t" \
" addu %2,%2,$14 \n\t" \
\
" addu %0,%0,$12 \n\t" \
" sltu $14,%0,$12 \n\t" \
" addu %1,%1,$13 \n\t" \
" sltu $15,%1,$13 \n\t" \
" addu %1,%1,$14 \n\t" \
" sltu $14,%1,$14 \n\t" \
" addu %2,%2,$15 \n\t" \
" addu %2,%2,$14 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "0" (c0), "1" (c1), "2" (c2), "r" (i), "r" (j) : "$12", "$13", "$14", "$15");
# define SQRADDSC(i, j) \
asm ( \
" multu %6,%7 \n\t" \
" mflo %0 \n\t" \
" mfhi %1 \n\t" \
" xor %2,%2,%2 \n\t" \
: "=r" (sc0), "=r" (sc1), "=r" (sc2) : "0" (sc0), "1" (sc1), "2" (sc2), "r" (i), "r" (j) : "cc");
# define SQRADDAC(i, j) \
asm ( \
" multu %6,%7 \n\t" \
" mflo $12 \n\t" \
" mfhi $13 \n\t" \
" addu %0,%0,$12 \n\t" \
" sltu $12,%0,$12 \n\t" \
" addu %1,%1,$13 \n\t" \
" sltu $13,%1,$13 \n\t" \
" addu %1,%1,$12 \n\t" \
" sltu $12,%1,$12 \n\t" \
" addu %2,%2,$13 \n\t" \
" addu %2,%2,$12 \n\t" \
: "=r" (sc0), "=r" (sc1), "=r" (sc2) : "0" (sc0), "1" (sc1), "2" (sc2), "r" (i), "r" (j) : "$12", "$13", "$14");
# define SQRADDDB \
asm ( \
" addu %0,%0,%3 \n\t" \
" sltu $10,%0,%3 \n\t" \
" addu %1,%1,$10 \n\t" \
" sltu $10,%1,$10 \n\t" \
" addu %1,%1,%4 \n\t" \
" sltu $11,%1,%4 \n\t" \
" addu %2,%2,$10 \n\t" \
" addu %2,%2,$11 \n\t" \
" addu %2,%2,%5 \n\t" \
\
" addu %0,%0,%3 \n\t" \
" sltu $10,%0,%3 \n\t" \
" addu %1,%1,$10 \n\t" \
" sltu $10,%1,$10 \n\t" \
" addu %1,%1,%4 \n\t" \
" sltu $11,%1,%4 \n\t" \
" addu %2,%2,$10 \n\t" \
" addu %2,%2,$11 \n\t" \
" addu %2,%2,%5 \n\t" \
: "=r" (c0), "=r" (c1), "=r" (c2) : "r" (sc0), "r" (sc1), "r" (sc2), "0" (c0), "1" (c1), "2" (c2) : "$10", "$11");
# else
/******************************************************************************/
# define PSTM_ISO
/*
    ISO C portable code.

    Every macro works on locals the enclosing function must declare:
    pstm_digit c0, c1, c2   three-word column accumulator (c0 lowest)
    pstm_digit sc0, sc1, sc2 secondary accumulator (SQRADDSC/AC/DB only)
    pstm_word  tt           scratch for SQRADD2 (declared under PSTM_ISO)
    pstm_word is used to hold a full digit-by-digit product; the high half is
    extracted with >> DIGIT_BIT.

    Macro parameters are parenthesized so that an argument expression is
    evaluated as a whole before it is widened or assigned. The trailing ';'
    inside each body is kept so existing call sites keep compiling unchanged.
*/
/* Expands to nothing in this variant */
# define COMBA_START
/* Zero the three-word accumulator */
# define CLEAR_CARRY \
    c0 = c1 = c2 = 0;
/* Store the lowest accumulator word (c0) into x */
# define COMBA_STORE(x) \
    (x) = c0;
/* Store the middle accumulator word (c1) into x */
# define COMBA_STORE2(x) \
    (x) = c1;
/* Advance to the next column: shift the accumulator down by one word */
# define CARRY_FORWARD \
    do { c0 = c1; c1 = c2; c2 = 0; } while (0);
/* Expands to nothing in this variant */
# define COMBA_FINI
/* c2:c1:c0 += i * j (callers pass the same digit for i and j) */
# define SQRADD(i, j) \
    do { pstm_word t; \
         t = c0 + ((pstm_word) (i)) * ((pstm_word) (j)); c0 = (pstm_digit) t; \
         t = c1 + (t >> DIGIT_BIT); \
         c1 = (pstm_digit) t; c2 += (pstm_digit) (t >> DIGIT_BIT); \
    } while (0);
/*
    For squaring, the cross terms are doubled: c2:c1:c0 += 2 * (i * j).
    The product is computed once (t) and added twice through tt.
*/
# define SQRADD2(i, j) \
    do { pstm_word t; \
         t = ((pstm_word) (i)) * ((pstm_word) (j)); \
         tt = (pstm_word) c0 + t; c0 = (pstm_digit) tt; \
         tt = (pstm_word) c1 + (tt >> DIGIT_BIT); \
         c1 = (pstm_digit) tt; c2 += (pstm_digit) (tt >> DIGIT_BIT); \
         tt = (pstm_word) c0 + t; c0 = (pstm_digit) tt; \
         tt = (pstm_word) c1 + (tt >> DIGIT_BIT); \
         c1 = (pstm_digit) tt; c2 += (pstm_digit) (tt >> DIGIT_BIT); \
    } while (0);
/* Start a secondary sum: sc1:sc0 = i * j and sc2 = 0 */
# define SQRADDSC(i, j) \
    do { pstm_word t; \
         t = ((pstm_word) (i)) * ((pstm_word) (j)); \
         sc0 = (pstm_digit) t; sc1 = (pstm_digit) (t >> DIGIT_BIT); sc2 = 0; \
    } while (0);
/* Accumulate into the secondary sum: sc2:sc1:sc0 += i * j */
# define SQRADDAC(i, j) \
    do { pstm_word t; \
         t = ((pstm_word) sc0) + ((pstm_word) (i)) * ((pstm_word) (j)); \
         sc0 = (pstm_digit) t; \
         t = ((pstm_word) sc1) + (t >> DIGIT_BIT); sc1 = (pstm_digit) t; \
         sc2 += (pstm_digit) (t >> DIGIT_BIT); \
    } while (0);
/* Fold the secondary sum into the accumulator twice: c2:c1:c0 += 2 * sc2:sc1:sc0 */
# define SQRADDDB \
    do { pstm_word t; \
         t = ((pstm_word) sc0) + ((pstm_word) sc0) + ((pstm_word) c0); \
         c0 = (pstm_digit) t; \
         t = ((pstm_word) sc1) + ((pstm_word) sc1) + c1 + (t >> DIGIT_BIT); \
         c1 = (pstm_digit) t; \
         c2 = c2 + sc2 + sc2 + (pstm_digit) (t >> DIGIT_BIT); \
    } while (0);
# endif /* ISO_C */
/******************************************************************************/
/*
    Non-unrolled comba squarer: B = A * A.

    pool    Memory pool handed to psMalloc()/psFree() for the scratch buffer.
    A       Operand to square; A->used digits are read from A->dp.
    B       Result. Grown with pstm_grow() when it has room for fewer than
            2 * A->used digits. Its sign is set to PSTM_ZPOS, as a square is
            never negative (matches the unrolled squarers in this file).
    paD     Optional caller-supplied scratch buffer, may be NULL.
    paDlen  Size of paD in bytes. When paD is NULL or smaller than
            2 * A->used digits, a scratch buffer is allocated from pool and
            freed before returning.

    The product is accumulated column by column into the scratch buffer and
    copied to B only after every column has been computed.

    Returns PS_SUCCESS, or PS_MEM_FAIL when B cannot be grown or the scratch
    buffer cannot be allocated.
*/
static int32_t pstm_sqr_comba_gen(psPool_t *pool, const pstm_int *A,
    pstm_int *B, pstm_digit *paD, psSize_t paDlen)
{
    int16 paDfail, pa;
    int32 ix, iz;
    pstm_digit c0, c1, c2, *dst;

# ifdef PSTM_ISO
    pstm_word tt; /* scratch used by the ISO C SQRADD2 macro */
# endif

    paDfail = 0;
    /* number of output digits to produce */
    pa = A->used + A->used;

    COMBA_START;
    CLEAR_CARRY;

    /* If B is not large enough grow it and continue */
    if (B->alloc < pa)
    {
        if (pstm_grow(B, pa) != PSTM_OKAY)
        {
            return PS_MEM_FAIL;
        }
    }

    /* Pick the scratch buffer: the caller's paD when it is big enough */
    if (paD != NULL)
    {
        if (paDlen < (sizeof(pstm_digit) * pa))
        {
            paDfail = 1; /* have a paD, but it's not big enough */
            if ((dst = psMalloc(pool, sizeof(pstm_digit) * pa)) == NULL)
            {
                return PS_MEM_FAIL;
            }
            Memset(dst, 0x0, sizeof(pstm_digit) * pa);
        }
        else
        {
            dst = paD;
            Memset(dst, 0x0, paDlen);
        }
    }
    else
    {
        if ((dst = psMalloc(pool, sizeof(pstm_digit) * pa)) == NULL)
        {
            return PS_MEM_FAIL;
        }
        Memset(dst, 0x0, sizeof(pstm_digit) * pa);
    }

    for (ix = 0; ix < pa; ix++)
    {
        int32 tx, ty, iy;
        pstm_digit *tmpy, *tmpx;

        /* get offsets into the two bignums */
        ty = min(A->used - 1, ix);
        tx = ix - ty;

        /* setup temp aliases */
        tmpx = A->dp + tx;
        tmpy = A->dp + ty;

        /*
            This is the number of times the loop will iterate
            while (tx++ < A->used && ty-- >= 0) { ... }
         */
        iy = min(A->used - tx, ty + 1);

        /*
            now for squaring, tx can never equal ty. We halve the distance
            since they approach at a rate of 2x and we have to round because
            odd cases need to be executed
         */
        iy = min(iy, (ty - tx + 1) >> 1);

        /* forward carries */
        CARRY_FORWARD;

        /* execute loop: doubled cross products of this column */
        for (iz = 0; iz < iy; iz++)
        {
            SQRADD2(*tmpx++, *tmpy--);
        }

        /* even columns have the square term in them */
        if ((ix & 1) == 0)
        {
            SQRADD(A->dp[ix >> 1], A->dp[ix >> 1]);
        }

        /* store it */
        COMBA_STORE(dst[ix]);
    }
    COMBA_FINI;

    /* setup dest */
    iz = B->used;
    B->used = pa;
    /* a square is never negative; do not inherit whatever sign B had */
    B->sign = PSTM_ZPOS;
    {
        pstm_digit *tmpc;

        tmpc = B->dp;
        for (ix = 0; ix < pa; ix++)
        {
            *tmpc++ = dst[ix];
        }
        /* clear unused digits (that existed in the old copy of B) */
        for (; ix < iz; ix++)
        {
            *tmpc++ = 0;
        }
    }
    pstm_clamp(B);

    /* free the scratch buffer only when it was allocated here */
    if ((paD == NULL) || paDfail == 1)
    {
        psFree(dst, pool);
    }
    return PS_SUCCESS;
}
/******************************************************************************/
/*
Unrolled Comba loop for 1024 bit keys
*/
# ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
/*
    Fully unrolled Comba squaring of a 16-digit operand: B = A * A.

    Digits a[0]..a[15] of A->dp are read unconditionally and 32 result digits
    are produced, so the caller is presumably expected to pass an operand with
    exactly 16 used digits -- TODO confirm against the dispatcher.

    For output column k the products a[i] * a[j] with i + j == k are summed.
    Columns with at most two cross products add each one doubled with SQRADD2;
    wider columns sum the cross products once in sc0..sc2 (SQRADDSC starts the
    sum, SQRADDAC extends it) and fold the doubled sum in with SQRADDDB. Even
    columns additionally get the square term via SQRADD.

    The result is built in the local array b and copied to B at the end, after
    B has been grown to at least 32 digits.

    Returns PS_MEM_FAIL when pstm_grow() fails, otherwise PSTM_OKAY.
*/
static int32_t pstm_sqr_comba16(const pstm_int *A, pstm_int *B)
{
pstm_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;
# ifdef PSTM_ISO
pstm_word tt;
# endif
/* make sure B can hold all 32 result digits */
if (B->alloc < 32)
{
if (pstm_grow(B, 32) != PSTM_OKAY)
{
return PS_MEM_FAIL;
}
}
a = A->dp;
sc0 = sc1 = sc2 = 0;
COMBA_START;
/* clear carries */
CLEAR_CARRY;
/* output 0 */
SQRADD(a[0], a[0]);
COMBA_STORE(b[0]);
/* output 1 */
CARRY_FORWARD;
SQRADD2(a[0], a[1]);
COMBA_STORE(b[1]);
/* output 2 */
CARRY_FORWARD;
SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
COMBA_STORE(b[2]);
/* output 3 */
CARRY_FORWARD;
SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
COMBA_STORE(b[3]);
/* output 4 */
CARRY_FORWARD;
SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
COMBA_STORE(b[4]);
/* output 5 */
CARRY_FORWARD;
SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
COMBA_STORE(b[5]);
/* output 6 */
CARRY_FORWARD;
SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
COMBA_STORE(b[6]);
/* output 7 */
CARRY_FORWARD;
SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
COMBA_STORE(b[7]);
/* output 8 */
CARRY_FORWARD;
SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
COMBA_STORE(b[8]);
/* output 9 */
CARRY_FORWARD;
SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
COMBA_STORE(b[9]);
/* output 10 */
CARRY_FORWARD;
SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
COMBA_STORE(b[10]);
/* output 11 */
CARRY_FORWARD;
SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
COMBA_STORE(b[11]);
/* output 12 */
CARRY_FORWARD;
SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
COMBA_STORE(b[12]);
/* output 13 */
CARRY_FORWARD;
SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
COMBA_STORE(b[13]);
/* output 14 */
CARRY_FORWARD;
SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
COMBA_STORE(b[14]);
/* output 15 */
CARRY_FORWARD;
SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
COMBA_STORE(b[15]);
/* output 16 */
CARRY_FORWARD;
SQRADDSC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
COMBA_STORE(b[16]);
/* output 17 */
CARRY_FORWARD;
SQRADDSC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
COMBA_STORE(b[17]);
/* output 18 */
CARRY_FORWARD;
SQRADDSC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
COMBA_STORE(b[18]);
/* output 19 */
CARRY_FORWARD;
SQRADDSC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
COMBA_STORE(b[19]);
/* output 20 */
CARRY_FORWARD;
SQRADDSC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
COMBA_STORE(b[20]);
/* output 21 */
CARRY_FORWARD;
SQRADDSC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
COMBA_STORE(b[21]);
/* output 22 */
CARRY_FORWARD;
SQRADDSC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
COMBA_STORE(b[22]);
/* output 23 */
CARRY_FORWARD;
SQRADDSC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
COMBA_STORE(b[23]);
/* output 24 */
CARRY_FORWARD;
SQRADDSC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
COMBA_STORE(b[24]);
/* output 25 */
CARRY_FORWARD;
SQRADDSC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
COMBA_STORE(b[25]);
/* output 26 */
CARRY_FORWARD;
SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]);
COMBA_STORE(b[26]);
/* output 27 */
CARRY_FORWARD;
SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]);
COMBA_STORE(b[27]);
/* output 28 */
CARRY_FORWARD;
SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]);
COMBA_STORE(b[28]);
/* output 29 */
CARRY_FORWARD;
SQRADD2(a[14], a[15]);
COMBA_STORE(b[29]);
/* output 30 */
CARRY_FORWARD;
SQRADD(a[15], a[15]);
COMBA_STORE(b[30]);
/* the carry word left in c1 after the last column is the top digit */
COMBA_STORE2(b[31]);
COMBA_FINI;
/* publish the result: 32 digits, non-negative, then strip leading zeros */
B->used = 32;
B->sign = PSTM_ZPOS;
Memcpy(B->dp, b, 32 * sizeof(pstm_digit));
pstm_clamp(B);
return PSTM_OKAY;
}
# endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
# ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
static int32_t pstm_sqr_comba32(const pstm_int *A, pstm_int *B)
{
pstm_digit *a, b[64], c0, c1, c2, sc0, sc1, sc2;
# ifdef PSTM_ISO
pstm_word tt;
# endif
if (B->alloc < 64)
{
if (pstm_grow(B, 64) != PSTM_OKAY)
{
return PS_MEM_FAIL;
}
}
sc0 = sc1 = sc2 = 0;
a = A->dp;
COMBA_START;
/* clear carries */
CLEAR_CARRY;
/* output 0 */
SQRADD(a[0], a[0]);
COMBA_STORE(b[0]);
/* output 1 */
CARRY_FORWARD;
SQRADD2(a[0], a[1]);
COMBA_STORE(b[1]);
/* output 2 */
CARRY_FORWARD;
SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
COMBA_STORE(b[2]);
/* output 3 */
CARRY_FORWARD;
SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
COMBA_STORE(b[3]);
/* output 4 */
CARRY_FORWARD;
SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
COMBA_STORE(b[4]);
/* output 5 */
CARRY_FORWARD;
SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
COMBA_STORE(b[5]);
/* output 6 */
CARRY_FORWARD;
SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
COMBA_STORE(b[6]);
/* output 7 */
CARRY_FORWARD;
SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
COMBA_STORE(b[7]);
/* output 8 */
CARRY_FORWARD;
SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
COMBA_STORE(b[8]);
/* output 9 */
CARRY_FORWARD;
SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
COMBA_STORE(b[9]);
/* output 10 */
CARRY_FORWARD;
SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
COMBA_STORE(b[10]);
/* output 11 */
CARRY_FORWARD;
SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
COMBA_STORE(b[11]);
/* output 12 */
CARRY_FORWARD;
SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
COMBA_STORE(b[12]);
/* output 13 */
CARRY_FORWARD;
SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
COMBA_STORE(b[13]);
/* output 14 */
CARRY_FORWARD;
SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
COMBA_STORE(b[14]);
/* output 15 */
CARRY_FORWARD;
SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
COMBA_STORE(b[15]);
/* output 16 */
CARRY_FORWARD;
SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
COMBA_STORE(b[16]);
/* output 17 */
CARRY_FORWARD;
SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
COMBA_STORE(b[17]);
/* output 18 */
CARRY_FORWARD;
SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
COMBA_STORE(b[18]);
/* output 19 */
CARRY_FORWARD;
SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
COMBA_STORE(b[19]);
/* output 20 */
CARRY_FORWARD;
SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
COMBA_STORE(b[20]);
/* output 21 */
CARRY_FORWARD;
SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
COMBA_STORE(b[21]);
/* output 22 */
CARRY_FORWARD;
SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
COMBA_STORE(b[22]);
/* output 23 */
CARRY_FORWARD;
SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
COMBA_STORE(b[23]);
/* output 24 */
CARRY_FORWARD;
SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
COMBA_STORE(b[24]);
/* output 25 */
CARRY_FORWARD;
SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
COMBA_STORE(b[25]);
/* output 26 */
CARRY_FORWARD;
SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]);
COMBA_STORE(b[26]);
/* output 27 */
CARRY_FORWARD;
SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB;
COMBA_STORE(b[27]);
/* output 28 */
CARRY_FORWARD;
SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]);
COMBA_STORE(b[28]);
/* output 29 */
CARRY_FORWARD;
SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB;
COMBA_STORE(b[29]);
/* output 30 */
CARRY_FORWARD;
SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]);
COMBA_STORE(b[30]);
/* output 31 */
CARRY_FORWARD;
SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB;
COMBA_STORE(b[31]);
/* output 32 */
CARRY_FORWARD;
SQRADDSC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]);
COMBA_STORE(b[32]);
/* output 33 */
CARRY_FORWARD;
SQRADDSC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB;
COMBA_STORE(b[33]);
/* output 34 */
CARRY_FORWARD;
SQRADDSC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]);
COMBA_STORE(b[34]);
/* output 35 */
CARRY_FORWARD;
SQRADDSC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB;
COMBA_STORE(b[35]);
/* output 36 */
CARRY_FORWARD;
SQRADDSC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]);
COMBA_STORE(b[36]);
/* output 37 */
CARRY_FORWARD;
SQRADDSC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB;
COMBA_STORE(b[37]);
/* output 38 */
CARRY_FORWARD;
SQRADDSC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]);
COMBA_STORE(b[38]);
/* output 39 */
CARRY_FORWARD;
SQRADDSC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB;
COMBA_STORE(b[39]);
/* output 40 */
CARRY_FORWARD;
SQRADDSC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]);
COMBA_STORE(b[40]);
/* output 41 */
CARRY_FORWARD;
SQRADDSC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB;
COMBA_STORE(b[41]);
/* output 42 */
CARRY_FORWARD;
SQRADDSC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]);
COMBA_STORE(b[42]);
/* output 43 */
CARRY_FORWARD;
SQRADDSC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB;
COMBA_STORE(b[43]);
/* output 44 */
CARRY_FORWARD;
SQRADDSC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]);
COMBA_STORE(b[44]);
/* output 45 */
CARRY_FORWARD;
SQRADDSC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB;
COMBA_STORE(b[45]);
/* output 46 */
CARRY_FORWARD;
SQRADDSC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]);
COMBA_STORE(b[46]);
/* output 47 */
CARRY_FORWARD;
SQRADDSC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB;
COMBA_STORE(b[47]);
/* output 48 */
CARRY_FORWARD;
SQRADDSC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]);
COMBA_STORE(b[48]);
/* output 49 */
CARRY_FORWARD;
SQRADDSC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB;
COMBA_STORE(b[49]);
/* output 50 */
CARRY_FORWARD;
SQRADDSC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]);
COMBA_STORE(b[50]);
/* output 51 */
CARRY_FORWARD;
SQRADDSC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB;
COMBA_STORE(b[51]);
/* output 52 */
CARRY_FORWARD;
SQRADDSC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]);
COMBA_STORE(b[52]);
/* output 53 */
CARRY_FORWARD;
SQRADDSC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB;
COMBA_STORE(b[53]);
/* output 54 */
CARRY_FORWARD;
SQRADDSC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]);
COMBA_STORE(b[54]);
/* output 55 */
CARRY_FORWARD;
SQRADDSC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB;
COMBA_STORE(b[55]);
/* output 56 */
CARRY_FORWARD;
SQRADDSC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]);
COMBA_STORE(b[56]);
/* output 57 */
CARRY_FORWARD;
SQRADDSC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB;
COMBA_STORE(b[57]);
/* output 58 */
CARRY_FORWARD;
SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]);
COMBA_STORE(b[58]);
/* output 59 */
CARRY_FORWARD;
SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]);
COMBA_STORE(b[59]);
/* output 60 */
CARRY_FORWARD;
SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]);
COMBA_STORE(b[60]);
/* output 61 */
CARRY_FORWARD;
SQRADD2(a[30], a[31]);
COMBA_STORE(b[61]);
/* output 62 */
CARRY_FORWARD;
SQRADD(a[31], a[31]);
COMBA_STORE(b[62]);
COMBA_STORE2(b[63]);
COMBA_FINI;
B->used = 64;
B->sign = PSTM_ZPOS;
Memcpy(B->dp, b, 64 * sizeof(pstm_digit));
pstm_clamp(B);
return PSTM_OKAY;
}
# endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
/******************************************************************************/
/**
    Square a multiprecision integer: B = A**2.

    Entry point that picks a squaring routine from the digit count of A.
    When the matching speed optimization is compiled in, a 16 digit operand
    goes to pstm_sqr_comba16() and a 32 digit operand goes to
    pstm_sqr_comba32(). Every other operand goes to pstm_sqr_comba_gen().

    @param[in] pool Memory pool (forwarded to the generic routine only)
    @param[in] A Base
    @param[out] B Result
    @param[in,out] paD Temporary storage (forwarded to the generic routine only)
    @param[in] paDlen Number of items pointed to by paD
    @return The return value of whichever routine was selected
 */
int32_t pstm_sqr_comba(psPool_t *pool, const pstm_int *A, pstm_int *B,
    pstm_digit *paD, psSize_t paDlen)
{
# ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
    /* Fixed-size fast path for 16 digit operands */
    if (A->used == 16)
    {
        return pstm_sqr_comba16(A, B);
    }
# endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */

# ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
    /* Fixed-size fast path for 32 digit operands */
    if (A->used == 32)
    {
        return pstm_sqr_comba32(A, B);
    }
# endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */

    /* No fast path applies: use the generic routine */
    return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
}
#endif /* defined(USE_MATRIX_RSA) || defined(USE_MATRIX_ECC) || defined(USE_MATRIX_DH) */
/******************************************************************************/