aboutsummaryrefslogtreecommitdiff
path: root/networking/tls_pstm_sqr_comba.c
diff options
context:
space:
mode:
Diffstat (limited to 'networking/tls_pstm_sqr_comba.c')
-rw-r--r--networking/tls_pstm_sqr_comba.c1107
1 files changed, 1107 insertions, 0 deletions
diff --git a/networking/tls_pstm_sqr_comba.c b/networking/tls_pstm_sqr_comba.c
new file mode 100644
index 000000000..98186d31f
--- /dev/null
+++ b/networking/tls_pstm_sqr_comba.c
@@ -0,0 +1,1107 @@
1/*
2 * Copyright (C) 2017 Denys Vlasenko
3 *
4 * Licensed under GPLv2, see file LICENSE in this source tree.
5 */
6#include "tls.h"
7
8/**
9 * @file pstm_sqr_comba.c
10 * @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
11 *
12 * Multiprecision Squaring with Comba technique.
13 */
14/*
15 * Copyright (c) 2013-2015 INSIDE Secure Corporation
16 * Copyright (c) PeerSec Networks, 2002-2011
17 * All Rights Reserved
18 *
19 * The latest version of this code is available at http://www.matrixssl.org
20 *
21 * This software is open source; you can redistribute it and/or modify
22 * it under the terms of the GNU General Public License as published by
23 * the Free Software Foundation; either version 2 of the License, or
24 * (at your option) any later version.
25 *
26 * This General Public License does NOT permit incorporating this software
27 * into proprietary programs. If you are unable to comply with the GPL, a
28 * commercial license for this software may be purchased from INSIDE at
29 * http://www.insidesecure.com/eng/Company/Locations
30 *
31 * This program is distributed in WITHOUT ANY WARRANTY; without even the
32 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
33 * See the GNU General Public License for more details.
34 *
35 * You should have received a copy of the GNU General Public License
36 * along with this program; if not, write to the Free Software
37 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
38 * http://www.gnu.org/copyleft/gpl.html
39 */
40/******************************************************************************/
41
42///bbox
43//#include "../cryptoApi.h"
44#ifndef DISABLE_PSTM
45
46/******************************************************************************/
47#if defined(PSTM_X86)
48/* x86-32 optimized for 32 bit platforms. For 64 bit mode use X86_64 instead */
49#if !defined(__GNUC__) || !defined(__i386__)
50#error "PSTM_X86 option requires GCC and 32 bit mode x86 processor"
51#endif
52//#pragma message ("Using 32 bit x86 Assembly Optimizations")
53
54#define COMBA_START
55
/* Zero the three-word column accumulator c2:c1:c0. */
56#define CLEAR_CARRY \
57 c0 = c1 = c2 = 0;
58
/* Store the low accumulator word (c0) into x. */
59#define COMBA_STORE(x) \
60 x = c0;
61
/* Store the middle accumulator word (c1) into x. */
62#define COMBA_STORE2(x) \
63 x = c1;
64
/* Shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0. */
65#define CARRY_FORWARD \
66 do { c0 = c1; c1 = c2; c2 = 0; } while (0);
67
68#define COMBA_FINI
69
/*
 * Add i*i to the accumulator: i is loaded into %eax and multiplied by
 * itself, then the %edx:%eax product is added into c1:c0 with the final
 * carry added to c2.  The j argument is not referenced by this variant.
 */
70#define SQRADD(i, j) \
71asm( \
72 "movl %6,%%eax \n\t" \
73 "mull %%eax \n\t" \
74 "addl %%eax,%0 \n\t" \
75 "adcl %%edx,%1 \n\t" \
76 "adcl $0,%2 \n\t" \
77 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i) :"%eax","%edx","%cc");
78
/*
 * Add 2*i*j to the accumulator: a single mull computes i*j, and the
 * %edx:%eax product is then added into c2:c1:c0 twice.
 */
79#define SQRADD2(i, j) \
80asm( \
81 "movl %6,%%eax \n\t" \
82 "mull %7 \n\t" \
83 "addl %%eax,%0 \n\t" \
84 "adcl %%edx,%1 \n\t" \
85 "adcl $0,%2 \n\t" \
86 "addl %%eax,%0 \n\t" \
87 "adcl %%edx,%1 \n\t" \
88 "adcl $0,%2 \n\t" \
89 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","%cc");
90
/*
 * Start the secondary accumulator: sc1:sc0 = i*j and sc2 = 0 (the xorl).
 * The previous contents of sc0..sc2 are overwritten.
 */
91#define SQRADDSC(i, j) \
92asm( \
93 "movl %6,%%eax \n\t" \
94 "mull %7 \n\t" \
95 "movl %%eax,%0 \n\t" \
96 "movl %%edx,%1 \n\t" \
97 "xorl %2,%2 \n\t" \
98 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
99
/* Add i*j into the secondary accumulator sc2:sc1:sc0. */
100#define SQRADDAC(i, j) \
101asm( \
102 "movl %6,%%eax \n\t" \
103 "mull %7 \n\t" \
104 "addl %%eax,%0 \n\t" \
105 "adcl %%edx,%1 \n\t" \
106 "adcl $0,%2 \n\t" \
107 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%eax","%edx","%cc");
108
/*
 * Add the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice, i.e.
 * fold in double the sum collected by SQRADDSC/SQRADDAC.
 */
109#define SQRADDDB \
110asm( \
111 "addl %6,%0 \n\t" \
112 "adcl %7,%1 \n\t" \
113 "adcl %8,%2 \n\t" \
114 "addl %6,%0 \n\t" \
115 "adcl %7,%1 \n\t" \
116 "adcl %8,%2 \n\t" \
117 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "%cc");
118
119/******************************************************************************/
120#elif defined(PSTM_X86_64)
121/* x86-64 optimized */
122#if !defined(__GNUC__) || !defined(__x86_64__) || !defined(PSTM_64BIT)
123#error "PSTM_X86_64 option requires PSTM_64BIT, GCC and 64 bit mode x86 processor"
124#endif
125//#pragma message ("Using 64 bit x86_64 Assembly Optimizations")
126
127#define COMBA_START
128
/* Zero the three-word column accumulator c2:c1:c0. */
129#define CLEAR_CARRY \
130c0 = c1 = c2 = 0;
131
/* Store the low accumulator word (c0) into x. */
132#define COMBA_STORE(x) \
133x = c0;
134
/* Store the middle accumulator word (c1) into x. */
135#define COMBA_STORE2(x) \
136x = c1;
137
/* Shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0. */
138#define CARRY_FORWARD \
139do { c0 = c1; c1 = c2; c2 = 0; } while (0);
140
141#define COMBA_FINI
142
/*
 * Add i*i to the accumulator: i is loaded into %rax and multiplied by
 * itself, then the %rdx:%rax product is added into c1:c0 with the final
 * carry added to c2.  The j argument is not referenced by this variant.
 */
143#define SQRADD(i, j) \
144asm( \
145 "movq %6,%%rax \n\t" \
146 "mulq %%rax \n\t" \
147 "addq %%rax,%0 \n\t" \
148 "adcq %%rdx,%1 \n\t" \
149 "adcq $0,%2 \n\t" \
150 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i) :"%rax","%rdx","cc");
151
/*
 * Add 2*i*j to the accumulator: a single mulq computes i*j, and the
 * %rdx:%rax product is then added into c2:c1:c0 twice.
 */
152#define SQRADD2(i, j) \
153asm( \
154 "movq %6,%%rax \n\t" \
155 "mulq %7 \n\t" \
156 "addq %%rax,%0 \n\t" \
157 "adcq %%rdx,%1 \n\t" \
158 "adcq $0,%2 \n\t" \
159 "addq %%rax,%0 \n\t" \
160 "adcq %%rdx,%1 \n\t" \
161 "adcq $0,%2 \n\t" \
162 :"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
163
/*
 * Start the secondary accumulator: sc1:sc0 = i*j and sc2 = 0 (the xorq).
 * The previous contents of sc0..sc2 are overwritten.
 */
164#define SQRADDSC(i, j) \
165asm( \
166 "movq %6,%%rax \n\t" \
167 "mulq %7 \n\t" \
168 "movq %%rax,%0 \n\t" \
169 "movq %%rdx,%1 \n\t" \
170 "xorq %2,%2 \n\t" \
171 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
172
/* Add i*j into the secondary accumulator sc2:sc1:sc0. */
173#define SQRADDAC(i, j) \
174asm( \
175 "movq %6,%%rax \n\t" \
176 "mulq %7 \n\t" \
177 "addq %%rax,%0 \n\t" \
178 "adcq %%rdx,%1 \n\t" \
179 "adcq $0,%2 \n\t" \
180 :"=r"(sc0), "=r"(sc1), "=r"(sc2): "0"(sc0), "1"(sc1), "2"(sc2), "g"(i), "g"(j) :"%rax","%rdx","cc");
181
/*
 * Add the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice, i.e.
 * fold in double the sum collected by SQRADDSC/SQRADDAC.
 */
182#define SQRADDDB \
183asm( \
184 "addq %6,%0 \n\t" \
185 "adcq %7,%1 \n\t" \
186 "adcq %8,%2 \n\t" \
187 "addq %6,%0 \n\t" \
188 "adcq %7,%1 \n\t" \
189 "adcq %8,%2 \n\t" \
190 :"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(sc0), "r"(sc1), "r"(sc2) : "cc");
191
192/******************************************************************************/
193#elif defined(PSTM_ARM)
194/* ARM code */
195//#pragma message ("Using 32 bit ARM Assembly Optimizations")
196
197#define COMBA_START
198
/* Zero the three-word column accumulator c2:c1:c0. */
199#define CLEAR_CARRY \
200c0 = c1 = c2 = 0;
201
/* Store the low accumulator word (c0) into x. */
202#define COMBA_STORE(x) \
203x = c0;
204
/* Store the middle accumulator word (c1) into x. */
205#define COMBA_STORE2(x) \
206x = c1;
207
/* Shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0. */
208#define CARRY_FORWARD \
209do { c0 = c1; c1 = c2; c2 = 0; } while (0);
210
211#define COMBA_FINI
212
/*
 * In this variant only i is used: UMULL multiplies operand %6 by itself
 * into r1:r0, which is then added into c1:c0 with the carry going to c2.
 * r0 and r1 are used as fixed scratch registers and listed as clobbers.
 */
213/* multiplies point i and j, updates carry "c1" and digit c2 */
214#define SQRADD(i, j) \
215asm( \
216" UMULL r0,r1,%6,%6 \n\t" \
217" ADDS %0,%0,r0 \n\t" \
218" ADCS %1,%1,r1 \n\t" \
219" ADC %2,%2,#0 \n\t" \
220:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i) : "r0", "r1", "%cc");
221
/* The i*j product in r1:r0 is added into c2:c1:c0 twice (2*i*j). */
222/* for squaring some of the terms are doubled... */
223#define SQRADD2(i, j) \
224asm( \
225" UMULL r0,r1,%6,%7 \n\t" \
226" ADDS %0,%0,r0 \n\t" \
227" ADCS %1,%1,r1 \n\t" \
228" ADC %2,%2,#0 \n\t" \
229" ADDS %0,%0,r0 \n\t" \
230" ADCS %1,%1,r1 \n\t" \
231" ADC %2,%2,#0 \n\t" \
232:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "%cc");
233
/*
 * Start the secondary accumulator: UMULL writes i*j straight into
 * sc1:sc0, and SUB %2,%2,%2 zeroes sc2.
 */
234#define SQRADDSC(i, j) \
235asm( \
236" UMULL %0,%1,%6,%7 \n\t" \
237" SUB %2,%2,%2 \n\t" \
238:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "%cc");
239
/* Add i*j into the secondary accumulator sc2:sc1:sc0. */
240#define SQRADDAC(i, j) \
241asm( \
242" UMULL r0,r1,%6,%7 \n\t" \
243" ADDS %0,%0,r0 \n\t" \
244" ADCS %1,%1,r1 \n\t" \
245" ADC %2,%2,#0 \n\t" \
246:"=r"(sc0), "=r"(sc1), "=r"(sc2) : "0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j) : "r0", "r1", "%cc");
247
/*
 * Add the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice.
 * Note the operand order: here %3..%5 are sc0..sc2 and the tied inputs
 * for c0..c2 come last.
 */
248#define SQRADDDB \
249asm( \
250" ADDS %0,%0,%3 \n\t" \
251" ADCS %1,%1,%4 \n\t" \
252" ADC %2,%2,%5 \n\t" \
253" ADDS %0,%0,%3 \n\t" \
254" ADCS %1,%1,%4 \n\t" \
255" ADC %2,%2,%5 \n\t" \
256:"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "%cc");
257
258/******************************************************************************/
259#elif defined(PSTM_MIPS)
260/* MIPS32 */
261//#pragma message ("Using 32 bit MIPS Assembly Optimizations")
262
263#define COMBA_START
264
/* Zero the three-word column accumulator c2:c1:c0. */
265#define CLEAR_CARRY \
266c0 = c1 = c2 = 0;
267
/* Store the low accumulator word (c0) into x. */
268#define COMBA_STORE(x) \
269x = c0;
270
/* Store the middle accumulator word (c1) into x. */
271#define COMBA_STORE2(x) \
272x = c1;
273
/* Shift the accumulator down one word: c0 <- c1, c1 <- c2, c2 <- 0. */
274#define CARRY_FORWARD \
275do { c0 = c1; c1 = c2; c2 = 0; } while (0);
276
277#define COMBA_FINI
278
/*
 * In this variant only i is used (multu %6,%6).  The product is fetched
 * with mflo/mfhi into $12/$13 and added into c0/c1; each carry is
 * recovered with sltu (sum < addend) and propagated upwards by hand.
 *
 * NOTE(review): none of the multu-based macros in this section declare a
 * hi/lo clobber -- confirm whether the target toolchain needs one.
 */
279/* multiplies point i and j, updates carry "c1" and digit c2 */
280#define SQRADD(i, j) \
281asm( \
282 " multu %6,%6 \n\t" \
283 " mflo $12 \n\t" \
284 " mfhi $13 \n\t" \
285 " addu %0,%0,$12 \n\t" \
286 " sltu $12,%0,$12 \n\t" \
287 " addu %1,%1,$13 \n\t" \
288 " sltu $13,%1,$13 \n\t" \
289 " addu %1,%1,$12 \n\t" \
290 " sltu $12,%1,$12 \n\t" \
291 " addu %2,%2,$13 \n\t" \
292 " addu %2,%2,$12 \n\t" \
293 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i):"$12","$13");
294
/*
 * The product stays in $12/$13 while $14/$15 hold the carries, so the
 * same add-with-carry sequence can be run twice (2*i*j).
 */
295/* for squaring some of the terms are doubled... */
296#define SQRADD2(i, j) \
297asm( \
298 " multu %6,%7 \n\t" \
299 " mflo $12 \n\t" \
300 " mfhi $13 \n\t" \
301 \
302 " addu %0,%0,$12 \n\t" \
303 " sltu $14,%0,$12 \n\t" \
304 " addu %1,%1,$13 \n\t" \
305 " sltu $15,%1,$13 \n\t" \
306 " addu %1,%1,$14 \n\t" \
307 " sltu $14,%1,$14 \n\t" \
308 " addu %2,%2,$15 \n\t" \
309 " addu %2,%2,$14 \n\t" \
310 \
311 " addu %0,%0,$12 \n\t" \
312 " sltu $14,%0,$12 \n\t" \
313 " addu %1,%1,$13 \n\t" \
314 " sltu $15,%1,$13 \n\t" \
315 " addu %1,%1,$14 \n\t" \
316 " sltu $14,%1,$14 \n\t" \
317 " addu %2,%2,$15 \n\t" \
318 " addu %2,%2,$14 \n\t" \
319 :"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12", "$13", "$14", "$15");
320
/*
 * Start the secondary accumulator: sc0/sc1 receive the low/high halves
 * of i*j directly, and sc2 is zeroed by xor-ing it with itself.
 * NOTE(review): "%cc" is listed as a clobber here, unlike the other MIPS
 * macros -- confirm it is accepted/needed on this target.
 */
321#define SQRADDSC(i, j) \
322asm( \
323 " multu %6,%7 \n\t" \
324 " mflo %0 \n\t" \
325 " mfhi %1 \n\t" \
326 " xor %2,%2,%2 \n\t" \
327 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i),"r"(j) : "%cc");
328
/*
 * Add i*j into the secondary accumulator sc2:sc1:sc0.
 * NOTE(review): "$14" is in the clobber list but the template only uses
 * $12 and $13.
 */
329#define SQRADDAC(i, j) \
330asm( \
331 " multu %6,%7 \n\t" \
332 " mflo $12 \n\t" \
333 " mfhi $13 \n\t" \
334 " addu %0,%0,$12 \n\t" \
335 " sltu $12,%0,$12 \n\t" \
336 " addu %1,%1,$13 \n\t" \
337 " sltu $13,%1,$13 \n\t" \
338 " addu %1,%1,$12 \n\t" \
339 " sltu $12,%1,$12 \n\t" \
340 " addu %2,%2,$13 \n\t" \
341 " addu %2,%2,$12 \n\t" \
342 :"=r"(sc0), "=r"(sc1), "=r"(sc2):"0"(sc0), "1"(sc1), "2"(sc2), "r"(i), "r"(j):"$12", "$13", "$14");
343
/*
 * Add the secondary accumulator sc2:sc1:sc0 into c2:c1:c0 twice, using
 * $10/$11 for the carries.  Here %3..%5 are sc0..sc2 and the tied inputs
 * for c0..c2 come last.
 */
344#define SQRADDDB \
345asm( \
346 " addu %0,%0,%3 \n\t" \
347 " sltu $10,%0,%3 \n\t" \
348 " addu %1,%1,$10 \n\t" \
349 " sltu $10,%1,$10 \n\t" \
350 " addu %1,%1,%4 \n\t" \
351 " sltu $11,%1,%4 \n\t" \
352 " addu %2,%2,$10 \n\t" \
353 " addu %2,%2,$11 \n\t" \
354 " addu %2,%2,%5 \n\t" \
355 \
356 " addu %0,%0,%3 \n\t" \
357 " sltu $10,%0,%3 \n\t" \
358 " addu %1,%1,$10 \n\t" \
359 " sltu $10,%1,$10 \n\t" \
360 " addu %1,%1,%4 \n\t" \
361 " sltu $11,%1,%4 \n\t" \
362 " addu %2,%2,$10 \n\t" \
363 " addu %2,%2,$11 \n\t" \
364 " addu %2,%2,%5 \n\t" \
365 :"=r"(c0), "=r"(c1), "=r"(c2) : "r"(sc0), "r"(sc1), "r"(sc2), "0"(c0), "1"(c1), "2"(c2) : "$10", "$11");
366
367#else
368/******************************************************************************/
369#define PSTM_ISO
370/* ISO C portable code */
371
/*
 * ISO C versions of the column-accumulator primitives.
 *
 * Every macro works on variables that the expansion site must declare:
 *   pstm_digit c0, c1, c2     - running column sum, c0 least significant
 *   pstm_digit sc0, sc1, sc2  - secondary sum used by SQRADDSC/AC/DB
 *   pstm_word  tt             - scratch used by SQRADD2 only
 *
 * Macro arguments are parenthesised so that a compound expression is
 * converted or assigned as a whole.
 *
 * Each expansion already ends in ';', so the customary "MACRO(...);" at a
 * call site leaves an extra empty statement behind.  Because of that, do
 * not use one of these as the unbraced body of an if that has an else.
 */

/* Nothing to set up in the portable variant. */
#define COMBA_START

/* Zero the three-digit column accumulator. */
#define CLEAR_CARRY \
	c0 = c1 = c2 = 0;

/* Store the low accumulator digit into x. */
#define COMBA_STORE(x) \
	(x) = c0;

/* Store the middle accumulator digit into x. */
#define COMBA_STORE2(x) \
	(x) = c1;

/* Shift the accumulator down one digit: c0 <- c1, c1 <- c2, c2 <- 0. */
#define CARRY_FORWARD \
	do { c0 = c1; c1 = c2; c2 = 0; } while (0);

/* Nothing to tear down in the portable variant. */
#define COMBA_FINI

/* Add i*j to the accumulator c2:c1:c0, propagating carries upwards. */
#define SQRADD(i, j) \
	do { pstm_word t; \
		t = c0 + ((pstm_word)(i)) * ((pstm_word)(j)); c0 = (pstm_digit)t; \
		t = c1 + (t >> DIGIT_BIT); \
		c1 = (pstm_digit)t; c2 += (pstm_digit)(t >> DIGIT_BIT); \
	} while (0);


/*
 * Add 2*i*j to the accumulator: the product is computed once and then
 * added twice, so no intermediate value is wider than pstm_word.
 */
#define SQRADD2(i, j) \
	do { pstm_word t; \
		t = ((pstm_word)(i)) * ((pstm_word)(j)); \
		tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt; \
		tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \
		c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT); \
		tt = (pstm_word)c0 + t; c0 = (pstm_digit)tt; \
		tt = (pstm_word)c1 + (tt >> DIGIT_BIT); \
		c1 = (pstm_digit)tt; c2 += (pstm_digit)(tt >> DIGIT_BIT); \
	} while (0);

/* Start the secondary accumulator: sc1:sc0 = i*j, sc2 = 0. */
#define SQRADDSC(i, j) \
	do { pstm_word t; \
		t = ((pstm_word)(i)) * ((pstm_word)(j)); \
		sc0 = (pstm_digit)t; sc1 = (pstm_digit)(t >> DIGIT_BIT); sc2 = 0; \
	} while (0);

/* Add i*j into the secondary accumulator sc2:sc1:sc0. */
#define SQRADDAC(i, j) \
	do { pstm_word t; \
		t = ((pstm_word)sc0) + ((pstm_word)(i)) * ((pstm_word)(j)); \
		sc0 = (pstm_digit)t; \
		t = ((pstm_word)sc1) + (t >> DIGIT_BIT); sc1 = (pstm_digit)t; \
		sc2 += (pstm_digit)(t >> DIGIT_BIT); \
	} while (0);

/* Add twice the secondary accumulator sc2:sc1:sc0 into c2:c1:c0. */
#define SQRADDDB \
	do { pstm_word t; \
		t = ((pstm_word)sc0) + ((pstm_word)sc0) + ((pstm_word)c0); \
		c0 = (pstm_digit)t; \
		t = ((pstm_word)sc1) + ((pstm_word)sc1) + c1 + (t >> DIGIT_BIT); \
		c1 = (pstm_digit)t; \
		c2 = c2 + sc2 + sc2 + (pstm_digit)(t >> DIGIT_BIT); \
	} while (0);
431
432#endif /* ISO_C */
433
434/******************************************************************************/
435/*
436 Non-unrolled comba squarer
437 */
438///bbox: pool unused
439#define pstm_sqr_comba_gen(pool, A, B, paD, paDlen) \
440 pstm_sqr_comba_gen( A, B, paD, paDlen)
441static int32 pstm_sqr_comba_gen(psPool_t *pool, pstm_int *A, pstm_int *B,
442 pstm_digit *paD, uint32 paDlen)
443{
444 int16 paDfail, pa;
445 int32 ix, iz;
446 pstm_digit c0, c1, c2, *dst;
447#ifdef PSTM_ISO
448 pstm_word tt;
449#endif
450
451 paDfail = 0;
452 /* get size of output and trim */
453 pa = A->used + A->used;
454
455 /* number of output digits to produce */
456 COMBA_START;
457 CLEAR_CARRY;
458/*
459 If b is not large enough grow it and continue
460*/
461 if (B->alloc < pa) {
462 if (pstm_grow(B, pa) != PSTM_OKAY) {
463 return PS_MEM_FAIL;
464 }
465 }
466 if (paD != NULL) {
467 if (paDlen < (sizeof(pstm_digit) * pa)) {
468 paDfail = 1; /* have a paD, but it's not big enough */
469 dst = xzalloc(sizeof(pstm_digit) * pa);
470 } else {
471 dst = paD;
472 memset(dst, 0x0, paDlen);
473 }
474 } else {
475 dst = xzalloc(sizeof(pstm_digit) * pa);
476 }
477
478 for (ix = 0; ix < pa; ix++) {
479 int32 tx, ty, iy;
480 pstm_digit *tmpy, *tmpx;
481
482 /* get offsets into the two bignums */
483 ty = min(A->used-1, ix);
484 tx = ix - ty;
485
486 /* setup temp aliases */
487 tmpx = A->dp + tx;
488 tmpy = A->dp + ty;
489
490/*
491 This is the number of times the loop will iterate,
492 while (tx++ < a->used && ty-- >= 0) { ... }
493*/
494 iy = min(A->used-tx, ty+1);
495
496/*
497 now for squaring tx can never equal ty. We halve the distance since
498 they approach at a rate of 2x and we have to round because odd cases
499 need to be executed
500*/
501 iy = min(iy, (ty-tx+1)>>1);
502
503 /* forward carries */
504 CARRY_FORWARD;
505
506 /* execute loop */
507 for (iz = 0; iz < iy; iz++) {
508 SQRADD2(*tmpx++, *tmpy--);
509 }
510
511 /* even columns have the square term in them */
512 if ((ix&1) == 0) {
513 SQRADD(A->dp[ix>>1], A->dp[ix>>1]);
514 }
515
516 /* store it */
517 COMBA_STORE(dst[ix]);
518 }
519
520 COMBA_FINI;
521/*
522 setup dest
523 */
524 iz = B->used;
525 B->used = pa;
526 {
527 pstm_digit *tmpc;
528 tmpc = B->dp;
529 for (ix = 0; ix < pa; ix++) {
530 *tmpc++ = dst[ix];
531 }
532 /* clear unused digits (that existed in the old copy of c) */
533 for (; ix < iz; ix++) {
534 *tmpc++ = 0;
535 }
536 }
537 pstm_clamp(B);
538
539 if ((paD == NULL) || paDfail == 1) {
540 psFree(dst, pool);
541 }
542 return PS_SUCCESS;
543}
544
545/******************************************************************************/
546/*
547 Unrolled Comba loop for 1024 bit keys
548 */
549#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
/*
 * Fully unrolled Comba squaring for a 16-digit operand: B = A * A.
 *
 * Reads a[0]..a[15] from A->dp; A->used is not checked here, so the
 * caller is responsible for only passing 16-digit operands.  The 32
 * result digits are built in the local array b and copied into B->dp at
 * the end, after which B is marked positive and clamped.
 *
 * Each "output n" group computes one result column: CARRY_FORWARD shifts
 * the accumulator, the SQRADD* macros add every a[i]*a[j] with i+j == n,
 * and COMBA_STORE writes the low accumulator word to b[n].  Columns with
 * several cross terms sum them once in sc0..sc2 (SQRADDSC/SQRADDAC) and
 * fold the doubled sum in with SQRADDDB; shorter columns use SQRADD2.
 *
 * Returns PSTM_OKAY, or PS_MEM_FAIL if B could not be grown to 32 digits.
 */
550static int32 pstm_sqr_comba16(pstm_int *A, pstm_int *B)
551{
552 pstm_digit *a, b[32], c0, c1, c2, sc0, sc1, sc2;
553#ifdef PSTM_ISO
554 pstm_word tt;
555#endif
556
557 if (B->alloc < 32) {
558 if (pstm_grow(B, 32) != PSTM_OKAY) {
559 return PS_MEM_FAIL;
560 }
561 }
562 a = A->dp;
563 sc0 = sc1 = sc2 = 0;
564
565 COMBA_START;
566
567 /* clear carries */
568 CLEAR_CARRY;
569
570 /* output 0 */
571 SQRADD(a[0],a[0]);
572 COMBA_STORE(b[0]);
573
574 /* output 1 */
575 CARRY_FORWARD;
576 SQRADD2(a[0], a[1]);
577 COMBA_STORE(b[1]);
578
579 /* output 2 */
580 CARRY_FORWARD;
581 SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
582 COMBA_STORE(b[2]);
583
584 /* output 3 */
585 CARRY_FORWARD;
586 SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
587 COMBA_STORE(b[3]);
588
589 /* output 4 */
590 CARRY_FORWARD;
591 SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
592 COMBA_STORE(b[4]);
593
594 /* output 5 */
595 CARRY_FORWARD;
596 SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
597 COMBA_STORE(b[5]);
598
599 /* output 6 */
600 CARRY_FORWARD;
601 SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
602 COMBA_STORE(b[6]);
603
604 /* output 7 */
605 CARRY_FORWARD;
606 SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
607 COMBA_STORE(b[7]);
608
609 /* output 8 */
610 CARRY_FORWARD;
611 SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
612 COMBA_STORE(b[8]);
613
614 /* output 9 */
615 CARRY_FORWARD;
616 SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
617 COMBA_STORE(b[9]);
618
619 /* output 10 */
620 CARRY_FORWARD;
621 SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
622 COMBA_STORE(b[10]);
623
624 /* output 11 */
625 CARRY_FORWARD;
626 SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
627 COMBA_STORE(b[11]);
628
629 /* output 12 */
630 CARRY_FORWARD;
631 SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
632 COMBA_STORE(b[12]);
633
634 /* output 13 */
635 CARRY_FORWARD;
636 SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
637 COMBA_STORE(b[13]);
638
639 /* output 14 */
640 CARRY_FORWARD;
641 SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
642 COMBA_STORE(b[14]);
643
644 /* output 15 */
645 CARRY_FORWARD;
646 SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
647 COMBA_STORE(b[15]);
648
649 /* output 16 */
650 CARRY_FORWARD;
651 SQRADDSC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
652 COMBA_STORE(b[16]);
653
654 /* output 17 */
655 CARRY_FORWARD;
656 SQRADDSC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
657 COMBA_STORE(b[17]);
658
659 /* output 18 */
660 CARRY_FORWARD;
661 SQRADDSC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
662 COMBA_STORE(b[18]);
663
664 /* output 19 */
665 CARRY_FORWARD;
666 SQRADDSC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
667 COMBA_STORE(b[19]);
668
669 /* output 20 */
670 CARRY_FORWARD;
671 SQRADDSC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
672 COMBA_STORE(b[20]);
673
674 /* output 21 */
675 CARRY_FORWARD;
676 SQRADDSC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
677 COMBA_STORE(b[21]);
678
679 /* output 22 */
680 CARRY_FORWARD;
681 SQRADDSC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
682 COMBA_STORE(b[22]);
683
684 /* output 23 */
685 CARRY_FORWARD;
686 SQRADDSC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
687 COMBA_STORE(b[23]);
688
689 /* output 24 */
690 CARRY_FORWARD;
691 SQRADDSC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
692 COMBA_STORE(b[24]);
693
694 /* output 25 */
695 CARRY_FORWARD;
696 SQRADDSC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
697 COMBA_STORE(b[25]);
698
699 /* output 26 */
700 CARRY_FORWARD;
701 SQRADD2(a[11], a[15]); SQRADD2(a[12], a[14]); SQRADD(a[13], a[13]);
702 COMBA_STORE(b[26]);
703
704 /* output 27 */
705 CARRY_FORWARD;
706 SQRADD2(a[12], a[15]); SQRADD2(a[13], a[14]);
707 COMBA_STORE(b[27]);
708
709 /* output 28 */
710 CARRY_FORWARD;
711 SQRADD2(a[13], a[15]); SQRADD(a[14], a[14]);
712 COMBA_STORE(b[28]);
713
714 /* output 29 */
715 CARRY_FORWARD;
716 SQRADD2(a[14], a[15]);
717 COMBA_STORE(b[29]);
718
719 /* output 30 */
720 CARRY_FORWARD;
721 SQRADD(a[15], a[15]);
722 COMBA_STORE(b[30]);
 /* the last column's carry word (c1) is the top result digit */
723 COMBA_STORE2(b[31]);
724 COMBA_FINI;
725
 /* publish the 32-digit result; a square is always non-negative */
726 B->used = 32;
727 B->sign = PSTM_ZPOS;
728 memcpy(B->dp, b, 32 * sizeof(pstm_digit));
729 pstm_clamp(B);
730 return PSTM_OKAY;
731}
732#endif /* USE_1024_KEY_SPEED_OPTIMIZATIONS */
733
734
735#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
736static int32 pstm_sqr_comba32(pstm_int *A, pstm_int *B)
737{
738 pstm_digit *a, b[64], c0, c1, c2, sc0, sc1, sc2;
739#ifdef PSTM_ISO
740 pstm_word tt;
741#endif
742
743 if (B->alloc < 64) {
744 if (pstm_grow(B, 64) != PSTM_OKAY) {
745 return PS_MEM_FAIL;
746 }
747 }
748 sc0 = sc1 = sc2 = 0;
749 a = A->dp;
750 COMBA_START;
751
752 /* clear carries */
753 CLEAR_CARRY;
754
755 /* output 0 */
756 SQRADD(a[0],a[0]);
757 COMBA_STORE(b[0]);
758
759 /* output 1 */
760 CARRY_FORWARD;
761 SQRADD2(a[0], a[1]);
762 COMBA_STORE(b[1]);
763
764 /* output 2 */
765 CARRY_FORWARD;
766 SQRADD2(a[0], a[2]); SQRADD(a[1], a[1]);
767 COMBA_STORE(b[2]);
768
769 /* output 3 */
770 CARRY_FORWARD;
771 SQRADD2(a[0], a[3]); SQRADD2(a[1], a[2]);
772 COMBA_STORE(b[3]);
773
774 /* output 4 */
775 CARRY_FORWARD;
776 SQRADD2(a[0], a[4]); SQRADD2(a[1], a[3]); SQRADD(a[2], a[2]);
777 COMBA_STORE(b[4]);
778
779 /* output 5 */
780 CARRY_FORWARD;
781 SQRADDSC(a[0], a[5]); SQRADDAC(a[1], a[4]); SQRADDAC(a[2], a[3]); SQRADDDB;
782 COMBA_STORE(b[5]);
783
784 /* output 6 */
785 CARRY_FORWARD;
786 SQRADDSC(a[0], a[6]); SQRADDAC(a[1], a[5]); SQRADDAC(a[2], a[4]); SQRADDDB; SQRADD(a[3], a[3]);
787 COMBA_STORE(b[6]);
788
789 /* output 7 */
790 CARRY_FORWARD;
791 SQRADDSC(a[0], a[7]); SQRADDAC(a[1], a[6]); SQRADDAC(a[2], a[5]); SQRADDAC(a[3], a[4]); SQRADDDB;
792 COMBA_STORE(b[7]);
793
794 /* output 8 */
795 CARRY_FORWARD;
796 SQRADDSC(a[0], a[8]); SQRADDAC(a[1], a[7]); SQRADDAC(a[2], a[6]); SQRADDAC(a[3], a[5]); SQRADDDB; SQRADD(a[4], a[4]);
797 COMBA_STORE(b[8]);
798
799 /* output 9 */
800 CARRY_FORWARD;
801 SQRADDSC(a[0], a[9]); SQRADDAC(a[1], a[8]); SQRADDAC(a[2], a[7]); SQRADDAC(a[3], a[6]); SQRADDAC(a[4], a[5]); SQRADDDB;
802 COMBA_STORE(b[9]);
803
804 /* output 10 */
805 CARRY_FORWARD;
806 SQRADDSC(a[0], a[10]); SQRADDAC(a[1], a[9]); SQRADDAC(a[2], a[8]); SQRADDAC(a[3], a[7]); SQRADDAC(a[4], a[6]); SQRADDDB; SQRADD(a[5], a[5]);
807 COMBA_STORE(b[10]);
808
809 /* output 11 */
810 CARRY_FORWARD;
811 SQRADDSC(a[0], a[11]); SQRADDAC(a[1], a[10]); SQRADDAC(a[2], a[9]); SQRADDAC(a[3], a[8]); SQRADDAC(a[4], a[7]); SQRADDAC(a[5], a[6]); SQRADDDB;
812 COMBA_STORE(b[11]);
813
814 /* output 12 */
815 CARRY_FORWARD;
816 SQRADDSC(a[0], a[12]); SQRADDAC(a[1], a[11]); SQRADDAC(a[2], a[10]); SQRADDAC(a[3], a[9]); SQRADDAC(a[4], a[8]); SQRADDAC(a[5], a[7]); SQRADDDB; SQRADD(a[6], a[6]);
817 COMBA_STORE(b[12]);
818
819 /* output 13 */
820 CARRY_FORWARD;
821 SQRADDSC(a[0], a[13]); SQRADDAC(a[1], a[12]); SQRADDAC(a[2], a[11]); SQRADDAC(a[3], a[10]); SQRADDAC(a[4], a[9]); SQRADDAC(a[5], a[8]); SQRADDAC(a[6], a[7]); SQRADDDB;
822 COMBA_STORE(b[13]);
823
824 /* output 14 */
825 CARRY_FORWARD;
826 SQRADDSC(a[0], a[14]); SQRADDAC(a[1], a[13]); SQRADDAC(a[2], a[12]); SQRADDAC(a[3], a[11]); SQRADDAC(a[4], a[10]); SQRADDAC(a[5], a[9]); SQRADDAC(a[6], a[8]); SQRADDDB; SQRADD(a[7], a[7]);
827 COMBA_STORE(b[14]);
828
829 /* output 15 */
830 CARRY_FORWARD;
831 SQRADDSC(a[0], a[15]); SQRADDAC(a[1], a[14]); SQRADDAC(a[2], a[13]); SQRADDAC(a[3], a[12]); SQRADDAC(a[4], a[11]); SQRADDAC(a[5], a[10]); SQRADDAC(a[6], a[9]); SQRADDAC(a[7], a[8]); SQRADDDB;
832 COMBA_STORE(b[15]);
833
834 /* output 16 */
835 CARRY_FORWARD;
836 SQRADDSC(a[0], a[16]); SQRADDAC(a[1], a[15]); SQRADDAC(a[2], a[14]); SQRADDAC(a[3], a[13]); SQRADDAC(a[4], a[12]); SQRADDAC(a[5], a[11]); SQRADDAC(a[6], a[10]); SQRADDAC(a[7], a[9]); SQRADDDB; SQRADD(a[8], a[8]);
837 COMBA_STORE(b[16]);
838
839 /* output 17 */
840 CARRY_FORWARD;
841 SQRADDSC(a[0], a[17]); SQRADDAC(a[1], a[16]); SQRADDAC(a[2], a[15]); SQRADDAC(a[3], a[14]); SQRADDAC(a[4], a[13]); SQRADDAC(a[5], a[12]); SQRADDAC(a[6], a[11]); SQRADDAC(a[7], a[10]); SQRADDAC(a[8], a[9]); SQRADDDB;
842 COMBA_STORE(b[17]);
843
844 /* output 18 */
845 CARRY_FORWARD;
846 SQRADDSC(a[0], a[18]); SQRADDAC(a[1], a[17]); SQRADDAC(a[2], a[16]); SQRADDAC(a[3], a[15]); SQRADDAC(a[4], a[14]); SQRADDAC(a[5], a[13]); SQRADDAC(a[6], a[12]); SQRADDAC(a[7], a[11]); SQRADDAC(a[8], a[10]); SQRADDDB; SQRADD(a[9], a[9]);
847 COMBA_STORE(b[18]);
848
849 /* output 19 */
850 CARRY_FORWARD;
851 SQRADDSC(a[0], a[19]); SQRADDAC(a[1], a[18]); SQRADDAC(a[2], a[17]); SQRADDAC(a[3], a[16]); SQRADDAC(a[4], a[15]); SQRADDAC(a[5], a[14]); SQRADDAC(a[6], a[13]); SQRADDAC(a[7], a[12]); SQRADDAC(a[8], a[11]); SQRADDAC(a[9], a[10]); SQRADDDB;
852 COMBA_STORE(b[19]);
853
854 /* output 20 */
855 CARRY_FORWARD;
856 SQRADDSC(a[0], a[20]); SQRADDAC(a[1], a[19]); SQRADDAC(a[2], a[18]); SQRADDAC(a[3], a[17]); SQRADDAC(a[4], a[16]); SQRADDAC(a[5], a[15]); SQRADDAC(a[6], a[14]); SQRADDAC(a[7], a[13]); SQRADDAC(a[8], a[12]); SQRADDAC(a[9], a[11]); SQRADDDB; SQRADD(a[10], a[10]);
857 COMBA_STORE(b[20]);
858
859 /* output 21 */
860 CARRY_FORWARD;
861 SQRADDSC(a[0], a[21]); SQRADDAC(a[1], a[20]); SQRADDAC(a[2], a[19]); SQRADDAC(a[3], a[18]); SQRADDAC(a[4], a[17]); SQRADDAC(a[5], a[16]); SQRADDAC(a[6], a[15]); SQRADDAC(a[7], a[14]); SQRADDAC(a[8], a[13]); SQRADDAC(a[9], a[12]); SQRADDAC(a[10], a[11]); SQRADDDB;
862 COMBA_STORE(b[21]);
863
864 /* output 22 */
865 CARRY_FORWARD;
866 SQRADDSC(a[0], a[22]); SQRADDAC(a[1], a[21]); SQRADDAC(a[2], a[20]); SQRADDAC(a[3], a[19]); SQRADDAC(a[4], a[18]); SQRADDAC(a[5], a[17]); SQRADDAC(a[6], a[16]); SQRADDAC(a[7], a[15]); SQRADDAC(a[8], a[14]); SQRADDAC(a[9], a[13]); SQRADDAC(a[10], a[12]); SQRADDDB; SQRADD(a[11], a[11]);
867 COMBA_STORE(b[22]);
868
869 /* output 23 */
870 CARRY_FORWARD;
871 SQRADDSC(a[0], a[23]); SQRADDAC(a[1], a[22]); SQRADDAC(a[2], a[21]); SQRADDAC(a[3], a[20]); SQRADDAC(a[4], a[19]); SQRADDAC(a[5], a[18]); SQRADDAC(a[6], a[17]); SQRADDAC(a[7], a[16]); SQRADDAC(a[8], a[15]); SQRADDAC(a[9], a[14]); SQRADDAC(a[10], a[13]); SQRADDAC(a[11], a[12]); SQRADDDB;
872 COMBA_STORE(b[23]);
873
874 /* output 24 */
875 CARRY_FORWARD;
876 SQRADDSC(a[0], a[24]); SQRADDAC(a[1], a[23]); SQRADDAC(a[2], a[22]); SQRADDAC(a[3], a[21]); SQRADDAC(a[4], a[20]); SQRADDAC(a[5], a[19]); SQRADDAC(a[6], a[18]); SQRADDAC(a[7], a[17]); SQRADDAC(a[8], a[16]); SQRADDAC(a[9], a[15]); SQRADDAC(a[10], a[14]); SQRADDAC(a[11], a[13]); SQRADDDB; SQRADD(a[12], a[12]);
877 COMBA_STORE(b[24]);
878
879 /* output 25 */
880 CARRY_FORWARD;
881 SQRADDSC(a[0], a[25]); SQRADDAC(a[1], a[24]); SQRADDAC(a[2], a[23]); SQRADDAC(a[3], a[22]); SQRADDAC(a[4], a[21]); SQRADDAC(a[5], a[20]); SQRADDAC(a[6], a[19]); SQRADDAC(a[7], a[18]); SQRADDAC(a[8], a[17]); SQRADDAC(a[9], a[16]); SQRADDAC(a[10], a[15]); SQRADDAC(a[11], a[14]); SQRADDAC(a[12], a[13]); SQRADDDB;
882 COMBA_STORE(b[25]);
883
884 /* output 26 */
885 CARRY_FORWARD;
886 SQRADDSC(a[0], a[26]); SQRADDAC(a[1], a[25]); SQRADDAC(a[2], a[24]); SQRADDAC(a[3], a[23]); SQRADDAC(a[4], a[22]); SQRADDAC(a[5], a[21]); SQRADDAC(a[6], a[20]); SQRADDAC(a[7], a[19]); SQRADDAC(a[8], a[18]); SQRADDAC(a[9], a[17]); SQRADDAC(a[10], a[16]); SQRADDAC(a[11], a[15]); SQRADDAC(a[12], a[14]); SQRADDDB; SQRADD(a[13], a[13]);
887 COMBA_STORE(b[26]);
888
889 /* output 27 */
890 CARRY_FORWARD;
891 SQRADDSC(a[0], a[27]); SQRADDAC(a[1], a[26]); SQRADDAC(a[2], a[25]); SQRADDAC(a[3], a[24]); SQRADDAC(a[4], a[23]); SQRADDAC(a[5], a[22]); SQRADDAC(a[6], a[21]); SQRADDAC(a[7], a[20]); SQRADDAC(a[8], a[19]); SQRADDAC(a[9], a[18]); SQRADDAC(a[10], a[17]); SQRADDAC(a[11], a[16]); SQRADDAC(a[12], a[15]); SQRADDAC(a[13], a[14]); SQRADDDB;
892 COMBA_STORE(b[27]);
893
894 /* output 28 */
895 CARRY_FORWARD;
896 SQRADDSC(a[0], a[28]); SQRADDAC(a[1], a[27]); SQRADDAC(a[2], a[26]); SQRADDAC(a[3], a[25]); SQRADDAC(a[4], a[24]); SQRADDAC(a[5], a[23]); SQRADDAC(a[6], a[22]); SQRADDAC(a[7], a[21]); SQRADDAC(a[8], a[20]); SQRADDAC(a[9], a[19]); SQRADDAC(a[10], a[18]); SQRADDAC(a[11], a[17]); SQRADDAC(a[12], a[16]); SQRADDAC(a[13], a[15]); SQRADDDB; SQRADD(a[14], a[14]);
897 COMBA_STORE(b[28]);
898
899 /* output 29 */
900 CARRY_FORWARD;
901 SQRADDSC(a[0], a[29]); SQRADDAC(a[1], a[28]); SQRADDAC(a[2], a[27]); SQRADDAC(a[3], a[26]); SQRADDAC(a[4], a[25]); SQRADDAC(a[5], a[24]); SQRADDAC(a[6], a[23]); SQRADDAC(a[7], a[22]); SQRADDAC(a[8], a[21]); SQRADDAC(a[9], a[20]); SQRADDAC(a[10], a[19]); SQRADDAC(a[11], a[18]); SQRADDAC(a[12], a[17]); SQRADDAC(a[13], a[16]); SQRADDAC(a[14], a[15]); SQRADDDB;
902 COMBA_STORE(b[29]);
903
904 /* output 30 */
905 CARRY_FORWARD;
906 SQRADDSC(a[0], a[30]); SQRADDAC(a[1], a[29]); SQRADDAC(a[2], a[28]); SQRADDAC(a[3], a[27]); SQRADDAC(a[4], a[26]); SQRADDAC(a[5], a[25]); SQRADDAC(a[6], a[24]); SQRADDAC(a[7], a[23]); SQRADDAC(a[8], a[22]); SQRADDAC(a[9], a[21]); SQRADDAC(a[10], a[20]); SQRADDAC(a[11], a[19]); SQRADDAC(a[12], a[18]); SQRADDAC(a[13], a[17]); SQRADDAC(a[14], a[16]); SQRADDDB; SQRADD(a[15], a[15]);
907 COMBA_STORE(b[30]);
908
909 /* output 31 */
910 CARRY_FORWARD;
911 SQRADDSC(a[0], a[31]); SQRADDAC(a[1], a[30]); SQRADDAC(a[2], a[29]); SQRADDAC(a[3], a[28]); SQRADDAC(a[4], a[27]); SQRADDAC(a[5], a[26]); SQRADDAC(a[6], a[25]); SQRADDAC(a[7], a[24]); SQRADDAC(a[8], a[23]); SQRADDAC(a[9], a[22]); SQRADDAC(a[10], a[21]); SQRADDAC(a[11], a[20]); SQRADDAC(a[12], a[19]); SQRADDAC(a[13], a[18]); SQRADDAC(a[14], a[17]); SQRADDAC(a[15], a[16]); SQRADDDB;
912 COMBA_STORE(b[31]);
913
914 /* output 32 */
915 CARRY_FORWARD;
916 SQRADDSC(a[1], a[31]); SQRADDAC(a[2], a[30]); SQRADDAC(a[3], a[29]); SQRADDAC(a[4], a[28]); SQRADDAC(a[5], a[27]); SQRADDAC(a[6], a[26]); SQRADDAC(a[7], a[25]); SQRADDAC(a[8], a[24]); SQRADDAC(a[9], a[23]); SQRADDAC(a[10], a[22]); SQRADDAC(a[11], a[21]); SQRADDAC(a[12], a[20]); SQRADDAC(a[13], a[19]); SQRADDAC(a[14], a[18]); SQRADDAC(a[15], a[17]); SQRADDDB; SQRADD(a[16], a[16]);
917 COMBA_STORE(b[32]);
918
919 /* output 33 */
920 CARRY_FORWARD;
921 SQRADDSC(a[2], a[31]); SQRADDAC(a[3], a[30]); SQRADDAC(a[4], a[29]); SQRADDAC(a[5], a[28]); SQRADDAC(a[6], a[27]); SQRADDAC(a[7], a[26]); SQRADDAC(a[8], a[25]); SQRADDAC(a[9], a[24]); SQRADDAC(a[10], a[23]); SQRADDAC(a[11], a[22]); SQRADDAC(a[12], a[21]); SQRADDAC(a[13], a[20]); SQRADDAC(a[14], a[19]); SQRADDAC(a[15], a[18]); SQRADDAC(a[16], a[17]); SQRADDDB;
922 COMBA_STORE(b[33]);
923
924 /* output 34 */
925 CARRY_FORWARD;
926 SQRADDSC(a[3], a[31]); SQRADDAC(a[4], a[30]); SQRADDAC(a[5], a[29]); SQRADDAC(a[6], a[28]); SQRADDAC(a[7], a[27]); SQRADDAC(a[8], a[26]); SQRADDAC(a[9], a[25]); SQRADDAC(a[10], a[24]); SQRADDAC(a[11], a[23]); SQRADDAC(a[12], a[22]); SQRADDAC(a[13], a[21]); SQRADDAC(a[14], a[20]); SQRADDAC(a[15], a[19]); SQRADDAC(a[16], a[18]); SQRADDDB; SQRADD(a[17], a[17]);
927 COMBA_STORE(b[34]);
928
929 /* output 35 */
930 CARRY_FORWARD;
931 SQRADDSC(a[4], a[31]); SQRADDAC(a[5], a[30]); SQRADDAC(a[6], a[29]); SQRADDAC(a[7], a[28]); SQRADDAC(a[8], a[27]); SQRADDAC(a[9], a[26]); SQRADDAC(a[10], a[25]); SQRADDAC(a[11], a[24]); SQRADDAC(a[12], a[23]); SQRADDAC(a[13], a[22]); SQRADDAC(a[14], a[21]); SQRADDAC(a[15], a[20]); SQRADDAC(a[16], a[19]); SQRADDAC(a[17], a[18]); SQRADDDB;
932 COMBA_STORE(b[35]);
933
934 /* output 36 */
935 CARRY_FORWARD;
936 SQRADDSC(a[5], a[31]); SQRADDAC(a[6], a[30]); SQRADDAC(a[7], a[29]); SQRADDAC(a[8], a[28]); SQRADDAC(a[9], a[27]); SQRADDAC(a[10], a[26]); SQRADDAC(a[11], a[25]); SQRADDAC(a[12], a[24]); SQRADDAC(a[13], a[23]); SQRADDAC(a[14], a[22]); SQRADDAC(a[15], a[21]); SQRADDAC(a[16], a[20]); SQRADDAC(a[17], a[19]); SQRADDDB; SQRADD(a[18], a[18]);
937 COMBA_STORE(b[36]);
938
939 /* output 37 */
940 CARRY_FORWARD;
941 SQRADDSC(a[6], a[31]); SQRADDAC(a[7], a[30]); SQRADDAC(a[8], a[29]); SQRADDAC(a[9], a[28]); SQRADDAC(a[10], a[27]); SQRADDAC(a[11], a[26]); SQRADDAC(a[12], a[25]); SQRADDAC(a[13], a[24]); SQRADDAC(a[14], a[23]); SQRADDAC(a[15], a[22]); SQRADDAC(a[16], a[21]); SQRADDAC(a[17], a[20]); SQRADDAC(a[18], a[19]); SQRADDDB;
942 COMBA_STORE(b[37]);
943
944 /* output 38 */
945 CARRY_FORWARD;
946 SQRADDSC(a[7], a[31]); SQRADDAC(a[8], a[30]); SQRADDAC(a[9], a[29]); SQRADDAC(a[10], a[28]); SQRADDAC(a[11], a[27]); SQRADDAC(a[12], a[26]); SQRADDAC(a[13], a[25]); SQRADDAC(a[14], a[24]); SQRADDAC(a[15], a[23]); SQRADDAC(a[16], a[22]); SQRADDAC(a[17], a[21]); SQRADDAC(a[18], a[20]); SQRADDDB; SQRADD(a[19], a[19]);
947 COMBA_STORE(b[38]);
948
949 /* output 39 */
950 CARRY_FORWARD;
951 SQRADDSC(a[8], a[31]); SQRADDAC(a[9], a[30]); SQRADDAC(a[10], a[29]); SQRADDAC(a[11], a[28]); SQRADDAC(a[12], a[27]); SQRADDAC(a[13], a[26]); SQRADDAC(a[14], a[25]); SQRADDAC(a[15], a[24]); SQRADDAC(a[16], a[23]); SQRADDAC(a[17], a[22]); SQRADDAC(a[18], a[21]); SQRADDAC(a[19], a[20]); SQRADDDB;
952 COMBA_STORE(b[39]);
953
954 /* output 40 */
955 CARRY_FORWARD;
956 SQRADDSC(a[9], a[31]); SQRADDAC(a[10], a[30]); SQRADDAC(a[11], a[29]); SQRADDAC(a[12], a[28]); SQRADDAC(a[13], a[27]); SQRADDAC(a[14], a[26]); SQRADDAC(a[15], a[25]); SQRADDAC(a[16], a[24]); SQRADDAC(a[17], a[23]); SQRADDAC(a[18], a[22]); SQRADDAC(a[19], a[21]); SQRADDDB; SQRADD(a[20], a[20]);
957 COMBA_STORE(b[40]);
958
959 /* output 41 */
960 CARRY_FORWARD;
961 SQRADDSC(a[10], a[31]); SQRADDAC(a[11], a[30]); SQRADDAC(a[12], a[29]); SQRADDAC(a[13], a[28]); SQRADDAC(a[14], a[27]); SQRADDAC(a[15], a[26]); SQRADDAC(a[16], a[25]); SQRADDAC(a[17], a[24]); SQRADDAC(a[18], a[23]); SQRADDAC(a[19], a[22]); SQRADDAC(a[20], a[21]); SQRADDDB;
962 COMBA_STORE(b[41]);
963
964 /* output 42 */
965 CARRY_FORWARD;
966 SQRADDSC(a[11], a[31]); SQRADDAC(a[12], a[30]); SQRADDAC(a[13], a[29]); SQRADDAC(a[14], a[28]); SQRADDAC(a[15], a[27]); SQRADDAC(a[16], a[26]); SQRADDAC(a[17], a[25]); SQRADDAC(a[18], a[24]); SQRADDAC(a[19], a[23]); SQRADDAC(a[20], a[22]); SQRADDDB; SQRADD(a[21], a[21]);
967 COMBA_STORE(b[42]);
968
969 /* output 43 */
970 CARRY_FORWARD;
971 SQRADDSC(a[12], a[31]); SQRADDAC(a[13], a[30]); SQRADDAC(a[14], a[29]); SQRADDAC(a[15], a[28]); SQRADDAC(a[16], a[27]); SQRADDAC(a[17], a[26]); SQRADDAC(a[18], a[25]); SQRADDAC(a[19], a[24]); SQRADDAC(a[20], a[23]); SQRADDAC(a[21], a[22]); SQRADDDB;
972 COMBA_STORE(b[43]);
973
974 /* output 44 */
975 CARRY_FORWARD;
976 SQRADDSC(a[13], a[31]); SQRADDAC(a[14], a[30]); SQRADDAC(a[15], a[29]); SQRADDAC(a[16], a[28]); SQRADDAC(a[17], a[27]); SQRADDAC(a[18], a[26]); SQRADDAC(a[19], a[25]); SQRADDAC(a[20], a[24]); SQRADDAC(a[21], a[23]); SQRADDDB; SQRADD(a[22], a[22]);
977 COMBA_STORE(b[44]);
978
979 /* output 45 */
980 CARRY_FORWARD;
981 SQRADDSC(a[14], a[31]); SQRADDAC(a[15], a[30]); SQRADDAC(a[16], a[29]); SQRADDAC(a[17], a[28]); SQRADDAC(a[18], a[27]); SQRADDAC(a[19], a[26]); SQRADDAC(a[20], a[25]); SQRADDAC(a[21], a[24]); SQRADDAC(a[22], a[23]); SQRADDDB;
982 COMBA_STORE(b[45]);
983
984 /* output 46 */
985 CARRY_FORWARD;
986 SQRADDSC(a[15], a[31]); SQRADDAC(a[16], a[30]); SQRADDAC(a[17], a[29]); SQRADDAC(a[18], a[28]); SQRADDAC(a[19], a[27]); SQRADDAC(a[20], a[26]); SQRADDAC(a[21], a[25]); SQRADDAC(a[22], a[24]); SQRADDDB; SQRADD(a[23], a[23]);
987 COMBA_STORE(b[46]);
988
989 /* output 47 */
990 CARRY_FORWARD;
991 SQRADDSC(a[16], a[31]); SQRADDAC(a[17], a[30]); SQRADDAC(a[18], a[29]); SQRADDAC(a[19], a[28]); SQRADDAC(a[20], a[27]); SQRADDAC(a[21], a[26]); SQRADDAC(a[22], a[25]); SQRADDAC(a[23], a[24]); SQRADDDB;
992 COMBA_STORE(b[47]);
993
994 /* output 48 */
995 CARRY_FORWARD;
996 SQRADDSC(a[17], a[31]); SQRADDAC(a[18], a[30]); SQRADDAC(a[19], a[29]); SQRADDAC(a[20], a[28]); SQRADDAC(a[21], a[27]); SQRADDAC(a[22], a[26]); SQRADDAC(a[23], a[25]); SQRADDDB; SQRADD(a[24], a[24]);
997 COMBA_STORE(b[48]);
998
999 /* output 49 */
1000 CARRY_FORWARD;
1001 SQRADDSC(a[18], a[31]); SQRADDAC(a[19], a[30]); SQRADDAC(a[20], a[29]); SQRADDAC(a[21], a[28]); SQRADDAC(a[22], a[27]); SQRADDAC(a[23], a[26]); SQRADDAC(a[24], a[25]); SQRADDDB;
1002 COMBA_STORE(b[49]);
1003
1004 /* output 50 */
1005 CARRY_FORWARD;
1006 SQRADDSC(a[19], a[31]); SQRADDAC(a[20], a[30]); SQRADDAC(a[21], a[29]); SQRADDAC(a[22], a[28]); SQRADDAC(a[23], a[27]); SQRADDAC(a[24], a[26]); SQRADDDB; SQRADD(a[25], a[25]);
1007 COMBA_STORE(b[50]);
1008
1009 /* output 51 */
1010 CARRY_FORWARD;
1011 SQRADDSC(a[20], a[31]); SQRADDAC(a[21], a[30]); SQRADDAC(a[22], a[29]); SQRADDAC(a[23], a[28]); SQRADDAC(a[24], a[27]); SQRADDAC(a[25], a[26]); SQRADDDB;
1012 COMBA_STORE(b[51]);
1013
1014 /* output 52 */
1015 CARRY_FORWARD;
1016 SQRADDSC(a[21], a[31]); SQRADDAC(a[22], a[30]); SQRADDAC(a[23], a[29]); SQRADDAC(a[24], a[28]); SQRADDAC(a[25], a[27]); SQRADDDB; SQRADD(a[26], a[26]);
1017 COMBA_STORE(b[52]);
1018
1019 /* output 53 */
1020 CARRY_FORWARD;
1021 SQRADDSC(a[22], a[31]); SQRADDAC(a[23], a[30]); SQRADDAC(a[24], a[29]); SQRADDAC(a[25], a[28]); SQRADDAC(a[26], a[27]); SQRADDDB;
1022 COMBA_STORE(b[53]);
1023
1024 /* output 54 */
1025 CARRY_FORWARD;
1026 SQRADDSC(a[23], a[31]); SQRADDAC(a[24], a[30]); SQRADDAC(a[25], a[29]); SQRADDAC(a[26], a[28]); SQRADDDB; SQRADD(a[27], a[27]);
1027 COMBA_STORE(b[54]);
1028
1029 /* output 55 */
1030 CARRY_FORWARD;
1031 SQRADDSC(a[24], a[31]); SQRADDAC(a[25], a[30]); SQRADDAC(a[26], a[29]); SQRADDAC(a[27], a[28]); SQRADDDB;
1032 COMBA_STORE(b[55]);
1033
1034 /* output 56 */
1035 CARRY_FORWARD;
1036 SQRADDSC(a[25], a[31]); SQRADDAC(a[26], a[30]); SQRADDAC(a[27], a[29]); SQRADDDB; SQRADD(a[28], a[28]);
1037 COMBA_STORE(b[56]);
1038
1039 /* output 57 */
1040 CARRY_FORWARD;
1041 SQRADDSC(a[26], a[31]); SQRADDAC(a[27], a[30]); SQRADDAC(a[28], a[29]); SQRADDDB;
1042 COMBA_STORE(b[57]);
1043
1044 /* output 58 */
1045 CARRY_FORWARD;
1046 SQRADD2(a[27], a[31]); SQRADD2(a[28], a[30]); SQRADD(a[29], a[29]);
1047 COMBA_STORE(b[58]);
1048
1049 /* output 59 */
1050 CARRY_FORWARD;
1051 SQRADD2(a[28], a[31]); SQRADD2(a[29], a[30]);
1052 COMBA_STORE(b[59]);
1053
1054 /* output 60 */
1055 CARRY_FORWARD;
1056 SQRADD2(a[29], a[31]); SQRADD(a[30], a[30]);
1057 COMBA_STORE(b[60]);
1058
1059 /* output 61 */
1060 CARRY_FORWARD;
1061 SQRADD2(a[30], a[31]);
1062 COMBA_STORE(b[61]);
1063
1064 /* output 62 */
1065 CARRY_FORWARD;
1066 SQRADD(a[31], a[31]);
1067 COMBA_STORE(b[62]);
1068 COMBA_STORE2(b[63]);
1069 COMBA_FINI;
1070
1071 B->used = 64;
1072 B->sign = PSTM_ZPOS;
1073 memcpy(B->dp, b, 64 * sizeof(pstm_digit));
1074 pstm_clamp(B);
1075 return PSTM_OKAY;
1076}
1077#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1078
1079/******************************************************************************/
1080/*
1081 */
1082int32 pstm_sqr_comba(psPool_t *pool, pstm_int *A, pstm_int *B, pstm_digit *paD,
1083 uint32 paDlen)
1084{
1085#ifdef USE_1024_KEY_SPEED_OPTIMIZATIONS
1086 if (A->used == 16) {
1087 return pstm_sqr_comba16(A, B);
1088 } else {
1089#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
1090 if (A->used == 32) {
1091 return pstm_sqr_comba32(A, B);
1092 }
1093#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1094 return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
1095 }
1096#else
1097#ifdef USE_2048_KEY_SPEED_OPTIMIZATIONS
1098 if (A->used == 32) {
1099 return pstm_sqr_comba32(A, B);
1100 }
1101#endif /* USE_2048_KEY_SPEED_OPTIMIZATIONS */
1102 return pstm_sqr_comba_gen(pool, A, B, paD, paDlen);
1103#endif
1104}
1105
1106#endif /* DISABLE_PSTM */
1107/******************************************************************************/