From a95585a25ab25668b931a78b7543f707a3354db8 Mon Sep 17 00:00:00 2001
From: djm <>
Date: Fri, 29 Apr 2005 05:37:34 +0000
Subject: import of openssl-0.9.7g; tested on platforms from alpha to zaurus,
 ok deraadt@

---
 src/lib/libcrypto/bn/asm/ia64.S       |  217 ++--
 src/lib/libcrypto/bn/asm/ppc.pl       | 2081 +++++++++++++++++++++++++++++++++
 src/lib/libcrypto/bn/asm/x86_64-gcc.c |   54 +-
 3 files changed, 2203 insertions(+), 149 deletions(-)
 create mode 100644 src/lib/libcrypto/bn/asm/ppc.pl

diff --git a/src/lib/libcrypto/bn/asm/ia64.S b/src/lib/libcrypto/bn/asm/ia64.S
index 7dfda85566..7b82b820e6 100644
--- a/src/lib/libcrypto/bn/asm/ia64.S
+++ b/src/lib/libcrypto/bn/asm/ia64.S
@@ -1,6 +1,6 @@
 .explicit
 .text
-.ident	"ia64.S, Version 2.0"
+.ident	"ia64.S, Version 2.1"
 .ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

 //
@@ -35,7 +35,7 @@
 // What does it mean? You might ratiocinate that the original code
 // should run just faster... Because sum of latencies is smaller...
 // Wrong! Note that getf latency increased. This means that if a loop is
-// scheduled for lower latency (and they are), then it will suffer from
+// scheduled for lower latency (as they were), then it will suffer from
 // stall condition and the code will therefore turn anti-scalable, e.g.
 // original bn_mul_words spun at 5*n or 2.5 times slower than expected
 // on Itanium2! What to do? Reschedule loops for Itanium2? But then
@@ -145,6 +145,12 @@
 // -Drum=nop.m in command line.
 //

+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+#define	ADDP	addp4
+#else
+#define	ADDP	add
+#endif
+
 #if 1
 //
 // bn_[add|sub]_words routines.
@@ -178,27 +184,12 @@ bn_add_words:
 	brp.loop.imp	.L_bn_add_words_ctop,.L_bn_add_words_cend-16
 	}
 .body
-{ .mib;
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-	addp4	r14=0,r32		// rp
-#else
-	mov	r14=r32			// rp
-#endif
+{ .mib;	ADDP	r14=0,r32		// rp
 	mov	r9=pr		};;
-{ .mii;
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-	addp4	r15=0,r33		// ap
-#else
-	mov	r15=r33			// ap
-#endif
+{ .mii;	ADDP	r15=0,r33		// ap
 	mov	ar.lc=r10
 	mov	ar.ec=6	}
-{ .mib;
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-	addp4	r16=0,r34		// bp
-#else
-	mov	r16=r34			// bp
-#endif
+{ .mib;	ADDP	r16=0,r34		// bp
 	mov	pr.rot=1<<16	};;
 .L_bn_add_words_ctop:
@@ -246,27 +237,12 @@ bn_sub_words:
 	brp.loop.imp	.L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
 	}
 .body
-{ .mib;
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-	addp4	r14=0,r32		// rp
-#else
-	mov	r14=r32			// rp
-#endif
+{ .mib;	ADDP	r14=0,r32		// rp
 	mov	r9=pr		};;
-{ .mii;
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-	addp4	r15=0,r33		// ap
-#else
-	mov	r15=r33			// ap
-#endif
+{ .mii;	ADDP	r15=0,r33		// ap
 	mov	ar.lc=r10
 	mov	ar.ec=6	}
-{ .mib;
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-	addp4	r16=0,r34		// bp
-#else
-	mov	r16=r34			// bp
-#endif
+{ .mib;	ADDP	r16=0,r34		// bp
 	mov	pr.rot=1<<16	};;
 .L_bn_sub_words_ctop:
@@ -332,16 +308,10 @@ bn_mul_words:

 #ifndef XMA_TEMPTATION

-{ .mii;
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-	addp4	r14=0,r32		// rp
-	addp4	r15=0,r33		// ap
-#else
-	mov	r14=r32			// rp
-	mov	r15=r33			// ap
-#endif
+{ .mmi;	ADDP	r14=0,r32		// rp
+	ADDP	r15=0,r33		// ap
 	mov	ar.lc=r10	}
-{ .mii;	mov	r40=0			// serves as r35 at first (p27)
+{ .mmi;	mov	r40=0			// serves as r35 at first (p27)
 	mov	ar.ec=13	};;

 // This loop spins in 2*(n+12) ticks. It's scheduled for data in Itanium
@@ -424,89 +394,64 @@ bn_mul_words:
 .global	bn_mul_add_words#
 .proc	bn_mul_add_words#
 .align	64
-//.skip	0	// makes the loop split at 64-byte boundary
+.skip	48	// makes the loop body aligned at 64-byte boundary
 bn_mul_add_words:
 	.prologue
 	.fframe	0
 	.save	ar.pfs,r2
-{ .mii;	alloc	r2=ar.pfs,4,12,0,16
-	cmp4.le	p6,p0=r34,r0	};;
-{ .mfb;	mov	r8=r0			// return value
-(p6)	br.ret.spnt.many	b0	};;
-	.save	ar.lc,r3
-{ .mii;	sub	r10=r34,r0,1
-	mov	r3=ar.lc
-	mov	r9=pr		};;
+	.save	pr,r9
+{ .mmi;	alloc	r2=ar.pfs,4,4,0,8
+	cmp4.le	p6,p0=r34,r0
+	mov	r3=ar.lc	};;
+{ .mib;	mov	r8=r0			// return value
+	sub	r10=r34,r0,1
+(p6)	br.ret.spnt.many	b0	};;
 	.body
-{ .mib;	setf.sig	f8=r35		// w
-	mov	pr.rot=0x800001<<16
-	// ------^----- serves as (p50) at first (p27)
+{ .mib;	setf.sig	f8=r35		// w
+	mov	r9=pr
 	brp.loop.imp	.L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
 	}
-{ .mii;
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-	addp4	r14=0,r32		// rp
-	addp4	r15=0,r33		// ap
-#else
-	mov	r14=r32			// rp
-	mov	r15=r33			// ap
-#endif
+{ .mmi;	ADDP	r14=0,r32		// rp
+	ADDP	r15=0,r33		// ap
 	mov	ar.lc=r10	}
-{ .mii;	mov	r40=0			// serves as r35 at first (p27)
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-	addp4	r18=0,r32		// rp copy
-#else
-	mov	r18=r32			// rp copy
-#endif
-	mov	ar.ec=15	};;
-
-// This loop spins in 3*(n+14) ticks on Itanium and should spin in
-// 2*(n+14) on "wider" IA-64 implementations (to be verified with new
-// µ-architecture manuals as they become available). As usual it's
-// possible to compress the epilogue, down to 10 in this case, at the
-// cost of scalability. Compressed (and therefore non-scalable) loop
-// running at 3*(n+11) would buy you ~10% on Itanium but take ~35%
-// from "wider" IA-64 so let it be scalable! Special attention was
-// paid for having the loop body split at 64-byte boundary. ld8 is
-// scheduled for L1 cache as the data is more than likely there.
-// Indeed, bn_mul_words has put it there a moment ago:-)
+{ .mii;	ADDP	r16=0,r32		// rp copy
+	mov	pr.rot=0x2001<<16
+	// ------^----- serves as (p40) at first (p27)
+	mov	ar.ec=11	};;
+
+// This loop spins in 3*(n+10) ticks on Itanium and in 2*(n+10) on
+// Itanium 2. Yes, unlike previous versions it scales:-) Previous
+// version was performing *all* additions in IALU and was starving
+// for those even on Itanium 2. In this version one addition is
+// moved to FPU and is folded with multiplication. This is at the
+// cost of propagating the result from the previous call to this
+// subroutine to L2 cache... In other words negligible even for
+// shorter keys. *Overall* performance improvement [over previous
+// version] varies from 11 to 22 percent depending on key length.
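+//
+// For orientation, the word-level semantics this loop implements are
+// those of the generic C bn_mul_add_words (prototype as in the ppc.pl
+// comments further down); a minimal sketch, assuming a 128-bit integer
+// type is available for the 64x64-bit products:
+//
+//	BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap,
+//				  int num, BN_ULONG w)
+//	{
+//		BN_ULONG c = 0;				// running carry
+//		if (num <= 0) return 0;			// as the asm's early return
+//		while (num--) {
+//			unsigned __int128 t;
+//			t  = (unsigned __int128)(*ap++) * w; // 64x64->128
+//			t += *rp;			// accumulate rp[i]
+//			t += c;				// add previous carry
+//			*rp++ = (BN_ULONG)t;		// store low word
+//			c = (BN_ULONG)(t >> 64);	// high word carries on
+//		}
+//		return c;	// final carry, returned in r8 above
+//	}
+//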
.L_bn_mul_add_words_ctop:
-{ .mfi;	(p25)	getf.sig	r36=f52		// low
-	(p21)	xmpy.lu		f48=f37,f8
-	(p28)	cmp.ltu		p54,p50=r41,r39	}
-{ .mfi;	(p16)	ldf8		f32=[r15],8
-	(p21)	xmpy.hu		f40=f37,f8
-	(p28)	add		r45=r45,r41	};;
-{ .mii;	(p25)	getf.sig	r32=f44		// high
-	.pred.rel	"mutex",p50,p54
-	(p50)	add		r40=r38,r35	// (p27)
-	(p54)	add		r40=r38,r35,1	}	// (p27)
-{ .mfb;	(p28)	cmp.ltu.unc	p60,p0=r45,r41
-	(p0)	nop.f		0x0
-	(p0)	nop.b		0x0	}
-{ .mii;	(p27)	ld8		r44=[r18],8
-	(p62)	cmp.eq.or	p61,p0=-1,r46
-	(p62)	add		r46=1,r46	}
-{ .mfb;	(p30)	st8		[r14]=r47,8
-	(p0)	nop.f		0x0
+.pred.rel	"mutex",p40,p42
+{ .mfi;	(p23)	getf.sig	r36=f45			// low
+	(p20)	xma.lu		f42=f36,f8,f50		// low
+(p40)	add		r39=r39,r35	}		// (p27)
+{ .mfi;	(p16)	ldf8		f32=[r15],8		// *(ap++)
+	(p20)	xma.hu		f36=f36,f8,f50		// high
+(p42)	add		r39=r39,r35,1	};;		// (p27)
+{ .mmi;	(p24)	getf.sig	r32=f40			// high
+	(p16)	ldf8		f46=[r16],8		// *(rp1++)
+(p40)	cmp.ltu		p41,p39=r39,r35	}		// (p27)
+{ .mib;	(p26)	st8		[r14]=r39,8		// *(rp2++)
+(p42)	cmp.leu		p41,p39=r39,r35			// (p27)
 	br.ctop.sptk	.L_bn_mul_add_words_ctop};;
 .L_bn_mul_add_words_cend:
-{ .mii;	nop.m	0x0
-.pred.rel	"mutex",p53,p57
-(p53)	add	r8=r38,r0
-(p57)	add	r8=r38,r0,1	}
-{ .mfb;	nop.m	0x0
-	nop.f	0x0
-	nop.b	0x0	};;
-{ .mii;
-(p63)	add	r8=1,r8
-	mov	pr=r9,0x1ffff
-	mov	ar.lc=r3	}
-{ .mfb;	rum	1<<5		// clear um.mfh
-	nop.f	0x0
+{ .mmi;	.pred.rel	"mutex",p40,p42
+(p40)	add	r8=r35,r0
+(p42)	add	r8=r35,r0,1
+	mov	pr=r9,0x1ffff	}
+{ .mib;	rum	1<<5		// clear um.mfh
+	mov	ar.lc=r3
 	br.ret.sptk.many	b0	};;
 .endp	bn_mul_add_words#
 #endif
@@ -527,7 +472,8 @@ bn_sqr_words:
 	sxt4	r34=r34		};;
 { .mii;	cmp.le	p6,p0=r34,r0
 	mov	r8=r0		}	// return value
-{ .mfb;	nop.f	0x0
+{ .mfb;	ADDP	r32=0,r32
+	nop.f	0x0
 (p6)	br.ret.spnt.many	b0	};;
 	.save	ar.lc,r3
@@ -536,11 +482,7 @@ bn_sqr_words:
 	mov	r9=pr		};;
 	.body
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
-{ .mii;	addp4	r32=0,r32
-	addp4	r33=0,r33	};;
-#endif
-{ .mib;
+{ .mib;	ADDP	r33=0,r33
 	mov	pr.rot=1<<16
 	brp.loop.imp	.L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
 	}
@@ -605,7 +547,7 @@ bn_sqr_comba8:
 	.prologue
 	.fframe	0
 	.save	ar.pfs,r2
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,2,1,0,0
 	addp4	r33=0,r33
 	addp4	r32=0,r32	};;
@@ -631,6 +573,10 @@ bn_sqr_comba8:
 // clause in Itanium µ-architecture manual? Comments are welcomed and
 // highly appreciated.
 //
+// On Itanium 2 it takes ~190 ticks. This is because of stalls on
+// result from getf.sig. I do nothing about it at this point for
+// reasons depicted below.
+//
 // However! It should be noted that even 160 ticks is darn good result
 // as it's over 10 (yes, ten, spelled as t-e-n) times faster than the
 // C version (compiled with gcc with inline assembler). I really
@@ -673,7 +619,7 @@ bn_mul_comba8:
 	.prologue
 	.fframe	0
 	.save	ar.pfs,r2
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,3,0,0,0
 	addp4	r33=0,r33
 	addp4	r34=0,r34	};;
@@ -1231,7 +1177,7 @@ bn_sqr_comba4:
 	.prologue
 	.fframe	0
 	.save	ar.pfs,r2
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,2,1,0,0
 	addp4	r32=0,r32
 	addp4	r33=0,r33	};;
@@ -1264,7 +1210,7 @@ bn_mul_comba4:
 	.prologue
 	.fframe	0
 	.save	ar.pfs,r2
-#if defined(_HPUX_SOURCE) && defined(_ILP32)
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
 { .mii;	alloc	r2=ar.pfs,3,0,0,0
 	addp4	r33=0,r33
 	addp4	r34=0,r34	};;
@@ -1448,8 +1394,8 @@ bn_mul_comba4:
 #define	I	r21

 #if 0
-// Some preprocessors (most notably HP-UX) apper to be allergic to
-// macros enclosed to parenthesis as these three will be.
+// Some preprocessors (most notably HP-UX) appear to be allergic to
+// macros enclosed in parentheses [as these three were].
 #define	cont	p16
 #define	break	p0	// p20
 #define	equ	p24
@@ -1581,9 +1527,18 @@ bn_div_words:
 // output:	f8 = (int)(a/b)
 // clobbered:	f8,f9,f10,f11,pred pred=p15
-// This procedure is essentially Intel code and therefore is
-// copyrighted to Intel Corporation (I suppose...). It's sligtly
-// modified for specific needs.
+// One can argue that this snippet is copyrighted to Intel
+// Corporation, as it's essentially identical to one of those
+// found in the "Divide, Square Root and Remainder" section at
+// http://www.intel.com/software/products/opensource/libraries/num.htm.
+// Yes, I admit that the referred code was used as a template, but
+// only after I realized that there is hardly any other instruction
+// sequence which would perform this operation. I mean, I figure that
+// any independent attempt to implement high-performance division
+// will result in code virtually identical to the Intel code. It
+// should be noted though that the division kernel below is 1 cycle
+// faster than the Intel one (note the commented splits:-), not to
+// mention the original prologue (or rather the lack of one) and epilogue.
 .align	32
 .skip	16
 .L_udiv64_32_b6:
diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl
new file mode 100644
index 0000000000..307c7ccb35
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/ppc.pl
@@ -0,0 +1,2081 @@
+#!/usr/bin/env perl
+#
+# Implemented as a Perl wrapper as we want to support several different
+# architectures with a single file. We pick up the target based on the
+# file name we are asked to generate.
+#
+# It should be noted though that this perl code is nothing like
+# /crypto/perlasm/x86*. In this case perl is used pretty much
+# as a pre-processor to cover platform differences in name decoration,
+# linker tables, 32-/64-bit instruction sets...
+#
+# As you might know there are several PowerPC ABIs in use. Most notably
+# Linux and AIX use different 32-bit ABIs. The good news is that these
+# ABIs are similar enough to implement leaf(!) functions, which would be
+# ABI neutral. And that's what you find here: ABI neutral leaf functions.
+# In case you wonder what that is...
+#
+#	AIX performance
+#
+#	MEASUREMENTS WITH cc ON a 200 MHz PowerPC 604e.
+#
+#	The following is the performance of 32-bit compiler
+#	generated code:
+#
+#	OpenSSL 0.9.6c 21 dec 2001
+#	built on: Tue Jun 11 11:06:51 EDT 2002
+#	options:bn(64,32) ...
+#compiler: cc -DTHREADS  -DAIX -DB_ENDIAN -DBN_LLONG -O3
+#                  sign    verify    sign/s verify/s
+#rsa  512 bits   0.0098s   0.0009s    102.0   1170.6
+#rsa 1024 bits   0.0507s   0.0026s     19.7    387.5
+#rsa 2048 bits   0.3036s   0.0085s      3.3    117.1
+#rsa 4096 bits   2.0040s   0.0299s      0.5     33.4
+#dsa  512 bits   0.0087s   0.0106s    114.3     94.5
+#dsa 1024 bits   0.0256s   0.0313s     39.0     32.0
+#
+#	Same benchmark with this assembler code:
+#
+#rsa  512 bits   0.0056s   0.0005s    178.6   2049.2
+#rsa 1024 bits   0.0283s   0.0015s     35.3    674.1
+#rsa 2048 bits   0.1744s   0.0050s      5.7    201.2
+#rsa 4096 bits   1.1644s   0.0179s      0.9     55.7
+#dsa  512 bits   0.0052s   0.0062s    191.6    162.0
+#dsa 1024 bits   0.0149s   0.0180s     67.0     55.5
+#
+#	Number of operations increases by almost 75%
+#
+#	Here are performance numbers for 64-bit compiler
+#	generated code:
+#
+#	OpenSSL 0.9.6g [engine] 9 Aug 2002
+#	built on: Fri Apr 18 16:59:20 EDT 2003
+#	options:bn(64,64) ...
+#	compiler: cc -DTHREADS -D_REENTRANT -q64 -DB_ENDIAN -O3
+#                  sign    verify    sign/s verify/s
+#rsa  512 bits   0.0028s   0.0003s    357.1   3844.4
+#rsa 1024 bits   0.0148s   0.0008s     67.5   1239.7
+#rsa 2048 bits   0.0963s   0.0028s     10.4    353.0
+#rsa 4096 bits   0.6538s   0.0102s      1.5     98.1
+#dsa  512 bits   0.0026s   0.0032s    382.5    313.7
+#dsa 1024 bits   0.0081s   0.0099s    122.8    100.6
+#
+#	Same benchmark with this assembler code:
+#
+#rsa  512 bits   0.0020s   0.0002s    510.4   6273.7
+#rsa 1024 bits   0.0088s   0.0005s    114.1   2128.3
+#rsa 2048 bits   0.0540s   0.0016s     18.5    622.5
+#rsa 4096 bits   0.3700s   0.0058s      2.7    171.0
+#dsa  512 bits   0.0016s   0.0020s    610.7    507.1
+#dsa 1024 bits   0.0047s   0.0058s    212.5    173.2
+#
+#	Again, performance increases by about 75%
+#
+#	Mac OS X, Apple G5 1.8GHz (Note this is 32 bit code)
+#	OpenSSL 0.9.7c 30 Sep 2003
+#
+#	Original code.
+#
+#rsa  512 bits   0.0011s   0.0001s    906.1  11012.5
+#rsa 1024 bits   0.0060s   0.0003s    166.6   3363.1
+#rsa 2048 bits   0.0370s   0.0010s     27.1    982.4
+#rsa 4096 bits   0.2426s   0.0036s      4.1    280.4
+#dsa  512 bits   0.0010s   0.0012s   1038.1    841.5
+#dsa 1024 bits   0.0030s   0.0037s    329.6    269.7
+#dsa 2048 bits   0.0101s   0.0127s     98.9     78.6
+#
+#	Same benchmark with this assembler code:
+#
+#rsa  512 bits   0.0007s   0.0001s   1416.2  16645.9
+#rsa 1024 bits   0.0036s   0.0002s    274.4   5380.6
+#rsa 2048 bits   0.0222s   0.0006s     45.1   1589.5
+#rsa 4096 bits   0.1469s   0.0022s      6.8    449.6
+#dsa  512 bits   0.0006s   0.0007s   1664.2   1376.2
+#dsa 1024 bits   0.0018s   0.0023s    545.0    442.2
+#dsa 2048 bits   0.0061s   0.0075s    163.5    132.8
+#
+#	Performance increase of ~60%
+#
+#	If you have comments or suggestions to improve the code send
+#	me a note at schari@us.ibm.com
+#
+
+$opf = shift;
+
+if ($opf =~ /32\.s/) {
+	$BITS=	32;
+	$BNSZ=	$BITS/8;
+	$ISA=	"\"ppc\"";
+
+	$LD=	"lwz";		# load
+	$LDU=	"lwzu";		# load and update
+	$ST=	"stw";		# store
+	$STU=	"stwu";		# store and update
+	$UMULL=	"mullw";	# unsigned multiply low
+	$UMULH=	"mulhwu";	# unsigned multiply high
+	$UDIV=	"divwu";	# unsigned divide
+	$UCMPI=	"cmplwi";	# unsigned compare with immediate
+	$UCMP=	"cmplw";	# unsigned compare
+	$COUNTZ="cntlzw";	# count leading zeros
+	$SHL=	"slw";		# shift left
+	$SHR=	"srw";		# unsigned shift right
+	$SHRI=	"srwi";		# unsigned shift right by immediate
+	$SHLI=	"slwi";		# shift left by immediate
+	$CLRU=	"clrlwi";	# clear upper bits
+	$INSR=	"insrwi";	# insert right
+	$ROTL=	"rotlwi";	# rotate left by immediate
+} elsif ($opf =~ /64\.s/) {
+	$BITS=	64;
+	$BNSZ=	$BITS/8;
+	$ISA=	"\"ppc64\"";
+
+	# same as above, but 64-bit mnemonics...
+	$LD=	"ld";		# load
+	$LDU=	"ldu";		# load and update
+	$ST=	"std";		# store
+	$STU=	"stdu";		# store and update
+	$UMULL=	"mulld";	# unsigned multiply low
+	$UMULH=	"mulhdu";	# unsigned multiply high
+	$UDIV=	"divdu";	# unsigned divide
+	$UCMPI=	"cmpldi";	# unsigned compare with immediate
+	$UCMP=	"cmpld";	# unsigned compare
+	$COUNTZ="cntlzd";	# count leading zeros
+	$SHL=	"sld";		# shift left
+	$SHR=	"srd";		# unsigned shift right
+	$SHRI=	"srdi";		# unsigned shift right by immediate
+	$SHLI=	"sldi";		# shift left by immediate
+	$CLRU=	"clrldi";	# clear upper bits
+	$INSR=	"insrdi";	# insert right
+	$ROTL=	"rotldi";	# rotate left by immediate
+} else { die "nonsense $opf"; }
+
+( defined shift || open STDOUT,">$opf" ) || die "can't open $opf: $!";
+
+# function entry points from the AIX code
+#
+# There are other, more elegant, ways to handle this. We (IBM) chose
+# this approach as it plays well with scripts we run to 'namespace'
+# OpenSSL, i.e. we add a prefix to all the public symbols so we can
+# co-exist in the same process with other implementations of OpenSSL.
+# 'Cleverer' ways of doing these substitutions tend to hide data we
+# need to be obvious.
+#
+my @items = ("bn_sqr_comba4",
+	     "bn_sqr_comba8",
+	     "bn_mul_comba4",
+	     "bn_mul_comba8",
+	     "bn_sub_words",
+	     "bn_add_words",
+	     "bn_div_words",
+	     "bn_sqr_words",
+	     "bn_mul_words",
+	     "bn_mul_add_words");
+
+if    ($opf =~ /linux/)	{  do_linux();	}
+elsif ($opf =~ /aix/)	{  do_aix();	}
+elsif ($opf =~ /osx/)	{  do_osx();	}
+else			{  do_bsd();	}
+
+sub do_linux {
+    $d=&data();
+
+    if ($BITS==64) {
+      foreach $t (@items) {
+        $d =~ s/\.$t:/\
+\t.section\t".opd","aw"\
+\t.align\t3\
+\t.globl\t$t\
+$t:\
+\t.quad\t.$t,.TOC.\@tocbase,0\
+\t.size\t$t,24\
+\t.previous\n\
+\t.type\t.$t,\@function\
+\t.globl\t.$t\
+.$t:/g;
+      }
+    }
+    else {
+      foreach $t (@items) {
+        $d=~s/\.$t/$t/g;
+      }
+    }
+    # hide internal labels to avoid pollution of name table...
+    $d=~s/Lppcasm_/.Lppcasm_/gm;
+    print $d;
+}
+
+sub do_aix {
+    # AIX assembler is smart enough to please the linker without
+    # making us do something special...
+    print &data();
+}
+
+# MacOSX 32 bit
+sub do_osx {
+    $d=&data();
+    # Change the bn symbol prefix from '.' to '_'
+    foreach $t (@items) {
+      $d=~s/\.$t/_$t/g;
+    }
+    # Change .machine to something OS X asm will accept
+    $d=~s/\.machine.*/.text/g;
+    $d=~s/\#/;/g;	# change comment from '#' to ';'
+    print $d;
+}
+
+# BSD (Untested)
+sub do_bsd {
+    $d=&data();
+    foreach $t (@items) {
+      $d=~s/\.$t/_$t/g;
+    }
+    print $d;
+}
+
+sub data {
+	local($data)=<<EOF;
+	subfc.	r7,r0,r6	# If r6 is 0 then result is 0.
+				# if r6 > 0 then result !=0
+				# In either case carry bit is set.
+	bc	BO_IF,CR0_EQ,Lppcasm_sub_adios
+	addi	r4,r4,-$BNSZ
+	addi	r3,r3,-$BNSZ
+	addi	r5,r5,-$BNSZ
+	mtctr	r6
+Lppcasm_sub_mainloop:
+	$LDU	r7,$BNSZ(r4)
+	$LDU	r8,$BNSZ(r5)
+	subfe	r6,r8,r7	# r6 = r7+carry bit + onescomplement(r8)
+				# if carry = 1 this is r7-r8. Else it
+				# is r7-r8 -1 as we need.
+	$STU	r6,$BNSZ(r3)
+	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_sub_mainloop
+Lppcasm_sub_adios:
+	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
+	andi.	r3,r3,1		# keep only last bit.
+	bclr	BO_ALWAYS,CR0_LT
+	.long	0x00000000
+
+
+#
+#	NOTE:	The following label name should be changed to
+#		"bn_add_words" i.e. remove the first dot
+#		for the gcc compiler. This should be automatically
+#		done in the build
+#
+
+.align	4
+.bn_add_words:
+#
+#	Handcoded version of bn_add_words
+#
+#BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
+#
+#	r3 = r
+#	r4 = a
+#	r5 = b
+#	r6 = n
+#
+#	Note:	No loop unrolling done since this is not a performance
+#		critical loop.
+
+	xor	r0,r0,r0
+#
+#	check for r6 = 0. Is this needed?
+#
+	addic.	r6,r6,0		#test r6 and clear carry bit.
+	bc	BO_IF,CR0_EQ,Lppcasm_add_adios
+	addi	r4,r4,-$BNSZ
+	addi	r3,r3,-$BNSZ
+	addi	r5,r5,-$BNSZ
+	mtctr	r6
+Lppcasm_add_mainloop:
+	$LDU	r7,$BNSZ(r4)
+	$LDU	r8,$BNSZ(r5)
+	adde	r8,r7,r8
+	$STU	r8,$BNSZ(r3)
+	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_add_mainloop
+Lppcasm_add_adios:
+	addze	r3,r0		#return carry bit.
+	bclr	BO_ALWAYS,CR0_LT
+	.long	0x00000000
+
+#
+#	NOTE:	The following label name should be changed to
+#		"bn_div_words" i.e. remove the first dot
+#		for the gcc compiler. This should be automatically
+#		done in the build
+#
+
+.align	4
+.bn_div_words:
+#
+#	This is a cleaned up version of code generated by
+#	the AIX compiler. The only optimization is to use
+#	the PPC instruction to count leading zeros instead
+#	of call to num_bits_word. Since this was compiled
+#	only at level -O2 we can possibly squeeze it more?
+#
+#	r3 = h
+#	r4 = l
+#	r5 = d
+
+	$UCMPI	0,r5,0			# compare r5 and 0
+	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div1	# proceed if d!=0
+	li	r3,-1			# d=0 return -1
+	bclr	BO_ALWAYS,CR0_LT
+Lppcasm_div1:
+	xor	r0,r0,r0		#r0=0
+	$COUNTZ	r7,r5			#r7 = num leading 0s in d.
+	subfic	r8,r7,$BITS		#r8 = BN_num_bits_word(d)
+	cmpi	0,0,r8,$BITS		#
+	bc	BO_IF,CR0_EQ,Lppcasm_div2	#proceed if (r8==$BITS)
+	li	r9,1			# r9=1
+	$SHL	r10,r9,r8		# r9<<=r8
+	$UCMP	0,r3,r10		#
+	bc	BO_IF,CR0_GT,Lppcasm_div2	#or if (h > (1<<r8))
+Lppcasm_div2:
+	$UCMP	0,r3,r5			#h>=d?
+	bc	BO_IF,CR0_LT,Lppcasm_div3	#goto Lppcasm_div3 if not
+	subf	r3,r5,r3		#h-=d ;
+Lppcasm_div3:				#r7 = BN_BITS2-i. so r7=i
+	cmpi	0,0,r7,0		# is (i == 0)?
+	bc	BO_IF,CR0_EQ,Lppcasm_div4
+	$SHL	r3,r3,r7		# h = (h<< i)
+	$SHR	r8,r4,r8		# r8 = (l >> BN_BITS2 -i)
+	$SHL	r5,r5,r7		# d<<=i
+	or	r3,r3,r8		# h = (h<<i)|(l>>(BN_BITS2-i))
+	$SHL	r4,r4,r7		# l <<=i
+Lppcasm_div4:
+	$SHRI	r9,r5,`$BITS/2`		# r9 = dh
+					# dl will be computed when needed
+					# as it saves registers.
+	li	r6,2			#r6=2
+	mtctr	r6			#counter will be in count.
+Lppcasm_divouterloop:
+	$SHRI	r8,r3,`$BITS/2`		#r8 = (h>>BN_BITS4)
+	$SHRI	r11,r4,`$BITS/2`	#r11= (l&BN_MASK2h)>>BN_BITS4
+					# compute here for innerloop.
+	$UCMP	0,r8,r9			# is (h>>BN_BITS4)==dh
+	bc	BO_IF_NOT,CR0_EQ,Lppcasm_div5	# goto Lppcasm_div5 if not
+
+	li	r8,-1
+	$CLRU	r8,r8,`$BITS/2`		#q = BN_MASK2l
+	b	Lppcasm_div6
+Lppcasm_div5:
+	$UDIV	r8,r3,r9		#q = h/dh
+Lppcasm_div6:
+	$UMULL	r12,r9,r8		#th = q*dh
+	$CLRU	r10,r5,`$BITS/2`	#r10=dl
+	$UMULL	r6,r8,r10		#tl = q*dl
+
+Lppcasm_divinnerloop:
+	subf	r10,r12,r3		#t = h -th
+	$SHRI	r7,r10,`$BITS/2`	#r7= (t &BN_MASK2H), sort of...
+	addic.	r7,r7,0			#test if r7 == 0. used below.
+					# now want to compute
+					# r7 = (t<<BN_BITS4)|(l>>BN_BITS4)
+					# the following 2 instructions do that
+	$SHLI	r7,r10,`$BITS/2`	# r7 = (t<<BN_BITS4)
+	$INSR	r7,r11,`$BITS/2`,`$BITS/2`	# r7|=((l&BN_MASK2h)>>BN_BITS4)
+	$UCMP	1,r6,r7			# compare (tl <= r7)
+	bc	BO_IF_NOT,CR0_EQ,Lppcasm_divinnerexit
+	bc	BO_IF_NOT,CR1_FEX,Lppcasm_divinnerexit
+	addi	r8,r8,-1		#q--
+	subf	r12,r9,r12		#th -=dh
+	$CLRU	r10,r5,`$BITS/2`	#r10=dl. t is no longer needed in loop.
+	subf	r6,r10,r6	#tl -=dl
+	b	Lppcasm_divinnerloop
+Lppcasm_divinnerexit:
+	$SHRI	r10,r6,`$BITS/2`	#t=(tl>>BN_BITS4)
+	$SHLI	r11,r6,`$BITS/2`	#tl=(tl<<BN_BITS4)&BN_MASK2. so is r11
+	$UCMP	1,r4,r11		#compare l and tl
+	add	r12,r12,r10		#th+=t
+	bc	BO_IF_NOT,CR1_FX,Lppcasm_div7	#if (l>=tl) goto Lppcasm_div7
+	addi	r12,r12,1		# th++
+Lppcasm_div7:
+	subf	r11,r11,r4		#r11=l-tl
+	$UCMP	1,r3,r12		#compare h and th
+	bc	BO_IF_NOT,CR1_FX,Lppcasm_div8	#if (h>=th) goto Lppcasm_div8
+	addi	r8,r8,-1		# q--
+	add	r3,r5,r3		# h+=d
+Lppcasm_div8:
+	subf	r12,r12,r3		#r12 = h-th
+	$SHLI	r4,r11,`$BITS/2`	#l=(l&BN_MASK2l)<<BN_BITS4
+					# want to compute
+					# r3 = ((h<<BN_BITS4)|(l>>BN_BITS4))&BN_MASK2
+					# the following 2 instructions will do this.
+	$INSR	r11,r12,`$BITS/2`,`$BITS/2`	# r11 is the value we want rotated $BITS/2.
+	$ROTL	r3,r11,`$BITS/2`	# rotate by $BITS/2 and store in r3
+	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_div9	#if (count==0) break ;
+	$SHLI	r0,r8,`$BITS/2`		#ret =q<<BN_BITS4
+	b	Lppcasm_divouterloop
+Lppcasm_div9:
+	or	r3,r8,r0
+	bclr	BO_ALWAYS,CR0_LT
+	.long	0x00000000
+
+#
+#	NOTE:	The following label name should be changed to
+#		"bn_mul_words" i.e. remove the first dot
+#		for the gcc compiler. This should be automatically
+#		done in the build
+#
+
+.align	4
+.bn_mul_words:
+#
+# BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+#
+# r3 = rp
+# r4 = ap
+# r5 = num
+# r6 = w
+	xor	r0,r0,r0
+	xor	r12,r12,r12		# used for carry
+	rlwinm.	r7,r5,30,2,31		# num >> 2
+	bc	BO_IF,CR0_EQ,Lppcasm_mw_REM
+	mtctr	r7
+Lppcasm_mw_LOOP:
+	#mul(rp[0],ap[0],w,c1);
+	$LD	r8,`0*$BNSZ`(r4)
+	$UMULL	r9,r6,r8
+	$UMULH	r10,r6,r8
+	addc	r9,r9,r12
+	#addze	r10,r10		#carry is NOT ignored.
+				#will be taken care of
+				#in second spin below
+				#using adde.
+	$ST	r9,`0*$BNSZ`(r3)
+	#mul(rp[1],ap[1],w,c1);
+	$LD	r8,`1*$BNSZ`(r4)
+	$UMULL	r11,r6,r8
+	$UMULH	r12,r6,r8
+	adde	r11,r11,r10
+	#addze	r12,r12
+	$ST	r11,`1*$BNSZ`(r3)
+	#mul(rp[2],ap[2],w,c1);
+	$LD	r8,`2*$BNSZ`(r4)
+	$UMULL	r9,r6,r8
+	$UMULH	r10,r6,r8
+	adde	r9,r9,r12
+	#addze	r10,r10
+	$ST	r9,`2*$BNSZ`(r3)
+	#mul_add(rp[3],ap[3],w,c1);
+	$LD	r8,`3*$BNSZ`(r4)
+	$UMULL	r11,r6,r8
+	$UMULH	r12,r6,r8
+	adde	r11,r11,r10
+	addze	r12,r12		#this spin we collect carry into
+				#r12
+	$ST	r11,`3*$BNSZ`(r3)
+
+	addi	r3,r3,`4*$BNSZ`
+	addi	r4,r4,`4*$BNSZ`
+	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_mw_LOOP
+
+Lppcasm_mw_REM:
+	andi.	r5,r5,0x3
+	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
+	#mul(rp[0],ap[0],w,c1);
+	$LD	r8,`0*$BNSZ`(r4)
+	$UMULL	r9,r6,r8
+	$UMULH	r10,r6,r8
+	addc	r9,r9,r12
+	addze	r10,r10
+	$ST	r9,`0*$BNSZ`(r3)
+	addi	r12,r10,0
+
+	addi	r5,r5,-1
+	cmpli	0,0,r5,0
+	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
+
+
+	#mul(rp[1],ap[1],w,c1);
+	$LD	r8,`1*$BNSZ`(r4)
+	$UMULL	r9,r6,r8
+	$UMULH	r10,r6,r8
+	addc	r9,r9,r12
+	addze	r10,r10
+	$ST	r9,`1*$BNSZ`(r3)
+	addi	r12,r10,0
+
+	addi	r5,r5,-1
+	cmpli	0,0,r5,0
+	bc	BO_IF,CR0_EQ,Lppcasm_mw_OVER
+
+	#mul_add(rp[2],ap[2],w,c1);
+	$LD	r8,`2*$BNSZ`(r4)
+	$UMULL	r9,r6,r8
+	$UMULH	r10,r6,r8
+	addc	r9,r9,r12
+	addze	r10,r10
+	$ST	r9,`2*$BNSZ`(r3)
+	addi	r12,r10,0
+
+Lppcasm_mw_OVER:
+	addi	r3,r12,0
+	bclr	BO_ALWAYS,CR0_LT
+	.long	0x00000000
+
+#
+#	NOTE:	The following label name should be changed to
+#		"bn_mul_add_words" i.e. remove the first dot
+#		for the gcc compiler. This should be automatically
+#		done in the build
+#
+
+.align	4
+.bn_mul_add_words:
+#
+# BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
+#
+# r3 = rp
+# r4 = ap
+# r5 = num
+# r6 = w
+#
+# empirical evidence suggests that unrolled version performs best!!
+#
+	xor	r0,r0,r0		#r0 = 0
+	xor	r12,r12,r12		#r12 = 0 . used for carry
+	rlwinm.	r7,r5,30,2,31		# num >> 2
+	bc	BO_IF,CR0_EQ,Lppcasm_maw_leftover	# if (num < 4) go LPPCASM_maw_leftover
+	mtctr	r7
+Lppcasm_maw_mainloop:
+	#mul_add(rp[0],ap[0],w,c1);
+	$LD	r8,`0*$BNSZ`(r4)
+	$LD	r11,`0*$BNSZ`(r3)
+	$UMULL	r9,r6,r8
+	$UMULH	r10,r6,r8
+	addc	r9,r9,r12	#r12 is carry.
+	addze	r10,r10
+	addc	r9,r9,r11
+	#addze	r10,r10
+				#the above instruction addze
+				#is NOT needed. Carry will NOT
+				#be ignored. It's not affected
+				#by multiply and will be collected
+				#in the next spin
+	$ST	r9,`0*$BNSZ`(r3)
+
+	#mul_add(rp[1],ap[1],w,c1);
+	$LD	r8,`1*$BNSZ`(r4)
+	$LD	r9,`1*$BNSZ`(r3)
+	$UMULL	r11,r6,r8
+	$UMULH	r12,r6,r8
+	adde	r11,r11,r10	#r10 is carry.
+	addze	r12,r12
+	addc	r11,r11,r9
+	#addze	r12,r12
+	$ST	r11,`1*$BNSZ`(r3)
+
+	#mul_add(rp[2],ap[2],w,c1);
+	$LD	r8,`2*$BNSZ`(r4)
+	$UMULL	r9,r6,r8
+	$LD	r11,`2*$BNSZ`(r3)
+	$UMULH	r10,r6,r8
+	adde	r9,r9,r12
+	addze	r10,r10
+	addc	r9,r9,r11
+	#addze	r10,r10
+	$ST	r9,`2*$BNSZ`(r3)
+
+	#mul_add(rp[3],ap[3],w,c1);
+	$LD	r8,`3*$BNSZ`(r4)
+	$UMULL	r11,r6,r8
+	$LD	r9,`3*$BNSZ`(r3)
+	$UMULH	r12,r6,r8
+	adde	r11,r11,r10
+	addze	r12,r12
+	addc	r11,r11,r9
+	addze	r12,r12
+	$ST	r11,`3*$BNSZ`(r3)
+	addi	r3,r3,`4*$BNSZ`
+	addi	r4,r4,`4*$BNSZ`
+	bc	BO_dCTR_NZERO,CR0_EQ,Lppcasm_maw_mainloop
+
+Lppcasm_maw_leftover:
+	andi.	r5,r5,0x3
+	bc	BO_IF,CR0_EQ,Lppcasm_maw_adios
+	addi	r3,r3,-$BNSZ
+	addi	r4,r4,-$BNSZ
+	#mul_add(rp[0],ap[0],w,c1);
+	mtctr	r5
+	$LDU	r8,$BNSZ(r4)
+	$UMULL	r9,r6,r8
+	$UMULH	r10,r6,r8
+	$LDU	r11,$BNSZ(r3)
+	addc	r9,r9,r11
+	addze	r10,r10
+	addc	r9,r9,r12
+	addze	r12,r10
+	$ST	r9,0(r3)
+
+	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+	#mul_add(rp[1],ap[1],w,c1);
+	$LDU	r8,$BNSZ(r4)
+	$UMULL	r9,r6,r8
+	$UMULH	r10,r6,r8
+	$LDU	r11,$BNSZ(r3)
+	addc	r9,r9,r11
+	addze	r10,r10
+	addc	r9,r9,r12
+	addze	r12,r10
+	$ST	r9,0(r3)
+
+	bc	BO_dCTR_ZERO,CR0_EQ,Lppcasm_maw_adios
+	#mul_add(rp[2],ap[2],w,c1);
+	$LDU	r8,$BNSZ(r4)
+	$UMULL	r9,r6,r8
+	$UMULH	r10,r6,r8
+	$LDU	r11,$BNSZ(r3)
+	addc	r9,r9,r11
+	addze	r10,r10
+	addc	r9,r9,r12
+	addze	r12,r10
+	$ST	r9,0(r3)
+
+Lppcasm_maw_adios:
+	addi	r3,r12,0
+	bclr	BO_ALWAYS,CR0_LT
+	.long	0x00000000
+	.align	4
+EOF
+	$data =~ s/\`([^\`]*)\`/eval $1/gem;
+
+	# if some assembler chokes on some simplified mnemonic,
+	# this is the spot to fix it up, e.g.:
+	# GNU as doesn't seem to accept cmplw, 32-bit unsigned compare
+	$data =~ s/^(\s*)cmplw(\s+)([^,]+),(.*)/$1cmpl$2$3,0,$4/gm;
+	# assembler X doesn't accept li, load immediate value
+	#$data =~ s/^(\s*)li(\s+)([^,]+),(.*)/$1addi$2$3,0,$4/gm;
+	return($data);
+}
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gcc.c b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
index 450e8e4322..7378344251 100644
--- a/src/lib/libcrypto/bn/asm/x86_64-gcc.c
+++ b/src/lib/libcrypto/bn/asm/x86_64-gcc.c
@@ -13,20 +13,42 @@
  * A. Well, that's because this code is basically a quick-n-dirty
  *    proof-of-concept hack. As you can see it's implemented with
  *    inline assembler, which means that you're bound to GCC and that
- *    there must be a room for fine-tuning.
+ *    there might be enough room for further improvement.
  *
  * Q. Why inline assembler?
- * A. x86_64 features own ABI I'm not familiar with. Which is why
- *    I decided to let the compiler take care of subroutine
- *    prologue/epilogue as well as register allocation.
+ * A. x86_64 features its own ABI, which I'm not familiar with. This is
+ *    why I decided to let the compiler take care of the subroutine
+ *    prologue/epilogue as well as register allocation. For reference:
+ *    Win64 implements a different ABI for AMD64 than Linux does.
 *
 * Q. How much faster does it get?
- * A. Unfortunately people sitting on x86_64 hardware are prohibited
- *    to disclose the performance numbers, so they (SuSE labs to be
- *    specific) wouldn't tell me. However! Very similar coding technique
- *    (reaching out for 128-bit result from 64x64-bit multiplication)
- *    results in >3 times performance improvement on MIPS and I see no
- *    reason why gain on x86_64 would be so much different:-)
+ * A. 'apps/openssl speed rsa dsa' output with no-asm:
+ *
+ *	                  sign    verify    sign/s verify/s
+ *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
+ *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
+ *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
+ *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
+ *	                  sign    verify    sign/s verify/s
+ *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
+ *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
+ *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
+ *
+ *    'apps/openssl speed rsa dsa' output with this module:
+ *
+ *	                  sign    verify    sign/s verify/s
+ *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
+ *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
+ *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
+ *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
+ *	                  sign    verify    sign/s verify/s
+ *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
+ *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
+ *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
+ *
+ *    For reference: the IA-32 assembler implementation performs
+ *    very much like 64-bit code compiled with no-asm on the same
+ *    machine.
 */

 #define BN_ULONG unsigned long
@@ -151,7 +173,7 @@ BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 }

 BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
-{ BN_ULONG ret,i;
+{ BN_ULONG ret=0,i=0;

	if (n <= 0) return 0;

@@ -164,7 +186,7 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
-	: "+a"(ret),"+c"(n),"+r"(i)
+	: "=&a"(ret),"+c"(n),"=&r"(i)
	: "r"(rp),"r"(ap),"r"(bp)
	: "cc"
	);
@@ -174,7 +196,7 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)

 #ifndef SIMICS
 BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
-{ BN_ULONG ret,i;
+{ BN_ULONG ret=0,i=0;

	if (n <= 0) return 0;

@@ -187,7 +209,7 @@ BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
	"	leaq	1(%2),%2	\n"
	"	loop	1b		\n"
	"	sbbq	%0,%0		\n"
-	: "+a"(ret),"+c"(n),"+r"(i)
+	: "=&a"(ret),"+c"(n),"=&r"(i)
	: "r"(rp),"r"(ap),"r"(bp)
	: "cc"
	);
@@ -318,7 +340,6 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)

 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 {
-	BN_ULONG bl,bh;
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

@@ -423,7 +444,6 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)

 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 {
-	BN_ULONG bl,bh;
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

@@ -464,7 +484,6 @@ void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)

 void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
 {
-	BN_ULONG bl,bh;
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;

@@ -541,7 +560,6 @@ void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)

 void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
 {
-	BN_ULONG bl,bh;
	BN_ULONG t1,t2;
	BN_ULONG c1,c2,c3;
--
cgit v1.2.3-55-g6feb
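
A note on the "+a"(ret) -> "=&a"(ret) constraint change in bn_add_words and
bn_sub_words above: "+" declares a read-write operand, so the compiler assumes
a meaningful initial value flows into the asm even though the template only
ever writes it; "=&" marks the operand write-only and earlyclobber, which also
keeps the register allocator from assigning any input operand to the same
register. A minimal self-contained sketch of the same pattern, assuming GCC
extended asm on x86-64 (the function name and the added "memory" clobber are
illustrative, not from the patch):

	typedef unsigned long ULONG;

	/* Word-array subtraction, returning the final borrow bit; the
	 * asm body mirrors the loop shown in the diff context above. */
	static ULONG sub_words_demo(ULONG *rp, const ULONG *ap,
	                            const ULONG *bp, int n)
	{
		ULONG ret = 0, i = 0;

		if (n <= 0) return 0;

		asm volatile (
		"	subq	%2,%2		\n"	/* i = 0, clears CF  */
		"1:	movq	(%4,%2,8),%0	\n"	/* ret = ap[i]       */
		"	sbbq	(%5,%2,8),%0	\n"	/* ret -= bp[i] + CF */
		"	movq	%0,(%3,%2,8)	\n"	/* rp[i] = ret       */
		"	leaq	1(%2),%2	\n"	/* i++, flags intact */
		"	loop	1b		\n"	/* --n, loop if != 0 */
		"	sbbq	%0,%0		\n"	/* ret = CF ? ~0 : 0 */
		: "=&r"(ret), "+c"(n), "=&r"(i)	/* write-only + earlyclobber:
						   no input may share these
						   registers */
		: "r"(rp), "r"(ap), "r"(bp)
		: "cc", "memory");		/* rp[] is written, so tell
						   the compiler about it */

		return ret & 1;			/* keep only the borrow bit */
	}

With the old "+a"/"+r" constraints the uninitialized ret and i were read-write
operands, i.e. undefined values fed into the asm; initializing them and
switching to "=&" fixes both problems at once.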