diff options
| author | cvs2svn <admin@example.com> | 2015-03-08 16:48:48 +0000 |
|---|---|---|
| committer | cvs2svn <admin@example.com> | 2015-03-08 16:48:48 +0000 |
| commit | da1a9ad3a4a867ba6569c05e6fca66d7f296c553 (patch) | |
| tree | 44872802e872bdfd60730fa9cf01d9d5751251c1 /src/lib/libcrypto/bn/asm/armv4-gf2m.pl | |
| parent | 973703db67a8e73d70e63afa8f2cde19da09144d (diff) | |
| download | openbsd-OPENBSD_5_7_BASE.tar.gz openbsd-OPENBSD_5_7_BASE.tar.bz2 openbsd-OPENBSD_5_7_BASE.zip | |
This commit was manufactured by cvs2git to create tag 'OPENBSD_5_7_BASE'.OPENBSD_5_7_BASE
Diffstat (limited to 'src/lib/libcrypto/bn/asm/armv4-gf2m.pl')
| -rw-r--r-- | src/lib/libcrypto/bn/asm/armv4-gf2m.pl | 278 |
1 files changed, 0 insertions, 278 deletions
diff --git a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl deleted file mode 100644 index c52e0b75b5..0000000000 --- a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl +++ /dev/null | |||
| @@ -1,278 +0,0 @@ | |||
| 1 | #!/usr/bin/env perl | ||
| 2 | # | ||
| 3 | # ==================================================================== | ||
| 4 | # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL | ||
| 5 | # project. The module is, however, dual licensed under OpenSSL and | ||
| 6 | # CRYPTOGAMS licenses depending on where you obtain it. For further | ||
| 7 | # details see http://www.openssl.org/~appro/cryptogams/. | ||
| 8 | # ==================================================================== | ||
| 9 | # | ||
| 10 | # May 2011 | ||
| 11 | # | ||
| 12 | # The module implements bn_GF2m_mul_2x2 polynomial multiplication | ||
| 13 | # used in bn_gf2m.c. It's kind of a low-hanging mechanical port from | ||
| 14 | # C for the time being... Except that it has two code paths: pure | ||
| 15 | # integer code suitable for any ARMv4 and later CPU and NEON code | ||
| 16 | # suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs | ||
| 17 | # in ~45 cycles on a dual-issue core such as Cortex A8, which is ~50% | ||
| 18 | # faster than compiler-generated code. For ECDH and ECDSA verify (but | ||
| 19 | # not for ECDSA sign) it means 25%-45% improvement depending on key | ||
| 20 | # length, more for longer keys. Even though NEON 1x1 multiplication | ||
| 21 | # runs in even fewer cycles, ~30, the improvement is measurable only on | ||
| 22 | # longer keys. One has to optimize code elsewhere to get NEON glow... | ||
| 23 | |||
| 24 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} # skip arguments until one looks like a file name (name.ext) | ||
| 25 | !$output or open(STDOUT,'>',$output) or die "can't open $output: $!"; # no file name found: keep writing to the original stdout | ||
| 26 | |||
| 27 | sub Dlo { my $reg=shift; $reg=~/q(1?[0-9])/ ? "d".($1*2) : ""; } # qN -> name of its low half d(2N), "" if no q register is found | ||
| 28 | sub Dhi { my $reg=shift; $reg=~/q(1?[0-9])/ ? "d".($1*2+1) : ""; } # qN -> name of its high half d(2N+1), "" if no q register is found | ||
| 29 | sub Q { my $reg=shift; $reg=~/d([1-3]?[02468])/ ? "q".($1/2) : ""; } # even dN -> name of the enclosing q(N/2), "" otherwise | ||
| 30 | |||
| 31 | $code=<<___; | ||
| 32 | #include "arm_arch.h" | ||
| 33 | |||
| 34 | .text | ||
| 35 | .code 32 | ||
| 36 | |||
| 37 | #if __ARM_ARCH__>=7 | ||
| 38 | .fpu neon | ||
| 39 | |||
| 40 | .type mul_1x1_neon,%function | ||
| 41 | .align 5 | ||
| 42 | mul_1x1_neon: | ||
| 43 | vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a | ||
| 44 | vmull.p8 `&Q("d0")`,d16,d17 @ a·bb | ||
| 45 | vshl.u64 `&Dlo("q2")`,d16,#16 | ||
| 46 | vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb | ||
| 47 | vshl.u64 `&Dlo("q3")`,d16,#24 | ||
| 48 | vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb | ||
| 49 | vshr.u64 `&Dlo("q1")`,#8 | ||
| 50 | vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb | ||
| 51 | vshl.u64 `&Dhi("q1")`,#24 | ||
| 52 | veor d0,`&Dlo("q1")` | ||
| 53 | vshr.u64 `&Dlo("q2")`,#16 | ||
| 54 | veor d0,`&Dhi("q1")` | ||
| 55 | vshl.u64 `&Dhi("q2")`,#16 | ||
| 56 | veor d0,`&Dlo("q2")` | ||
| 57 | vshr.u64 `&Dlo("q3")`,#24 | ||
| 58 | veor d0,`&Dhi("q2")` | ||
| 59 | vshl.u64 `&Dhi("q3")`,#8 | ||
| 60 | veor d0,`&Dlo("q3")` | ||
| 61 | veor d0,`&Dhi("q3")` | ||
| 62 | bx lr | ||
| 63 | .size mul_1x1_neon,.-mul_1x1_neon | ||
| 64 | #endif | ||
| 65 | ___ | ||
| 66 | ################ | ||
| 67 | # private interface to mul_1x1_ialu: operands arrive in $a (r1) and $b (r0), the product is left in $hi:$lo (r4:r5) | ||
| 68 | # the $a* table-building names alias the same r4-r9 as $hi..$i1; the caller presets $mask (r12) and reserves tab[8] at sp | ||
| 69 | $a="r1"; | ||
| 70 | $b="r0"; | ||
| 71 | |||
| 72 | ($a0,$a1,$a2,$a12,$a4,$a14)= | ||
| 73 | ($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12); | ||
| 74 | |||
| 75 | $mask="r12"; | ||
| 76 | |||
| 77 | $code.=<<___; | ||
| 78 | .type mul_1x1_ialu,%function | ||
| 79 | .align 5 | ||
| 80 | mul_1x1_ialu: | ||
| 81 | mov $a0,#0 | ||
| 82 | bic $a1,$a,#3<<30 @ a1=a&0x3fffffff | ||
| 83 | str $a0,[sp,#0] @ tab[0]=0 | ||
| 84 | add $a2,$a1,$a1 @ a2=a1<<1 | ||
| 85 | str $a1,[sp,#4] @ tab[1]=a1 | ||
| 86 | eor $a12,$a1,$a2 @ a1^a2 | ||
| 87 | str $a2,[sp,#8] @ tab[2]=a2 | ||
| 88 | mov $a4,$a1,lsl#2 @ a4=a1<<2 | ||
| 89 | str $a12,[sp,#12] @ tab[3]=a1^a2 | ||
| 90 | eor $a14,$a1,$a4 @ a1^a4 | ||
| 91 | str $a4,[sp,#16] @ tab[4]=a4 | ||
| 92 | eor $a0,$a2,$a4 @ a2^a4 | ||
| 93 | str $a14,[sp,#20] @ tab[5]=a1^a4 | ||
| 94 | eor $a12,$a12,$a4 @ a1^a2^a4 | ||
| 95 | str $a0,[sp,#24] @ tab[6]=a2^a4 | ||
| 96 | and $i0,$mask,$b,lsl#2 | ||
| 97 | str $a12,[sp,#28] @ tab[7]=a1^a2^a4 | ||
| 98 | |||
| 99 | and $i1,$mask,$b,lsr#1 | ||
| 100 | ldr $lo,[sp,$i0] @ tab[b & 0x7] | ||
| 101 | and $i0,$mask,$b,lsr#4 | ||
| 102 | ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7] | ||
| 103 | and $i1,$mask,$b,lsr#7 | ||
| 104 | ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7] | ||
| 105 | eor $lo,$lo,$t1,lsl#3 @ stall | ||
| 106 | mov $hi,$t1,lsr#29 | ||
| 107 | ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7] | ||
| 108 | |||
| 109 | and $i0,$mask,$b,lsr#10 | ||
| 110 | eor $lo,$lo,$t0,lsl#6 | ||
| 111 | eor $hi,$hi,$t0,lsr#26 | ||
| 112 | ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7] | ||
| 113 | |||
| 114 | and $i1,$mask,$b,lsr#13 | ||
| 115 | eor $lo,$lo,$t1,lsl#9 | ||
| 116 | eor $hi,$hi,$t1,lsr#23 | ||
| 117 | ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7] | ||
| 118 | |||
| 119 | and $i0,$mask,$b,lsr#16 | ||
| 120 | eor $lo,$lo,$t0,lsl#12 | ||
| 121 | eor $hi,$hi,$t0,lsr#20 | ||
| 122 | ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7] | ||
| 123 | |||
| 124 | and $i1,$mask,$b,lsr#19 | ||
| 125 | eor $lo,$lo,$t1,lsl#15 | ||
| 126 | eor $hi,$hi,$t1,lsr#17 | ||
| 127 | ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7] | ||
| 128 | |||
| 129 | and $i0,$mask,$b,lsr#22 | ||
| 130 | eor $lo,$lo,$t0,lsl#18 | ||
| 131 | eor $hi,$hi,$t0,lsr#14 | ||
| 132 | ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7] | ||
| 133 | |||
| 134 | and $i1,$mask,$b,lsr#25 | ||
| 135 | eor $lo,$lo,$t1,lsl#21 | ||
| 136 | eor $hi,$hi,$t1,lsr#11 | ||
| 137 | ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7] | ||
| 138 | |||
| 139 | tst $a,#1<<30 | ||
| 140 | and $i0,$mask,$b,lsr#28 | ||
| 141 | eor $lo,$lo,$t0,lsl#24 | ||
| 142 | eor $hi,$hi,$t0,lsr#8 | ||
| 143 | ldr $t0,[sp,$i0] @ tab[b >> 30 ] | ||
| 144 | |||
| 145 | eorne $lo,$lo,$b,lsl#30 | ||
| 146 | eorne $hi,$hi,$b,lsr#2 | ||
| 147 | tst $a,#1<<31 | ||
| 148 | eor $lo,$lo,$t1,lsl#27 | ||
| 149 | eor $hi,$hi,$t1,lsr#5 | ||
| 150 | eorne $lo,$lo,$b,lsl#31 | ||
| 151 | eorne $hi,$hi,$b,lsr#1 | ||
| 152 | eor $lo,$lo,$t0,lsl#30 | ||
| 153 | eor $hi,$hi,$t0,lsr#2 | ||
| 154 | |||
| 155 | mov pc,lr | ||
| 156 | .size mul_1x1_ialu,.-mul_1x1_ialu | ||
| 157 | ___ | ||
| 158 | ################ | ||
| 159 | # void bn_GF2m_mul_2x2(BN_ULONG *r, | ||
| 160 | # BN_ULONG a1,BN_ULONG a0, | ||
| 161 | # BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 | ||
| 162 | |||
| 163 | ($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); | ||
| 164 | |||
| 165 | $code.=<<___; | ||
| 166 | .global bn_GF2m_mul_2x2 | ||
| 167 | .type bn_GF2m_mul_2x2,%function | ||
| 168 | .align 5 | ||
| 169 | bn_GF2m_mul_2x2: | ||
| 170 | #if __ARM_ARCH__>=7 | ||
| 171 | ldr r12,.LOPENSSL_armcap | ||
| 172 | .Lpic: ldr r12,[pc,r12] | ||
| 173 | tst r12,#1 | ||
| 174 | beq .Lialu | ||
| 175 | |||
| 176 | veor $A1,$A1 | ||
| 177 | vmov.32 $B1,r3,r3 @ two copies of b1 | ||
| 178 | vmov.32 ${A1}[0],r1 @ a1 | ||
| 179 | |||
| 180 | veor $A0,$A0 | ||
| 181 | vld1.32 ${B0}[],[sp,:32] @ two copies of b0 | ||
| 182 | vmov.32 ${A0}[0],r2 @ a0 | ||
| 183 | mov r12,lr | ||
| 184 | |||
| 185 | vmov d16,$A1 | ||
| 186 | vmov d17,$B1 | ||
| 187 | bl mul_1x1_neon @ a1·b1 | ||
| 188 | vmov $A1B1,d0 | ||
| 189 | |||
| 190 | vmov d16,$A0 | ||
| 191 | vmov d17,$B0 | ||
| 192 | bl mul_1x1_neon @ a0·b0 | ||
| 193 | vmov $A0B0,d0 | ||
| 194 | |||
| 195 | veor d16,$A0,$A1 | ||
| 196 | veor d17,$B0,$B1 | ||
| 197 | veor $A0,$A0B0,$A1B1 | ||
| 198 | bl mul_1x1_neon @ (a0+a1)·(b0+b1) | ||
| 199 | |||
| 200 | veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 | ||
| 201 | vshl.u64 d1,d0,#32 | ||
| 202 | vshr.u64 d0,d0,#32 | ||
| 203 | veor $A0B0,d1 | ||
| 204 | veor $A1B1,d0 | ||
| 205 | vst1.32 {${A0B0}[0]},[r0,:32]! | ||
| 206 | vst1.32 {${A0B0}[1]},[r0,:32]! | ||
| 207 | vst1.32 {${A1B1}[0]},[r0,:32]! | ||
| 208 | vst1.32 {${A1B1}[1]},[r0,:32] | ||
| 209 | bx r12 | ||
| 210 | .align 4 | ||
| 211 | .Lialu: | ||
| 212 | #endif | ||
| 213 | ___ | ||
| 214 | $ret="r10"; # reassigned 1st argument | ||
| 215 | $code.=<<___; | ||
| 216 | stmdb sp!,{r4-r10,lr} | ||
| 217 | mov $ret,r0 @ reassign 1st argument | ||
| 218 | mov $b,r3 @ $b=b1 | ||
| 219 | ldr r3,[sp,#32] @ load b0 | ||
| 220 | mov $mask,#7<<2 | ||
| 221 | sub sp,sp,#32 @ allocate tab[8] | ||
| 222 | |||
| 223 | bl mul_1x1_ialu @ a1·b1 | ||
| 224 | str $lo,[$ret,#8] | ||
| 225 | str $hi,[$ret,#12] | ||
| 226 | |||
| 227 | eor $b,$b,r3 @ flip b0 and b1 | ||
| 228 | eor $a,$a,r2 @ flip a0 and a1 | ||
| 229 | eor r3,r3,$b | ||
| 230 | eor r2,r2,$a | ||
| 231 | eor $b,$b,r3 | ||
| 232 | eor $a,$a,r2 | ||
| 233 | bl mul_1x1_ialu @ a0·b0 | ||
| 234 | str $lo,[$ret] | ||
| 235 | str $hi,[$ret,#4] | ||
| 236 | |||
| 237 | eor $a,$a,r2 | ||
| 238 | eor $b,$b,r3 | ||
| 239 | bl mul_1x1_ialu @ (a1+a0)·(b1+b0) | ||
| 240 | ___ | ||
| 241 | @r=map("r$_",(6..9)); | ||
| 242 | $code.=<<___; | ||
| 243 | ldmia $ret,{@r[0]-@r[3]} | ||
| 244 | eor $lo,$lo,$hi | ||
| 245 | eor $hi,$hi,@r[1] | ||
| 246 | eor $lo,$lo,@r[0] | ||
| 247 | eor $hi,$hi,@r[2] | ||
| 248 | eor $lo,$lo,@r[3] | ||
| 249 | eor $hi,$hi,@r[3] | ||
| 250 | str $hi,[$ret,#8] | ||
| 251 | eor $lo,$lo,$hi | ||
| 252 | add sp,sp,#32 @ destroy tab[8] | ||
| 253 | str $lo,[$ret,#4] | ||
| 254 | |||
| 255 | #if __ARM_ARCH__>=5 | ||
| 256 | ldmia sp!,{r4-r10,pc} | ||
| 257 | #else | ||
| 258 | ldmia sp!,{r4-r10,lr} | ||
| 259 | tst lr,#1 | ||
| 260 | moveq pc,lr @ be binary compatible with V4, yet | ||
| 261 | bx lr @ interoperable with Thumb ISA:-) | ||
| 262 | #endif | ||
| 263 | .size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 | ||
| 264 | #if __ARM_ARCH__>=7 | ||
| 265 | .align 5 | ||
| 266 | .LOPENSSL_armcap: | ||
| 267 | .word OPENSSL_armcap_P-(.Lpic+8) | ||
| 268 | #endif | ||
| 269 | .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>" | ||
| 270 | .align 5 | ||
| 271 | |||
| 272 | .comm OPENSSL_armcap_P,4,4 | ||
| 273 | ___ | ||
| 274 | |||
| 275 | $code =~ s/\`([^\`]*)\`/eval $1/gem; # expand the `...` helper calls embedded in the assembly text | ||
| 276 | $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 | ||
| 277 | print $code or die "error writing output: $!"; | ||
| 278 | close STDOUT or die "error closing STDOUT: $!"; # enforce flush; buffered write errors surface here | ||
