Diffstat (limited to 'src/lib/libcrypto/rc4/asm')

-rwxr-xr-x  src/lib/libcrypto/rc4/asm/rc4-amd64.pl   227
-rw-r--r--  src/lib/libcrypto/rc4/asm/rc4-ia64.S     159
-rwxr-xr-x  src/lib/libcrypto/rc4/asm/rc4-x86_64.pl    7

3 files changed, 388 insertions, 5 deletions
diff --git a/src/lib/libcrypto/rc4/asm/rc4-amd64.pl b/src/lib/libcrypto/rc4/asm/rc4-amd64.pl
new file mode 100755
index 0000000000..9e0da8af99
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-amd64.pl
@@ -0,0 +1,227 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+#
+# 2.22x RC4 tune-up:-) It should be noted, though, that my hand [as in
+# "hand-coded assembler"] doesn't account for the whole improvement
+# coefficient. It turned out that eliminating RC4_CHAR from the config
+# line results in ~40% improvement (yes, even for the C implementation).
+# Presumably it has everything to do with AMD cache architecture and
+# RAW or whatever penalties. Once again: the module *requires* a config
+# line *without* RC4_CHAR! As for the coding "secret," I bet on partial
+# register arithmetic. For example, instead of 'inc %r8; and $255,%r8'
+# I simply use 'inc %r8b'. Even though the optimization manual discourages
+# operating on partial registers, it turned out to be the best bet.
+# At least for AMD... How IA32E would perform remains to be seen...
+
+# As was shown by Marc Bevand, reordering a couple of load operations
+# results in an even higher performance gain of 3.3x:-) At least on
+# Opteron... For reference, 1x in this case is RC4_CHAR C code
+# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
+# The latter means that if you want to *estimate* what to expect from
+# *your* CPU, multiply 54 by 3.3 and by the clock frequency in GHz.
+
+# The Intel P4 EM64T core was found to run the AMD64 code really slowly...
+# The only way to achieve comparable performance on P4 is to keep
+# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
+# compose blended code that would perform within a 30% margin on
+# both AMD and Intel platforms, I implement both cases. See
+# rc4_skey.c for further details... This applies to 0.9.8 and later;
+# in the 0.9.7 context the RC4_CHAR codepath is never engaged and ~70 bytes
+# of code remain redundant.
+
+$output=shift;
+
+$win64a=1 if ($output =~ /win64a\.(s|asm)/);
+
+open STDOUT,">$output" or die "can't open $output: $!";
+
+if (defined($win64a)) {
+	$dat="%rcx";	# arg1
+	$len="%rdx";	# arg2
+	$inp="%rsi";	# r8, arg3 moves here
+	$out="%rdi";	# r9, arg4 moves here
+} else {
+	$dat="%rdi";	# arg1
+	$len="%rsi";	# arg2
+	$inp="%rdx";	# arg3
+	$out="%rcx";	# arg4
+}
+
+$XX="%r10";
+$TX="%r8";
+$YY="%r11";
+$TY="%r9";
+
+sub PTR() {
+	my $ret=shift;
+	if (defined($win64a)) {
+		$ret =~ s/\[([\S]+)\+([\S]+)\]/[$2+$1]/g;	# [%rN+%rM*4]->[%rM*4+%rN]
+		$ret =~ s/:([^\[]+)\[([^\]]+)\]/:[$2+$1]/g;	# :off[ea]->:[ea+off]
+	} else {
+		$ret =~ s/[\+\*]/,/g;			# [%rN+%rM*4]->[%rN,%rM,4]
+		$ret =~ s/\[([^\]]+)\]/($1)/g;		# [%rN]->(%rN)
+	}
+	$ret;
+}
+
+$code=<<___ if (!defined($win64a));
+.text
+
+.globl	RC4
+.type	RC4,\@function
+.align	16
+RC4:	or	$len,$len
+	jne	.Lentry
+	repret
+.Lentry:
+___
+$code=<<___ if (defined($win64a));
+_TEXT	SEGMENT
+PUBLIC	RC4
+ALIGN	16
+RC4	PROC
+	or	$len,$len
+	jne	.Lentry
+	repret
+.Lentry:
+	push	%rdi
+	push	%rsi
+	sub	\$40,%rsp
+	mov	%r8,$inp
+	mov	%r9,$out
+___
+$code.=<<___;
+	add	\$8,$dat
+	movl	`&PTR("DWORD:-8[$dat]")`,$XX#d
+	movl	`&PTR("DWORD:-4[$dat]")`,$YY#d
+	cmpl	\$-1,`&PTR("DWORD:256[$dat]")`
+	je	.LRC4_CHAR
+	test	\$-8,$len
+	jz	.Lloop1
+.align	16
+.Lloop8:
+	inc	$XX#b
+	movl	`&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+	add	$TX#b,$YY#b
+	movl	`&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+	movl	$TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+	movl	$TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+	add	$TX#b,$TY#b
+	inc	$XX#b
+	movl	`&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+	movb	`&PTR("BYTE:[$dat+$TY*4]")`,%al
+___
+for ($i=1;$i<=6;$i++) {
+$code.=<<___;
+	add	$TX#b,$YY#b
+	ror	\$8,%rax
+	movl	`&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+	movl	$TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+	movl	$TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+	add	$TX#b,$TY#b
+	inc	$XX#b
+	movl	`&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+	movb	`&PTR("BYTE:[$dat+$TY*4]")`,%al
+___
+}
+$code.=<<___;
+	add	$TX#b,$YY#b
+	ror	\$8,%rax
+	movl	`&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+	movl	$TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+	movl	$TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+	sub	\$8,$len
+	add	$TY#b,$TX#b
+	movb	`&PTR("BYTE:[$dat+$TX*4]")`,%al
+	ror	\$8,%rax
+	add	\$8,$inp
+	add	\$8,$out
+
+	xor	`&PTR("QWORD:-8[$inp]")`,%rax
+	mov	%rax,`&PTR("QWORD:-8[$out]")`
+
+	test	\$-8,$len
+	jnz	.Lloop8
+	cmp	\$0,$len
+	jne	.Lloop1
+.Lexit:
+	movl	$XX#d,`&PTR("DWORD:-8[$dat]")`
+	movl	$YY#d,`&PTR("DWORD:-4[$dat]")`
+___
+$code.=<<___ if (defined($win64a));
+	add	\$40,%rsp
+	pop	%rsi
+	pop	%rdi
+___
+$code.=<<___;
+	repret
+.align	16
+.Lloop1:
+	movzb	`&PTR("BYTE:[$inp]")`,%eax
+	inc	$XX#b
+	movl	`&PTR("DWORD:[$dat+$XX*4]")`,$TX#d
+	add	$TX#b,$YY#b
+	movl	`&PTR("DWORD:[$dat+$YY*4]")`,$TY#d
+	movl	$TX#d,`&PTR("DWORD:[$dat+$YY*4]")`
+	movl	$TY#d,`&PTR("DWORD:[$dat+$XX*4]")`
+	add	$TY#b,$TX#b
+	movl	`&PTR("DWORD:[$dat+$TX*4]")`,$TY#d
+	xor	$TY,%rax
+	inc	$inp
+	movb	%al,`&PTR("BYTE:[$out]")`
+	inc	$out
+	dec	$len
+	jnz	.Lloop1
+	jmp	.Lexit
+
+.align	16
+.LRC4_CHAR:
+	inc	$XX#b
+	movzb	`&PTR("BYTE:[$dat+$XX]")`,$TX#d
+	add	$TX#b,$YY#b
+	movzb	`&PTR("BYTE:[$dat+$YY]")`,$TY#d
+	movb	$TX#b,`&PTR("BYTE:[$dat+$YY]")`
+	movb	$TY#b,`&PTR("BYTE:[$dat+$XX]")`
+	add	$TX#b,$TY#b
+	movzb	`&PTR("BYTE:[$dat+$TY]")`,$TY#d
+	xorb	`&PTR("BYTE:[$inp]")`,$TY#b
+	movb	$TY#b,`&PTR("BYTE:[$out]")`
+	inc	$inp
+	inc	$out
+	dec	$len
+	jnz	.LRC4_CHAR
+	jmp	.Lexit
+___
+$code.=<<___ if (defined($win64a));
+RC4	ENDP
+_TEXT	ENDS
+END
+___
+$code.=<<___ if (!defined($win64a));
+.size	RC4,.-RC4
+___
+
+$code =~ s/#([bwd])/$1/gm;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+if (defined($win64a)) {
+	$code =~ s/\.align/ALIGN/gm;
+	$code =~ s/[\$%]//gm;
+	$code =~ s/\.L/\$L/gm;
+	$code =~ s/([\w]+)([\s]+)([\S]+),([\S]+)/$1$2$4,$3/gm;
+	$code =~ s/([QD]*WORD|BYTE):/$1 PTR/gm;
+	$code =~ s/mov[bwlq]/mov/gm;
+	$code =~ s/movzb/movzx/gm;
+	$code =~ s/repret/DB\t0F3h,0C3h/gm;
+	$code =~ s/cmpl/cmp/gm;
+	$code =~ s/xorb/xor/gm;
+} else {
+	$code =~ s/([QD]*WORD|BYTE)://gm;
+	$code =~ s/repret/.byte\t0xF3,0xC3/gm;
+}
+print $code;
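
For reference, the byte-at-a-time paths above (.Lloop1 and .LRC4_CHAR) compute the standard RC4 PRGA update. A minimal Perl sketch of the same update can serve as a cross-check; the { S => [...], x, y } state layout and the rc4_ref name are purely illustrative and not part of the module's API.

# Reference RC4 byte loop (sketch); $key->{S} holds the 256-entry permutation,
# $key->{x} and $key->{y} the two indices that the assembly keeps in $XX/$YY.
sub rc4_ref {
    my ($key, $in) = @_;
    my ($S, $x, $y) = ($key->{S}, $key->{x}, $key->{y});
    my $out = '';
    for my $c (unpack("C*", $in)) {
        $x = ($x + 1) & 0xff;
        my $tx = $S->[$x];
        $y = ($y + $tx) & 0xff;
        my $ty = $S->[$y];
        @$S[$y, $x] = ($tx, $ty);                    # swap S[x] and S[y]
        $out .= chr($c ^ $S->[($tx + $ty) & 0xff]);  # keystream byte XOR input
    }
    @{$key}{qw(x y)} = ($x, $y);
    return $out;
}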
diff --git a/src/lib/libcrypto/rc4/asm/rc4-ia64.S b/src/lib/libcrypto/rc4/asm/rc4-ia64.S
new file mode 100644
index 0000000000..8210c47d04
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-ia64.S
@@ -0,0 +1,159 @@
+// ====================================================================
+// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+// project.
+//
+// Rights for redistribution and usage in source and binary forms are
+// granted according to the OpenSSL license. Warranty of any kind is
+// disclaimed.
+// ====================================================================
+
+.ident	"rc4-ia64.S, Version 2.0"
+.ident	"IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
+
+// What's wrong with compiler-generated code? Because of the nature of
+// the C language, the compiler doesn't [dare to] reorder loads and stores.
+// But being memory-bound, RC4 should benefit from reordering [on an
+// in-order-execution core such as IA-64]. So what can we reorder? At the
+// very least we can safely reorder references to the key schedule with
+// respect to the input and output streams. Secondly, at first [close]
+// glance it appeared possible to pull up some references to elements of
+// the key schedule itself. The original rationale ["prior loads are
+// unsafe only for a "degenerated" key schedule, when some elements
+// equal the same value"] was kind of sloppy. I should have formulated
+// it as it really is: if we assume that pulling up the reference to
+// key[x+1] is not safe, that would mean the key schedule would
+// "degenerate," which is never the case. The problem is that this
+// holds true for references to key[x], but not to key[y]. Legitimate
+// "collisions" do occur within every 256^2-byte window. Fortunately
+// there are enough free instruction slots to keep the prior reference
+// to key[x+1], detect the "collision" and compensate for it. All this
+// without sacrificing a single clock cycle:-) Throughput is ~210MBps
+// on a 900MHz CPU, which is >3x faster than gcc-generated code and
+// +30% compared to HP-UX C. Unrolling the loop below should give >30%
+// on top of that...
+
+.text
+.explicit
+
+#if defined(_HPUX_SOURCE) && !defined(_LP64)
+# define ADDP	addp4
+#else
+# define ADDP	add
+#endif
+
+#ifndef SZ
+#define SZ	4	// this is set to sizeof(RC4_INT)
+#endif
+// SZ==4 seems to be optimal. At least SZ==8 is not any faster, not for
+// assembler implementation, while SZ==1 code is ~30% slower.
+#if SZ==1	// RC4_INT is unsigned char
+# define	LDKEY	ld1
+# define	STKEY	st1
+# define	OFF	0
+#elif SZ==4	// RC4_INT is unsigned int
+# define	LDKEY	ld4
+# define	STKEY	st4
+# define	OFF	2
+#elif SZ==8	// RC4_INT is unsigned long
+# define	LDKEY	ld8
+# define	STKEY	st8
+# define	OFF	3
+#endif
+
+out=r8;		// [expanded] output pointer
+inp=r9;		// [expanded] input pointer
+prsave=r10;
+key=r28;	// [expanded] pointer to RC4_KEY
+ksch=r29;	// (key->data+255)[&~(sizeof(key->data)-1)]
+xx=r30;
+yy=r31;
+
+// void RC4(RC4_KEY *key,size_t len,const void *inp,void *out);
+.global	RC4#
+.proc	RC4#
+.align	32
+.skip	16
+RC4:
+	.prologue
+	.save	ar.pfs,r2
+{ .mii;	alloc	r2=ar.pfs,4,12,0,16
+	.save	pr,prsave
+	mov	prsave=pr
+	ADDP	key=0,in0 };;
+{ .mib;	cmp.eq	p6,p0=0,in1		// len==0?
+	.save	ar.lc,r3
+	mov	r3=ar.lc
+(p6)	br.ret.spnt.many b0 };;		// emergency exit
+
+	.body
+	.rotr	dat[4],key_x[4],tx[2],rnd[2],key_y[2],ty[1];
+
+{ .mib;	LDKEY	xx=[key],SZ		// load key->x
+	add	in1=-1,in1		// adjust len for loop counter
+	nop.b	0 }
+{ .mib;	ADDP	inp=0,in2
+	ADDP	out=0,in3
+	brp.loop.imp	.Ltop,.Lexit-16 };;
+{ .mmi;	LDKEY	yy=[key]		// load key->y
+	add	ksch=SZ,key
+	mov	ar.lc=in1 }
+{ .mmi;	mov	key_y[1]=r0		// guarantee inequality
+					// in first iteration
+	add	xx=1,xx
+	mov	pr.rot=1<<16 };;
+{ .mii;	nop.m	0
+	dep	key_x[1]=xx,r0,OFF,8
+	mov	ar.ec=3 };;		// note that epilogue counter
+					// is off by 1. I compensate
+					// for this at exit...
+.Ltop:
+// The loop is scheduled for a 4*(n+2) spin-rate on Itanium 2, which
+// theoretically gives an asymptotic performance of the clock frequency
+// divided by 4, in bytes per second, or 400MBps on a 1.6GHz CPU. This
+// is for sizeof(RC4_INT)==4. For smaller RC4_INT, STKEY inadvertently
+// splits the last bundle and you end up with a 5*n spin-rate:-(
+// Originally the loop was scheduled for 3*n and relied on the key
+// schedule being aligned at a 256*sizeof(RC4_INT) boundary. But
+// *(out++)=dat, which maps to st1, had the same effect [inadvertent
+// bundle split] and held the loop back. Rescheduling for 4*n made it
+// possible to eliminate the dependence on specific alignment and to
+// let OpenSSH keep "abusing" our API. Reaching for 3*n would require
+// unrolling, sticking to a variable shift instruction for collecting
+// output [to avoid starving the integer shifter] and copying the key
+// schedule to a controlled place on the stack [so that the deposit
+// instruction can serve as a substitute for the whole
+// key->data+((x&255)<<log2(sizeof(key->data[0])))]...
+{ .mmi;	(p19)	st1	[out]=dat[3],1		// *(out++)=dat
+	(p16)	add	xx=1,xx			// x++
+	(p18)	dep	rnd[1]=rnd[1],r0,OFF,8 }	// ((tx+ty)&255)<<OFF
+{ .mmi;	(p16)	add	key_x[1]=ksch,key_x[1]	// &key[xx&255]
+	(p17)	add	key_y[1]=ksch,key_y[1] };;	// &key[yy&255]
+{ .mmi;	(p16)	LDKEY	tx[0]=[key_x[1]]	// tx=key[xx]
+	(p17)	LDKEY	ty[0]=[key_y[1]]	// ty=key[yy]
+	(p16)	dep	key_x[0]=xx,r0,OFF,8 }	// (xx&255)<<OFF
+{ .mmi;	(p18)	add	rnd[1]=ksch,rnd[1]	// &key[(tx+ty)&255]
+	(p16)	cmp.ne.unc p20,p21=key_x[1],key_y[1] };;
+{ .mmi;	(p18)	LDKEY	rnd[1]=[rnd[1]]		// rnd=key[(tx+ty)&255]
+	(p16)	ld1	dat[0]=[inp],1 }	// dat=*(inp++)
+.pred.rel	"mutex",p20,p21
+{ .mmi;	(p21)	add	yy=yy,tx[1]		// (p16)
+	(p20)	add	yy=yy,tx[0]		// (p16) y+=tx
+	(p21)	mov	tx[0]=tx[1] };;		// (p16)
+{ .mmi;	(p17)	STKEY	[key_y[1]]=tx[1]	// key[yy]=tx
+	(p17)	STKEY	[key_x[2]]=ty[0]	// key[xx]=ty
+	(p16)	dep	key_y[0]=yy,r0,OFF,8 }	// &key[yy&255]
+{ .mmb;	(p17)	add	rnd[0]=tx[1],ty[0]	// tx+=ty
+	(p18)	xor	dat[2]=dat[2],rnd[1]	// dat^=rnd
+	br.ctop.sptk	.Ltop };;
+.Lexit:
+{ .mib;	STKEY	[key]=yy,-SZ		// save key->y
+	mov	pr=prsave,0x1ffff
+	nop.b	0 }
+{ .mib;	st1	[out]=dat[3],1		// compensate for truncated
+					// epilogue counter
+	add	xx=-1,xx
+	nop.b	0 };;
+{ .mib;	STKEY	[key]=xx		// save key->x
+	mov	ar.lc=r3
+	br.ret.sptk.many	b0 };;
+.endp	RC4#
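
The "collision" described in the comment block above can be illustrated in isolation: a value of key[x+1] loaded ahead of the current iteration's swap goes stale exactly when y lands on x+1, which is why the loop compares key_x[1] with key_y[1] and substitutes tx in that case. A small Perl sketch follows; the permutation and the indices are made up purely to trigger that case.

# Demonstrate that an early load of S[x+1] is invalidated by the swap S[y]=tx
# whenever y == x+1 (values chosen only to hit that case).
my @S = (0 .. 255);                 # any permutation will do for the demo
my ($x, $y) = (10, 11);             # y happens to equal x+1
my $tx       = $S[$x];
my $prefetch = $S[$x + 1];          # speculative load for the next iteration
my $ty       = $S[$y];
@S[$y, $x] = ($tx, $ty);            # the current iteration's swap
printf "prefetch %d, actual %d -> must substitute tx\n", $prefetch, $S[$x + 1]
    if $prefetch != $S[$x + 1];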
diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
index 00c6fa28aa..92c52f3433 100755
--- a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
+++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -269,8 +269,7 @@ RC4_set_key:
 	xor	$ido,$ido
 	xor	%r10,%r10
 	xor	%r11,%r11
-
-	mov	OPENSSL_ia32cap_P(%rip),$idx#d
+	mov	PIC_GOT(OPENSSL_ia32cap_P),$idx#d
 	bt	\$20,$idx#d
 	jnc	.Lw1stloop
 	bt	\$30,$idx#d
@@ -338,7 +337,7 @@ RC4_set_key:
 RC4_options:
 	.picmeup %rax
 	lea	.Lopts-.(%rax),%rax
-	mov	OPENSSL_ia32cap_P(%rip),%edx
+	mov	PIC_GOT(OPENSSL_ia32cap_P),%edx
 	bt	\$20,%edx
 	jnc	.Ldone
 	add	\$12,%rax
@@ -359,8 +358,6 @@ ___
 
 $code =~ s/#([bwd])/$1/gm;
 
-$code =~ s/RC4_set_key/private_RC4_set_key/g if ($ENV{FIPSCANLIB} ne "");
-
 print $code;
 
 close STDOUT;
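
Both Perl generators above post-process their output with the same s/#([bwd])/$1/ pass, which turns the '%reg#b / #w / #d' shorthand into genuine partial-register names. A stand-alone Perl sketch of that step; the sample instruction is invented for illustration only.

# After interpolation, "$XX#d" reads "%r10#d"; stripping the '#' yields the
# 32-bit partial register %r10d that the generated assembly actually uses.
my $XX   = "%r10";
my $line = "movl\t(\$dat,$XX,4),$XX#d";
$line =~ s/#([bwd])/$1/g;
print "$line\n";    # movl    ($dat,%r10,4),%r10d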
