1 files changed, 0 insertions, 582 deletions
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
deleted file mode 100644
index 7faf37b147..0000000000
--- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl
+++ /dev/null
@@ -1,582 +0,0 @@
-#!/usr/bin/env perl
-# ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-# SHA512 block procedure for ARMv4. September 2007.
-# This code is ~4.5 (four and a half) times faster than code generated
-# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
-# Xscale PXA250 core].
-#
-# July 2010.
-#
-# Rescheduling for dual-issue pipeline resulted in 6% improvement on
-# Cortex A8 core and ~40 cycles per processed byte.
-# February 2011.
-#
-# Profiler-assisted and platform-specific optimization resulted in 7%
-# improvement on Cortex A8 core and ~38 cycles per byte.
-# March 2011.
-#
-# Add NEON implementation. On Cortex A8 it was measured to process
-# one byte in 25.5 cycles or 47% faster than integer-only code.
-# Byte order [in]dependence. =========================================
-#
-# Originally caller was expected to maintain specific *dword* order in
-# h[0-7], namely with most significant dword at *lower* address, which
-# was reflected in below two parameters as 0 and 4. Now caller is
-# expected to maintain native byte order for whole 64-bit values.
-$hi="HI";      # name of the cpp macro the emitted prologue defines as the byte offset of the high 32-bit half
-$lo="LO";      # likewise for the low half; the prologue picks 0/4 or 4/0 depending on __ARMEL__
-# ====================================================================
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}    # discard arguments until one looks like "name.ext" (or the list runs out)
-open STDOUT,">$output";        # later print output goes to that file; NOTE(review): the open result is not checked
-$ctx="r0";      # parameter block: the eight 64-bit state words are loaded from and stored back through it
-$inp="r1";      # input pointer, post-incremented by 8 for every 64-bit word read
-$len="r2";      # turned into the end-of-input address (inp + len*128) on entry
-$Tlo="r3";      # T, low half: holds X[i] and then the accumulated round sum
-$Thi="r4";      # T, high half
-$Alo="r5";      # working variable a, low half
-$Ahi="r6";      # working variable a, high half
-$Elo="r7";      # working variable e, low half
-$Ehi="r8";      # working variable e, high half
-$t0="r9";       # scratch
-$t1="r10";      # scratch
-$t2="r11";      # scratch
-$t3="r12";      # scratch
-############    r13 is stack pointer
-$Ktbl="r14";    # pointer into K512; bit 0 is borrowed as an "end of round group" flag (set by orreq, cleared by bic)
-############    r15 is program counter
-$Aoff=8*0;      # byte offsets of a..h, 8 bytes each; used both for the context and for the stack frame
-$Boff=8*1;
-$Coff=8*2;
-$Doff=8*3;
-$Eoff=8*4;
-$Foff=8*5;
-$Goff=8*6;
-$Hoff=8*7;
-$Xoff=8*8;      # offset of the slot where each round stores its message word X[i], directly above h
-sub BODY_00_15() {     # Append the integer-only ARM code for one SHA-512 round to $code; callers use the &name(...) form, which bypasses the empty prototype.
-my $magic = shift;     # low byte of K[i].lo that identifies the last round of the group; on a match the emitted code sets bit 0 of $Ktbl
-$code.=<<___;          # per round: T += Sigma1(e) + h + Ch(e,f,g) + K[i]; d += T; new a = T + Sigma0(a) + Maj(a,b,c); then sp -= 8 and $Ktbl += 8
-        @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
-        @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
-        @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
-        mov     $t0,$Elo,lsr#14
-        str     $Tlo,[sp,#$Xoff+0]
-        mov     $t1,$Ehi,lsr#14
-        str     $Thi,[sp,#$Xoff+4]
-        eor     $t0,$t0,$Ehi,lsl#18
-        ldr     $t2,[sp,#$Hoff+0]       @ h.lo
-        eor     $t1,$t1,$Elo,lsl#18
-        ldr     $t3,[sp,#$Hoff+4]       @ h.hi
-        eor     $t0,$t0,$Elo,lsr#18
-        eor     $t1,$t1,$Ehi,lsr#18
-        eor     $t0,$t0,$Ehi,lsl#14
-        eor     $t1,$t1,$Elo,lsl#14
-        eor     $t0,$t0,$Ehi,lsr#9
-        eor     $t1,$t1,$Elo,lsr#9
-        eor     $t0,$t0,$Elo,lsl#23
-        eor     $t1,$t1,$Ehi,lsl#23     @ Sigma1(e)
-        adds    $Tlo,$Tlo,$t0
-        ldr     $t0,[sp,#$Foff+0]       @ f.lo
-        adc     $Thi,$Thi,$t1           @ T += Sigma1(e)
-        ldr     $t1,[sp,#$Foff+4]       @ f.hi
-        adds    $Tlo,$Tlo,$t2
-        ldr     $t2,[sp,#$Goff+0]       @ g.lo
-        adc     $Thi,$Thi,$t3           @ T += h
-        ldr     $t3,[sp,#$Goff+4]       @ g.hi
-        eor     $t0,$t0,$t2
-        str     $Elo,[sp,#$Eoff+0]
-        eor     $t1,$t1,$t3
-        str     $Ehi,[sp,#$Eoff+4]
-        and     $t0,$t0,$Elo
-        str     $Alo,[sp,#$Aoff+0]
-        and     $t1,$t1,$Ehi
-        str     $Ahi,[sp,#$Aoff+4]
-        eor     $t0,$t0,$t2
-        ldr     $t2,[$Ktbl,#$lo]        @ K[i].lo
-        eor     $t1,$t1,$t3             @ Ch(e,f,g)
-        ldr     $t3,[$Ktbl,#$hi]        @ K[i].hi
-        adds    $Tlo,$Tlo,$t0
-        ldr     $Elo,[sp,#$Doff+0]      @ d.lo
-        adc     $Thi,$Thi,$t1           @ T += Ch(e,f,g)
-        ldr     $Ehi,[sp,#$Doff+4]      @ d.hi
-        adds    $Tlo,$Tlo,$t2
-        and     $t0,$t2,#0xff
-        adc     $Thi,$Thi,$t3           @ T += K[i]
-        adds    $Elo,$Elo,$Tlo
-        ldr     $t2,[sp,#$Boff+0]       @ b.lo
-        adc     $Ehi,$Ehi,$Thi          @ d += T
-        teq     $t0,#$magic
-        ldr     $t3,[sp,#$Coff+0]       @ c.lo
-        orreq   $Ktbl,$Ktbl,#1
-        @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
-        @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
-        @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
-        mov     $t0,$Alo,lsr#28
-        mov     $t1,$Ahi,lsr#28
-        eor     $t0,$t0,$Ahi,lsl#4
-        eor     $t1,$t1,$Alo,lsl#4
-        eor     $t0,$t0,$Ahi,lsr#2
-        eor     $t1,$t1,$Alo,lsr#2
-        eor     $t0,$t0,$Alo,lsl#30
-        eor     $t1,$t1,$Ahi,lsl#30
-        eor     $t0,$t0,$Ahi,lsr#7
-        eor     $t1,$t1,$Alo,lsr#7
-        eor     $t0,$t0,$Alo,lsl#25
-        eor     $t1,$t1,$Ahi,lsl#25     @ Sigma0(a)
-        adds    $Tlo,$Tlo,$t0
-        and     $t0,$Alo,$t2
-        adc     $Thi,$Thi,$t1           @ T += Sigma0(a)
-        ldr     $t1,[sp,#$Boff+4]       @ b.hi
-        orr     $Alo,$Alo,$t2
-        ldr     $t2,[sp,#$Coff+4]       @ c.hi
-        and     $Alo,$Alo,$t3
-        and     $t3,$Ahi,$t1
-        orr     $Ahi,$Ahi,$t1
-        orr     $Alo,$Alo,$t0           @ Maj(a,b,c).lo
-        and     $Ahi,$Ahi,$t2
-        adds    $Alo,$Alo,$Tlo
-        orr     $Ahi,$Ahi,$t3           @ Maj(a,b,c).hi
-        sub     sp,sp,#8
-        adc     $Ahi,$Ahi,$Thi          @ h += T
-        tst     $Ktbl,#1
-        add     $Ktbl,$Ktbl,#8
-___
-}      # end of BODY_00_15; the emitted tst leaves the "last round" test in the flags and the following add does not change them
-$code=<<___;
-#include "arm_arch.h"
-#ifdef __ARMEL__
-# define LO 0
-# define HI 4
-# define WORD64(hi0,lo0,hi1,lo1)        .word   lo0,hi0, lo1,hi1
-#else
-# define HI 0
-# define LO 4
-# define WORD64(hi0,lo0,hi1,lo1)        .word   hi0,lo0, hi1,lo1
-#endif
-.text
-.code   32
-.type   K512,%object
-.align  5
-K512:
-WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
-WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
-WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
-WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
-WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
-WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
-WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
-WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
-WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
-WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
-WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
-WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
-WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
-WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
-WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
-WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
-WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
-WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
-WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
-WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
-WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
-WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
-WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
-WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
-WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
-WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
-WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
-WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
-WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
-WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
-WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
-WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
-WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
-WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
-WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
-WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
-WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
-WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
-WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
-WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
-.size   K512,.-K512
-.LOPENSSL_armcap:
-.word   OPENSSL_armcap_P-sha512_block_data_order
-.skip   32-4
-.global sha512_block_data_order
-.type   sha512_block_data_order,%function
-sha512_block_data_order:
-        sub     r3,pc,#8                @ sha512_block_data_order
-        add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
-#if __ARM_ARCH__>=7
-        ldr     r12,.LOPENSSL_armcap
-        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
-        tst     r12,#1
-        bne     .LNEON
-#endif
-        stmdb   sp!,{r4-r12,lr}
-        sub     $Ktbl,r3,#672           @ K512
-        sub     sp,sp,#9*8
-        ldr     $Elo,[$ctx,#$Eoff+$lo]
-        ldr     $Ehi,[$ctx,#$Eoff+$hi]
-        ldr     $t0, [$ctx,#$Goff+$lo]
-        ldr     $t1, [$ctx,#$Goff+$hi]
-        ldr     $t2, [$ctx,#$Hoff+$lo]
-        ldr     $t3, [$ctx,#$Hoff+$hi]
-.Loop:
-        str     $t0, [sp,#$Goff+0]
-        str     $t1, [sp,#$Goff+4]
-        str     $t2, [sp,#$Hoff+0]
-        str     $t3, [sp,#$Hoff+4]
-        ldr     $Alo,[$ctx,#$Aoff+$lo]
-        ldr     $Ahi,[$ctx,#$Aoff+$hi]
-        ldr     $Tlo,[$ctx,#$Boff+$lo]
-        ldr     $Thi,[$ctx,#$Boff+$hi]
-        ldr     $t0, [$ctx,#$Coff+$lo]
-        ldr     $t1, [$ctx,#$Coff+$hi]
-        ldr     $t2, [$ctx,#$Doff+$lo]
-        ldr     $t3, [$ctx,#$Doff+$hi]
-        str     $Tlo,[sp,#$Boff+0]
-        str     $Thi,[sp,#$Boff+4]
-        str     $t0, [sp,#$Coff+0]
-        str     $t1, [sp,#$Coff+4]
-        str     $t2, [sp,#$Doff+0]
-        str     $t3, [sp,#$Doff+4]
-        ldr     $Tlo,[$ctx,#$Foff+$lo]
-        ldr     $Thi,[$ctx,#$Foff+$hi]
-        str     $Tlo,[sp,#$Foff+0]
-        str     $Thi,[sp,#$Foff+4]
-.L00_15:
-#if __ARM_ARCH__<7
-        ldrb    $Tlo,[$inp,#7]
-        ldrb    $t0, [$inp,#6]
-        ldrb    $t1, [$inp,#5]
-        ldrb    $t2, [$inp,#4]
-        ldrb    $Thi,[$inp,#3]
-        ldrb    $t3, [$inp,#2]
-        orr     $Tlo,$Tlo,$t0,lsl#8
-        ldrb    $t0, [$inp,#1]
-        orr     $Tlo,$Tlo,$t1,lsl#16
-        ldrb    $t1, [$inp],#8
-        orr     $Tlo,$Tlo,$t2,lsl#24
-        orr     $Thi,$Thi,$t3,lsl#8
-        orr     $Thi,$Thi,$t0,lsl#16
-        orr     $Thi,$Thi,$t1,lsl#24
-#else
-        ldr     $Tlo,[$inp,#4]
-        ldr     $Thi,[$inp],#8
-#ifdef __ARMEL__
-        rev     $Tlo,$Tlo
-        rev     $Thi,$Thi
-#endif
-#endif
-___
-        &BODY_00_15(0x94);     # round body for rounds 0-15; 0x94 is the low byte of the 16th K512 constant (0x...cf692694)
-$code.=<<___;
-        tst     $Ktbl,#1
-        beq     .L00_15
-        ldr     $t0,[sp,#`$Xoff+8*(16-1)`+0]
-        ldr     $t1,[sp,#`$Xoff+8*(16-1)`+4]
-        bic     $Ktbl,$Ktbl,#1
-.L16_79:
-        @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
-        @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
-        @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
-        mov     $Tlo,$t0,lsr#1
-        ldr     $t2,[sp,#`$Xoff+8*(16-14)`+0]
-        mov     $Thi,$t1,lsr#1
-        ldr     $t3,[sp,#`$Xoff+8*(16-14)`+4]
-        eor     $Tlo,$Tlo,$t1,lsl#31
-        eor     $Thi,$Thi,$t0,lsl#31
-        eor     $Tlo,$Tlo,$t0,lsr#8
-        eor     $Thi,$Thi,$t1,lsr#8
-        eor     $Tlo,$Tlo,$t1,lsl#24
-        eor     $Thi,$Thi,$t0,lsl#24
-        eor     $Tlo,$Tlo,$t0,lsr#7
-        eor     $Thi,$Thi,$t1,lsr#7
-        eor     $Tlo,$Tlo,$t1,lsl#25
-        @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
-        @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
-        @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
-        mov     $t0,$t2,lsr#19
-        mov     $t1,$t3,lsr#19
-        eor     $t0,$t0,$t3,lsl#13
-        eor     $t1,$t1,$t2,lsl#13
-        eor     $t0,$t0,$t3,lsr#29
-        eor     $t1,$t1,$t2,lsr#29
-        eor     $t0,$t0,$t2,lsl#3
-        eor     $t1,$t1,$t3,lsl#3
-        eor     $t0,$t0,$t2,lsr#6
-        eor     $t1,$t1,$t3,lsr#6
-        ldr     $t2,[sp,#`$Xoff+8*(16-9)`+0]
-        eor     $t0,$t0,$t3,lsl#26
-        ldr     $t3,[sp,#`$Xoff+8*(16-9)`+4]
-        adds    $Tlo,$Tlo,$t0
-        ldr     $t0,[sp,#`$Xoff+8*16`+0]
-        adc     $Thi,$Thi,$t1
-        ldr     $t1,[sp,#`$Xoff+8*16`+4]
-        adds    $Tlo,$Tlo,$t2
-        adc     $Thi,$Thi,$t3
-        adds    $Tlo,$Tlo,$t0
-        adc     $Thi,$Thi,$t1
-___
-        &BODY_00_15(0x17);     # round body for rounds 16-79; 0x17 is the low byte of the final K512 constant (0x...4a475817)
-$code.=<<___;
-        ldreq   $t0,[sp,#`$Xoff+8*(16-1)`+0]
-        ldreq   $t1,[sp,#`$Xoff+8*(16-1)`+4]
-        beq     .L16_79
-        bic     $Ktbl,$Ktbl,#1
-        ldr     $Tlo,[sp,#$Boff+0]
-        ldr     $Thi,[sp,#$Boff+4]
-        ldr     $t0, [$ctx,#$Aoff+$lo]
-        ldr     $t1, [$ctx,#$Aoff+$hi]
-        ldr     $t2, [$ctx,#$Boff+$lo]
-        ldr     $t3, [$ctx,#$Boff+$hi]
-        adds    $t0,$Alo,$t0
-        str     $t0, [$ctx,#$Aoff+$lo]
-        adc     $t1,$Ahi,$t1
-        str     $t1, [$ctx,#$Aoff+$hi]
-        adds    $t2,$Tlo,$t2
-        str     $t2, [$ctx,#$Boff+$lo]
-        adc     $t3,$Thi,$t3
-        str     $t3, [$ctx,#$Boff+$hi]
-        ldr     $Alo,[sp,#$Coff+0]
-        ldr     $Ahi,[sp,#$Coff+4]
-        ldr     $Tlo,[sp,#$Doff+0]
-        ldr     $Thi,[sp,#$Doff+4]
-        ldr     $t0, [$ctx,#$Coff+$lo]
-        ldr     $t1, [$ctx,#$Coff+$hi]
-        ldr     $t2, [$ctx,#$Doff+$lo]
-        ldr     $t3, [$ctx,#$Doff+$hi]
-        adds    $t0,$Alo,$t0
-        str     $t0, [$ctx,#$Coff+$lo]
-        adc     $t1,$Ahi,$t1
-        str     $t1, [$ctx,#$Coff+$hi]
-        adds    $t2,$Tlo,$t2
-        str     $t2, [$ctx,#$Doff+$lo]
-        adc     $t3,$Thi,$t3
-        str     $t3, [$ctx,#$Doff+$hi]
-        ldr     $Tlo,[sp,#$Foff+0]
-        ldr     $Thi,[sp,#$Foff+4]
-        ldr     $t0, [$ctx,#$Eoff+$lo]
-        ldr     $t1, [$ctx,#$Eoff+$hi]
-        ldr     $t2, [$ctx,#$Foff+$lo]
-        ldr     $t3, [$ctx,#$Foff+$hi]
-        adds    $Elo,$Elo,$t0
-        str     $Elo,[$ctx,#$Eoff+$lo]
-        adc     $Ehi,$Ehi,$t1
-        str     $Ehi,[$ctx,#$Eoff+$hi]
-        adds    $t2,$Tlo,$t2
-        str     $t2, [$ctx,#$Foff+$lo]
-        adc     $t3,$Thi,$t3
-        str     $t3, [$ctx,#$Foff+$hi]
-        ldr     $Alo,[sp,#$Goff+0]
-        ldr     $Ahi,[sp,#$Goff+4]
-        ldr     $Tlo,[sp,#$Hoff+0]
-        ldr     $Thi,[sp,#$Hoff+4]
-        ldr     $t0, [$ctx,#$Goff+$lo]
-        ldr     $t1, [$ctx,#$Goff+$hi]
-        ldr     $t2, [$ctx,#$Hoff+$lo]
-        ldr     $t3, [$ctx,#$Hoff+$hi]
-        adds    $t0,$Alo,$t0
-        str     $t0, [$ctx,#$Goff+$lo]
-        adc     $t1,$Ahi,$t1
-        str     $t1, [$ctx,#$Goff+$hi]
-        adds    $t2,$Tlo,$t2
-        str     $t2, [$ctx,#$Hoff+$lo]
-        adc     $t3,$Thi,$t3
-        str     $t3, [$ctx,#$Hoff+$hi]
-        add     sp,sp,#640
-        sub     $Ktbl,$Ktbl,#640
-        teq     $inp,$len
-        bne     .Loop
-        add     sp,sp,#8*9              @ destroy frame
-#if __ARM_ARCH__>=5
-        ldmia   sp!,{r4-r12,pc}
-#else
-        ldmia   sp!,{r4-r12,lr}
-        tst     lr,#1
-        moveq   pc,lr                   @ be binary compatible with V4, yet
-        bx      lr                      @ interoperable with Thumb ISA:-)
-#endif
-___
-{
-my @Sigma0=(28,34,39);         # rotate amounts of Sigma0, applied to a
-my @Sigma1=(14,18,41);         # rotate amounts of Sigma1, applied to e
-my @sigma0=(1, 8, 7);          # sigma0: two rotates, then a plain right shift by the third value
-my @sigma1=(19,61,6);          # sigma1: two rotates, then a plain right shift by the third value
-my $Ktbl="r3";                 # lexical that shadows the r14-based $Ktbl inside this block; the NEON path keeps the K512 pointer in r3
-my $cnt="r12";  # volatile register known as ip, intra-procedure-call scratch
-my @X=map("d$_",(0..15));      # d0-d15 hold the 16 message-schedule words
-my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));   # d16-d23 hold a..h; also sets the globals $A..$H used in the vldmia/vstmia register ranges
-sub NEON_00_15() {     # Append the NEON code for one round to $code. Arguments: round index, then the d-register names currently acting as a..h.
-my $i=shift;           # round index; interpolated into the emitted "#if $i<16" lines and used to select @X[$i%16]
-my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;       # register names for this round's working variables
-my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));   # temps d24-d30; the eighth name produced by the map (d31) is not assigned
-$code.=<<___ if ($i<16 || $i&1);       # emitted for rounds 0-15 and for odd rounds only; even rounds >= 16 get these three vshr from NEON_16_79
-        vshr.u64        $t0,$e,#@Sigma1[0]      @ $i
-#if $i<16
-        vld1.64         {@X[$i%16]},[$inp]!     @ handles unaligned
-#endif
-        vshr.u64        $t1,$e,#@Sigma1[1]
-        vshr.u64        $t2,$e,#@Sigma1[2]
-___
-$code.=<<___;          # rest of the round: T1 = K[i] + h + Sigma1(e) + Ch(e,f,g) + X[i]; h = Sigma0(a) + T1 + Maj(a,b,c); d += T1
-        vld1.64         {$K},[$Ktbl,:64]!       @ K[i++]
-        vsli.64         $t0,$e,#`64-@Sigma1[0]`
-        vsli.64         $t1,$e,#`64-@Sigma1[1]`
-        vsli.64         $t2,$e,#`64-@Sigma1[2]`
-#if $i<16 && defined(__ARMEL__)
-        vrev64.8        @X[$i],@X[$i]
-#endif
-        vadd.i64        $T1,$K,$h
-        veor            $Ch,$f,$g
-        veor            $t0,$t1
-        vand            $Ch,$e
-        veor            $t0,$t2                 @ Sigma1(e)
-        veor            $Ch,$g                  @ Ch(e,f,g)
-        vadd.i64        $T1,$t0
-        vshr.u64        $t0,$a,#@Sigma0[0]
-        vadd.i64        $T1,$Ch
-        vshr.u64        $t1,$a,#@Sigma0[1]
-        vshr.u64        $t2,$a,#@Sigma0[2]
-        vsli.64         $t0,$a,#`64-@Sigma0[0]`
-        vsli.64         $t1,$a,#`64-@Sigma0[1]`
-        vsli.64         $t2,$a,#`64-@Sigma0[2]`
-        vadd.i64        $T1,@X[$i%16]
-        vorr            $Maj,$a,$c
-        vand            $Ch,$a,$c
-        veor            $h,$t0,$t1
-        vand            $Maj,$b
-        veor            $h,$t2                  @ Sigma0(a)
-        vorr            $Maj,$Ch                @ Maj(a,b,c)
-        vadd.i64        $h,$T1
-        vadd.i64        $d,$T1
-        vadd.i64        $h,$Maj
-___
-}      # end of NEON_00_15; the new a is left in the register passed as h, and callers rotate their register list to match
-sub NEON_16_79() {     # Append the code for one of rounds 16..79: even rounds first update two schedule words at once, then every round runs NEON_00_15.
-my $i=shift;           # round index as passed by the caller
-if ($i&1)       { &NEON_00_15($i,@_); return; }        # odd round: the schedule pair was already updated on the preceding even round
-# 2x-vectorized, therefore runs every 2nd round
-my @X=map("q$_",(0..7));                        # view @X as 128-bit vector
-my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));     # temps
-my ($d0,$d1,$d2) = map("d$_",(24..26));         # temps from NEON_00_15
-my $e=@_[4];                                    # $e from NEON_00_15; NOTE(review): one-element slice, yields the same value as $_[4]
-$i /= 2;               # from here on $i indexes the q-register view of the schedule
-$code.=<<___;          # X pair += sigma1(pair at index i+7) + sigma0(X[i+1]) + X[i+9]; the $d0-$d2 vshr lines start Sigma1(e) for the round that follows
-        vshr.u64        $t0,@X[($i+7)%8],#@sigma1[0]
-        vshr.u64        $t1,@X[($i+7)%8],#@sigma1[1]
-        vshr.u64        $s1,@X[($i+7)%8],#@sigma1[2]
-        vsli.64         $t0,@X[($i+7)%8],#`64-@sigma1[0]`
-        vext.8          $s0,@X[$i%8],@X[($i+1)%8],#8    @ X[i+1]
-        vsli.64         $t1,@X[($i+7)%8],#`64-@sigma1[1]`
-        veor            $s1,$t0
-        vshr.u64        $t0,$s0,#@sigma0[0]
-        veor            $s1,$t1                         @ sigma1(X[i+14])
-        vshr.u64        $t1,$s0,#@sigma0[1]
-        vadd.i64        @X[$i%8],$s1
-        vshr.u64        $s1,$s0,#@sigma0[2]
-        vsli.64         $t0,$s0,#`64-@sigma0[0]`
-        vsli.64         $t1,$s0,#`64-@sigma0[1]`
-        vext.8          $s0,@X[($i+4)%8],@X[($i+5)%8],#8        @ X[i+9]
-        veor            $s1,$t0
-        vshr.u64        $d0,$e,#@Sigma1[0]              @ from NEON_00_15
-        vadd.i64        @X[$i%8],$s0
-        vshr.u64        $d1,$e,#@Sigma1[1]              @ from NEON_00_15
-        veor            $s1,$t1                         @ sigma0(X[i+1])
-        vshr.u64        $d2,$e,#@Sigma1[2]              @ from NEON_00_15
-        vadd.i64        @X[$i%8],$s1
-___
-        &NEON_00_15(2*$i,@_);  # 2*$i restores the even round number, so NEON_00_15 skips the vshr lines already emitted above
-}
-$code.=<<___;
-#if __ARM_ARCH__>=7
-.fpu    neon
-.align  4
-.LNEON:
-        dmb                             @ errata #451034 on early Cortex A8
-        vstmdb  sp!,{d8-d15}            @ ABI specification says so
-        sub     $Ktbl,r3,#672           @ K512
-        vldmia  $ctx,{$A-$H}            @ load context
-.Loop_neon:
-___
-for($i=0;$i<16;$i++)    { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }   # rounds 0-15, fully unrolled; rotating @V renames a..h instead of moving data between registers
-$code.=<<___;
-        mov             $cnt,#4
-.L16_79_neon:
-        subs            $cnt,#1
-___
-for(;$i<32;$i++)        { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }   # 16 unrolled rounds form the .L16_79_neon body, which the emitted counter runs 4 times
-$code.=<<___;
-        bne             .L16_79_neon
-        vldmia          $ctx,{d24-d31}  @ load context to temp
-        vadd.i64        q8,q12          @ vectorized accumulate
-        vadd.i64        q9,q13
-        vadd.i64        q10,q14
-        vadd.i64        q11,q15
-        vstmia          $ctx,{$A-$H}    @ save context
-        teq             $inp,$len
-        sub             $Ktbl,#640      @ rewind K512
-        bne             .Loop_neon
-        vldmia  sp!,{d8-d15}            @ epilogue
-        bx      lr
-#endif
-___
-}
-$code.=<<___;
-.size   sha512_block_data_order,.-sha512_block_data_order
-.asciz  "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
-.align  2
-.comm   OPENSSL_armcap_P,4,4
-___
-$code =~ s/\`([^\`]*)\`/eval $1/gem;   # replace every backtick-quoted span in $code with the result of evaluating it as Perl
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4 (each "bx lr" becomes this raw word)
-print $code;           # write the generated assembly to STDOUT
-close STDOUT; # enforce flush

diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl deleted file mode 100644 index 7faf37b147..0000000000 --- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl +++ /dev/null
@@ -1,582 +0,0 @@
1	#!/usr/bin/env perl
2
3	# ====================================================================
4	# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5	# project. The module is, however, dual licensed under OpenSSL and
6	# CRYPTOGAMS licenses depending on where you obtain it. For further
7	# details see http://www.openssl.org/~appro/cryptogams/.
8	# ====================================================================
9
10	# SHA512 block procedure for ARMv4. September 2007.
11
12	# This code is ~4.5 (four and a half) times faster than code generated
13	# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14	# Xscale PXA250 core].
15	#
16	# July 2010.
17	#
18	# Rescheduling for dual-issue pipeline resulted in 6% improvement on
19	# Cortex A8 core and ~40 cycles per processed byte.
20
21	# February 2011.
22	#
23	# Profiler-assisted and platform-specific optimization resulted in 7%
24	# improvement on Cortex A8 core and ~38 cycles per byte.
25
26	# March 2011.
27	#
28	# Add NEON implementation. On Cortex A8 it was measured to process
29	# one byte in 25.5 cycles or 47% faster than integer-only code.
30
31	# Byte order [in]dependence. =========================================
32	#
33	# Originally caller was expected to maintain specific dword order in
34	# h[0-7], namely with most significant dword at lower address, which
35	# was reflected in below two parameters as 0 and 4. Now caller is
36	# expected to maintain native byte order for whole 64-bit values.
37	$hi="HI"; # name of the cpp macro the emitted prologue defines as the byte offset of the high 32-bit half
38	$lo="LO"; # likewise for the low half; the prologue picks 0/4 or 4/0 depending on __ARMEL__
39	# ====================================================================
40
41	while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} # discard arguments until one looks like "name.ext" (or the list runs out)
42	open STDOUT,">$output"; # later print output goes to that file; NOTE(review): the open result is not checked
43
44	$ctx="r0"; # parameter block: the eight 64-bit state words are loaded from and stored back through it
45	$inp="r1"; # input pointer, post-incremented by 8 for every 64-bit word read
46	$len="r2"; # turned into the end-of-input address (inp + len*128) on entry
47
48	$Tlo="r3"; # T, low half: holds X[i] and then the accumulated round sum
49	$Thi="r4"; # T, high half
50	$Alo="r5"; # working variable a, low half
51	$Ahi="r6"; # working variable a, high half
52	$Elo="r7"; # working variable e, low half
53	$Ehi="r8"; # working variable e, high half
54	$t0="r9"; # scratch
55	$t1="r10"; # scratch
56	$t2="r11"; # scratch
57	$t3="r12"; # scratch
58	############ r13 is stack pointer
59	$Ktbl="r14"; # pointer into K512; bit 0 is borrowed as an "end of round group" flag (set by orreq, cleared by bic)
60	############ r15 is program counter
61
62	$Aoff=8*0; # byte offsets of a..h, 8 bytes each; used both for the context and for the stack frame
63	$Boff=8*1;
64	$Coff=8*2;
65	$Doff=8*3;
66	$Eoff=8*4;
67	$Foff=8*5;
68	$Goff=8*6;
69	$Hoff=8*7;
70	$Xoff=8*8; # offset of the slot where each round stores its message word X[i], directly above h
71
72	sub BODY_00_15() { # Append the integer-only ARM code for one SHA-512 round to $code; callers use the &name(...) form, which bypasses the empty prototype.
73	my $magic = shift; # low byte of K[i].lo that identifies the last round of the group; on a match the emitted code sets bit 0 of $Ktbl
74	$code.=<<___; # per round: T += Sigma1(e) + h + Ch(e,f,g) + K[i]; d += T; new a = T + Sigma0(a) + Maj(a,b,c); then sp -= 8 and $Ktbl += 8
75	@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
76	@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
77	@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
78	mov $t0,$Elo,lsr#14
79	str $Tlo,[sp,#$Xoff+0]
80	mov $t1,$Ehi,lsr#14
81	str $Thi,[sp,#$Xoff+4]
82	eor $t0,$t0,$Ehi,lsl#18
83	ldr $t2,[sp,#$Hoff+0] @ h.lo
84	eor $t1,$t1,$Elo,lsl#18
85	ldr $t3,[sp,#$Hoff+4] @ h.hi
86	eor $t0,$t0,$Elo,lsr#18
87	eor $t1,$t1,$Ehi,lsr#18
88	eor $t0,$t0,$Ehi,lsl#14
89	eor $t1,$t1,$Elo,lsl#14
90	eor $t0,$t0,$Ehi,lsr#9
91	eor $t1,$t1,$Elo,lsr#9
92	eor $t0,$t0,$Elo,lsl#23
93	eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
94	adds $Tlo,$Tlo,$t0
95	ldr $t0,[sp,#$Foff+0] @ f.lo
96	adc $Thi,$Thi,$t1 @ T += Sigma1(e)
97	ldr $t1,[sp,#$Foff+4] @ f.hi
98	adds $Tlo,$Tlo,$t2
99	ldr $t2,[sp,#$Goff+0] @ g.lo
100	adc $Thi,$Thi,$t3 @ T += h
101	ldr $t3,[sp,#$Goff+4] @ g.hi
102
103	eor $t0,$t0,$t2
104	str $Elo,[sp,#$Eoff+0]
105	eor $t1,$t1,$t3
106	str $Ehi,[sp,#$Eoff+4]
107	and $t0,$t0,$Elo
108	str $Alo,[sp,#$Aoff+0]
109	and $t1,$t1,$Ehi
110	str $Ahi,[sp,#$Aoff+4]
111	eor $t0,$t0,$t2
112	ldr $t2,[$Ktbl,#$lo] @ K[i].lo
113	eor $t1,$t1,$t3 @ Ch(e,f,g)
114	ldr $t3,[$Ktbl,#$hi] @ K[i].hi
115
116	adds $Tlo,$Tlo,$t0
117	ldr $Elo,[sp,#$Doff+0] @ d.lo
118	adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
119	ldr $Ehi,[sp,#$Doff+4] @ d.hi
120	adds $Tlo,$Tlo,$t2
121	and $t0,$t2,#0xff
122	adc $Thi,$Thi,$t3 @ T += K[i]
123	adds $Elo,$Elo,$Tlo
124	ldr $t2,[sp,#$Boff+0] @ b.lo
125	adc $Ehi,$Ehi,$Thi @ d += T
126	teq $t0,#$magic
127
128	ldr $t3,[sp,#$Coff+0] @ c.lo
129	orreq $Ktbl,$Ktbl,#1
130	@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
131	@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
132	@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
133	mov $t0,$Alo,lsr#28
134	mov $t1,$Ahi,lsr#28
135	eor $t0,$t0,$Ahi,lsl#4
136	eor $t1,$t1,$Alo,lsl#4
137	eor $t0,$t0,$Ahi,lsr#2
138	eor $t1,$t1,$Alo,lsr#2
139	eor $t0,$t0,$Alo,lsl#30
140	eor $t1,$t1,$Ahi,lsl#30
141	eor $t0,$t0,$Ahi,lsr#7
142	eor $t1,$t1,$Alo,lsr#7
143	eor $t0,$t0,$Alo,lsl#25
144	eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
145	adds $Tlo,$Tlo,$t0
146	and $t0,$Alo,$t2
147	adc $Thi,$Thi,$t1 @ T += Sigma0(a)
148
149	ldr $t1,[sp,#$Boff+4] @ b.hi
150	orr $Alo,$Alo,$t2
151	ldr $t2,[sp,#$Coff+4] @ c.hi
152	and $Alo,$Alo,$t3
153	and $t3,$Ahi,$t1
154	orr $Ahi,$Ahi,$t1
155	orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
156	and $Ahi,$Ahi,$t2
157	adds $Alo,$Alo,$Tlo
158	orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
159	sub sp,sp,#8
160	adc $Ahi,$Ahi,$Thi @ h += T
161	tst $Ktbl,#1
162	add $Ktbl,$Ktbl,#8
163	___
164	} # end of BODY_00_15; the emitted tst leaves the "last round" test in the flags and the following add does not change them
165	$code=<<___;
166	#include "arm_arch.h"
167	#ifdef __ARMEL__
168	# define LO 0
169	# define HI 4
170	# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
171	#else
172	# define HI 0
173	# define LO 4
174	# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
175	#endif
176
177	.text
178	.code 32
179	.type K512,%object
180	.align 5
181	K512:
182	WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
183	WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
184	WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
185	WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
186	WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
187	WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
188	WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
189	WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
190	WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
191	WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
192	WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
193	WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
194	WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
195	WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
196	WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
197	WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
198	WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
199	WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
200	WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
201	WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
202	WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
203	WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
204	WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
205	WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
206	WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
207	WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
208	WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
209	WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
210	WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
211	WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
212	WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
213	WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
214	WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
215	WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
216	WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
217	WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
218	WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
219	WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
220	WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
221	WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
222	.size K512,.-K512
223	.LOPENSSL_armcap:
224	.word OPENSSL_armcap_P-sha512_block_data_order
225	.skip 32-4
226
227	.global sha512_block_data_order
228	.type sha512_block_data_order,%function
229	sha512_block_data_order:
230	sub r3,pc,#8 @ sha512_block_data_order
231	add $len,$inp,$len,lsl#7 @ len to point at the end of inp
232	#if __ARM_ARCH__>=7
233	ldr r12,.LOPENSSL_armcap
234	ldr r12,[r3,r12] @ OPENSSL_armcap_P
235	tst r12,#1
236	bne .LNEON
237	#endif
238	stmdb sp!,{r4-r12,lr}
239	sub $Ktbl,r3,#672 @ K512
240	sub sp,sp,#9*8
241
242	ldr $Elo,[$ctx,#$Eoff+$lo]
243	ldr $Ehi,[$ctx,#$Eoff+$hi]
244	ldr $t0, [$ctx,#$Goff+$lo]
245	ldr $t1, [$ctx,#$Goff+$hi]
246	ldr $t2, [$ctx,#$Hoff+$lo]
247	ldr $t3, [$ctx,#$Hoff+$hi]
248	.Loop:
249	str $t0, [sp,#$Goff+0]
250	str $t1, [sp,#$Goff+4]
251	str $t2, [sp,#$Hoff+0]
252	str $t3, [sp,#$Hoff+4]
253	ldr $Alo,[$ctx,#$Aoff+$lo]
254	ldr $Ahi,[$ctx,#$Aoff+$hi]
255	ldr $Tlo,[$ctx,#$Boff+$lo]
256	ldr $Thi,[$ctx,#$Boff+$hi]
257	ldr $t0, [$ctx,#$Coff+$lo]
258	ldr $t1, [$ctx,#$Coff+$hi]
259	ldr $t2, [$ctx,#$Doff+$lo]
260	ldr $t3, [$ctx,#$Doff+$hi]
261	str $Tlo,[sp,#$Boff+0]
262	str $Thi,[sp,#$Boff+4]
263	str $t0, [sp,#$Coff+0]
264	str $t1, [sp,#$Coff+4]
265	str $t2, [sp,#$Doff+0]
266	str $t3, [sp,#$Doff+4]
267	ldr $Tlo,[$ctx,#$Foff+$lo]
268	ldr $Thi,[$ctx,#$Foff+$hi]
269	str $Tlo,[sp,#$Foff+0]
270	str $Thi,[sp,#$Foff+4]
271
272	.L00_15:
273	#if __ARM_ARCH__<7
274	ldrb $Tlo,[$inp,#7]
275	ldrb $t0, [$inp,#6]
276	ldrb $t1, [$inp,#5]
277	ldrb $t2, [$inp,#4]
278	ldrb $Thi,[$inp,#3]
279	ldrb $t3, [$inp,#2]
280	orr $Tlo,$Tlo,$t0,lsl#8
281	ldrb $t0, [$inp,#1]
282	orr $Tlo,$Tlo,$t1,lsl#16
283	ldrb $t1, [$inp],#8
284	orr $Tlo,$Tlo,$t2,lsl#24
285	orr $Thi,$Thi,$t3,lsl#8
286	orr $Thi,$Thi,$t0,lsl#16
287	orr $Thi,$Thi,$t1,lsl#24
288	#else
289	ldr $Tlo,[$inp,#4]
290	ldr $Thi,[$inp],#8
291	#ifdef __ARMEL__
292	rev $Tlo,$Tlo
293	rev $Thi,$Thi
294	#endif
295	#endif
296	___
297	&BODY_00_15(0x94); # round body for rounds 0-15; 0x94 is the low byte of the 16th K512 constant (0x...cf692694)
298	$code.=<<___;
299	tst $Ktbl,#1
300	beq .L00_15
301	ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
302	ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
303	bic $Ktbl,$Ktbl,#1
304	.L16_79:
305	@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
306	@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
307	@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
308	mov $Tlo,$t0,lsr#1
309	ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
310	mov $Thi,$t1,lsr#1
311	ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
312	eor $Tlo,$Tlo,$t1,lsl#31
313	eor $Thi,$Thi,$t0,lsl#31
314	eor $Tlo,$Tlo,$t0,lsr#8
315	eor $Thi,$Thi,$t1,lsr#8
316	eor $Tlo,$Tlo,$t1,lsl#24
317	eor $Thi,$Thi,$t0,lsl#24
318	eor $Tlo,$Tlo,$t0,lsr#7
319	eor $Thi,$Thi,$t1,lsr#7
320	eor $Tlo,$Tlo,$t1,lsl#25
321
322	@ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
323	@ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
324	@ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
325	mov $t0,$t2,lsr#19
326	mov $t1,$t3,lsr#19
327	eor $t0,$t0,$t3,lsl#13
328	eor $t1,$t1,$t2,lsl#13
329	eor $t0,$t0,$t3,lsr#29
330	eor $t1,$t1,$t2,lsr#29
331	eor $t0,$t0,$t2,lsl#3
332	eor $t1,$t1,$t3,lsl#3
333	eor $t0,$t0,$t2,lsr#6
334	eor $t1,$t1,$t3,lsr#6
335	ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
336	eor $t0,$t0,$t3,lsl#26
337
338	ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
339	adds $Tlo,$Tlo,$t0
340	ldr $t0,[sp,#`$Xoff+8*16`+0]
341	adc $Thi,$Thi,$t1
342
343	ldr $t1,[sp,#`$Xoff+8*16`+4]
344	adds $Tlo,$Tlo,$t2
345	adc $Thi,$Thi,$t3
346	adds $Tlo,$Tlo,$t0
347	adc $Thi,$Thi,$t1
348	___
349	&BODY_00_15(0x17); # round body for rounds 16-79; 0x17 is the low byte of the final K512 constant (0x...4a475817)
350	$code.=<<___;
351	ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
352	ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
353	beq .L16_79
354	bic $Ktbl,$Ktbl,#1
355
356	ldr $Tlo,[sp,#$Boff+0]
357	ldr $Thi,[sp,#$Boff+4]
358	ldr $t0, [$ctx,#$Aoff+$lo]
359	ldr $t1, [$ctx,#$Aoff+$hi]
360	ldr $t2, [$ctx,#$Boff+$lo]
361	ldr $t3, [$ctx,#$Boff+$hi]
362	adds $t0,$Alo,$t0
363	str $t0, [$ctx,#$Aoff+$lo]
364	adc $t1,$Ahi,$t1
365	str $t1, [$ctx,#$Aoff+$hi]
366	adds $t2,$Tlo,$t2
367	str $t2, [$ctx,#$Boff+$lo]
368	adc $t3,$Thi,$t3
369	str $t3, [$ctx,#$Boff+$hi]
370
371	ldr $Alo,[sp,#$Coff+0]
372	ldr $Ahi,[sp,#$Coff+4]
373	ldr $Tlo,[sp,#$Doff+0]
374	ldr $Thi,[sp,#$Doff+4]
375	ldr $t0, [$ctx,#$Coff+$lo]
376	ldr $t1, [$ctx,#$Coff+$hi]
377	ldr $t2, [$ctx,#$Doff+$lo]
378	ldr $t3, [$ctx,#$Doff+$hi]
379	adds $t0,$Alo,$t0
380	str $t0, [$ctx,#$Coff+$lo]
381	adc $t1,$Ahi,$t1
382	str $t1, [$ctx,#$Coff+$hi]
383	adds $t2,$Tlo,$t2
384	str $t2, [$ctx,#$Doff+$lo]
385	adc $t3,$Thi,$t3
386	str $t3, [$ctx,#$Doff+$hi]
387
388	ldr $Tlo,[sp,#$Foff+0]
389	ldr $Thi,[sp,#$Foff+4]
390	ldr $t0, [$ctx,#$Eoff+$lo]
391	ldr $t1, [$ctx,#$Eoff+$hi]
392	ldr $t2, [$ctx,#$Foff+$lo]
393	ldr $t3, [$ctx,#$Foff+$hi]
394	adds $Elo,$Elo,$t0
395	str $Elo,[$ctx,#$Eoff+$lo]
396	adc $Ehi,$Ehi,$t1
397	str $Ehi,[$ctx,#$Eoff+$hi]
398	adds $t2,$Tlo,$t2
399	str $t2, [$ctx,#$Foff+$lo]
400	adc $t3,$Thi,$t3
401	str $t3, [$ctx,#$Foff+$hi]
402
403	ldr $Alo,[sp,#$Goff+0]
404	ldr $Ahi,[sp,#$Goff+4]
405	ldr $Tlo,[sp,#$Hoff+0]
406	ldr $Thi,[sp,#$Hoff+4]
407	ldr $t0, [$ctx,#$Goff+$lo]
408	ldr $t1, [$ctx,#$Goff+$hi]
409	ldr $t2, [$ctx,#$Hoff+$lo]
410	ldr $t3, [$ctx,#$Hoff+$hi]
411	adds $t0,$Alo,$t0
412	str $t0, [$ctx,#$Goff+$lo]
413	adc $t1,$Ahi,$t1
414	str $t1, [$ctx,#$Goff+$hi]
415	adds $t2,$Tlo,$t2
416	str $t2, [$ctx,#$Hoff+$lo]
417	adc $t3,$Thi,$t3
418	str $t3, [$ctx,#$Hoff+$hi]
419
420	add sp,sp,#640
421	sub $Ktbl,$Ktbl,#640
422
423	teq $inp,$len
424	bne .Loop
425
426	add sp,sp,#8*9 @ destroy frame
427	#if __ARM_ARCH__>=5
428	ldmia sp!,{r4-r12,pc}
429	#else
430	ldmia sp!,{r4-r12,lr}
431	tst lr,#1
432	moveq pc,lr @ be binary compatible with V4, yet
433	bx lr @ interoperable with Thumb ISA:-)
434	#endif
435	___
436
437	{
# Shift amounts used by the NEON round generators below. For the upper-case
# tables all three entries are applied as rotations (vshr followed by vsli);
# for the lower-case tables the first two are rotations and the third is a
# plain right shift (vshr only).
my @Sigma0 = (28, 34, 39);
my @Sigma1 = (14, 18, 41);
my @sigma0 = ( 1,  8,  7);
my @sigma1 = (19, 61,  6);

# Scalar registers used by the NEON path; these shadow any outer $Ktbl/$cnt
# for the remainder of the enclosing scope.
my $Ktbl = "r3";
my $cnt  = "r12";	# volatile register known as ip, intra-procedure-call scratch

# d0-d15 hold the message schedule, d16-d23 the eight working variables.
# $A..$H are assigned as (non-lexical) variables so the heredocs further down
# can name individual registers; @V carries the same eight names in order.
my @X = map { "d$_" } 0 .. 15;
($A, $B, $C, $D, $E, $F, $G, $H) = map { "d$_" } 16 .. 23;
my @V = ($A, $B, $C, $D, $E, $F, $G, $H);
# NEON_00_15($i, $a, $b, $c, $d, $e, $f, $g, $h)
#
# Appends the NEON instructions for round $i to the global $code.
#
#   $i       round number; selects the schedule register X[$i%16] and decides
#            whether the Sigma1 prologue is emitted here (see below)
#   $a..$h   names of the d-registers currently playing the roles a..h
#
# The emitted round computes
#   T1 = K[i] + h + Sigma1(e) + Ch(e,f,g) + X[i%16]
#   d += T1
#   h  = Sigma0(a) + T1 + Maj(a,b,c)
# with each rotation built from a vshr/vsli pair. Nothing is returned; the
# only effect is the text appended to $code. Backquoted expressions are left
# for the final s/`...`/eval/ pass over $code.
#
# The sub is declared without a prototype: it takes nine arguments, and an
# empty "()" prototype would reject every call not written as &NEON_00_15(...).
sub NEON_00_15 {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
# Seven temporaries out of d24..d31 (d31 is left unused).
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

# Sigma1 prologue (plus the input load for the first 16 rounds). It is skipped
# for even rounds >= 16 because NEON_16_79 emits those three vshr itself,
# interleaved with the message-schedule update.
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
vshr.u64 $t2,$e,#@Sigma1[2]
___
# Round body proper. Note that "vrev64.8 @X[$i]" indexes past the end of @X for
# $i >= 16; the resulting text sits inside "#if $i<16 && ..." and is therefore
# dropped by the preprocessor for those rounds.
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
vadd.i64 $T1,$K,$h
veor $Ch,$f,$g
veor $t0,$t1
vand $Ch,$e
veor $t0,$t2 @ Sigma1(e)
veor $Ch,$g @ Ch(e,f,g)
vadd.i64 $T1,$t0
vshr.u64 $t0,$a,#@Sigma0[0]
vadd.i64 $T1,$Ch
vshr.u64 $t1,$a,#@Sigma0[1]
vshr.u64 $t2,$a,#@Sigma0[2]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vsli.64 $t1,$a,#`64-@Sigma0[1]`
vsli.64 $t2,$a,#`64-@Sigma0[2]`
vadd.i64 $T1,@X[$i%16]
vorr $Maj,$a,$c
vand $Ch,$a,$c
veor $h,$t0,$t1
vand $Maj,$b
veor $h,$t2 @ Sigma0(a)
vorr $Maj,$Ch @ Maj(a,b,c)
vadd.i64 $h,$T1
vadd.i64 $d,$T1
vadd.i64 $h,$Maj
___
}
496
# NEON_16_79($i, $a, $b, $c, $d, $e, $f, $g, $h)
#
# Appends the NEON instructions for round $i (16 and up) to the global $code.
#
#   $i       round number
#   $a..$h   d-register names in their current roles, forwarded unchanged to
#            NEON_00_15
#
# Odd rounds are delegated to NEON_00_15 as-is. Even rounds first emit a
# message-schedule update that works on q-registers, i.e. two 64-bit schedule
# words at a time, which is why it only runs every second round:
#   X[i] += sigma1(X[i+14]) + X[i+9] + sigma0(X[i+1])
# The three "from NEON_00_15" vshr instructions are the Sigma1 prologue of the
# following round body, emitted here in between the schedule instructions;
# NEON_00_15 omits them for even rounds >= 16. Nothing is returned.
#
# Declared without a prototype because the sub takes nine arguments.
sub NEON_16_79 {
my $i=shift;

if ($i&1) { &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=$_[4]; # $e from NEON_00_15 (fifth register name after the shift)
my $j=$i/2; # index of the q-register holding X[$i] and X[$i+1]; $i is even here
$code.=<<___;
vshr.u64 $t0,@X[($j+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($j+7)%8],#@sigma1[1]
vshr.u64 $s1,@X[($j+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($j+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$j%8],@X[($j+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($j+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$j%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($j+4)%8],@X[($j+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$j%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$j%8],$s1
___
# Emit the round body itself; the round number is passed through unchanged.
&NEON_00_15($i,@_);
}
534
# NEON code path; everything appended from here to the closing "#endif" is
# guarded by "#if __ARM_ARCH__>=7". The prologue at .LNEON issues a dmb, saves
# d8-d15 on the stack, sets $Ktbl (r3 in this scope) to r3-672 (annotated
# "K512" below) and loads the context at $ctx into $A-$H. .Loop_neon marks the
# start of the per-block loop.
535	$code.=<<___;
536	#if __ARM_ARCH__>=7
537	.fpu neon
538
539	.align 4
540	.LNEON:
541	dmb @ errata #451034 on early Cortex A8
542	vstmdb sp!,{d8-d15} @ ABI specification says so
543	sub $Ktbl,r3,#672 @ K512
544	vldmia $ctx,{$A-$H} @ load context
545	.Loop_neon:
546	___
# Rounds 0..15: one NEON_00_15 body per round. After each round @V is rotated
# by one position (its last element is moved to the front), so every register
# name takes the next role (a..h) in the following round.
547	for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
# Load a count of 4 into $cnt and open the .L16_79_neon loop, whose first
# instruction decrements the counter and sets the flags.
548	$code.=<<___;
549	mov $cnt,#4
550	.L16_79_neon:
551	subs $cnt,#1
552	___
# Loop body: 16 rounds emitted through NEON_16_79. $i deliberately has no
# initializer here; it continues at 16 from the loop above and stops at 32.
553	for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
# Close the .L16_79_neon loop: the bne consumes the flags left by the "subs" at
# the top of the loop (NOTE(review): this assumes nothing emitted in between
# changes the condition flags). Afterwards the saved context is loaded into
# d24-d31, added to q8-q11 as q12-q15 (presumably the q-views of d16-d23 and
# d24-d31 -- confirm), and the result is stored back to $ctx. $Ktbl is moved
# back by 640 and the code loops to .Loop_neon until $inp equals $len; finally
# d8-d15 are restored and the function returns with "bx lr".
554	$code.=<<___;
555	bne .L16_79_neon
556
557	vldmia $ctx,{d24-d31} @ load context to temp
558	vadd.i64 q8,q12 @ vectorized accumulate
559	vadd.i64 q9,q13
560	vadd.i64 q10,q14
561	vadd.i64 q11,q15
562	vstmia $ctx,{$A-$H} @ save context
563	teq $inp,$len
564	sub $Ktbl,#640 @ rewind K512
565	bne .Loop_neon
566
567	vldmia sp!,{d8-d15} @ epilogue
568	bx lr
569	#endif
570	___
571	}
# Trailer: symbol size for sha512_block_data_order, an identification string,
# alignment, and the common symbol OPENSSL_armcap_P.
$code.=<<___;
.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.comm OPENSSL_armcap_P,4,4
___

# Replace every `...` span in the accumulated text with the value of the Perl
# expression it contains (used for offsets and shift counts).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# Emit "bx lr" as a raw .word instead of the mnemonic.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
print $code;
# Buffered write errors (full disk, closed pipe) only surface when the handle
# is closed, so a failed close must abort instead of silently leaving a
# truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush