Diffstat (limited to 'src/lib/libcrypto/modes/asm/ghash-armv4.pl')
-rw-r--r--	src/lib/libcrypto/modes/asm/ghash-armv4.pl	429
1 file changed, 429 insertions, 0 deletions
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
new file mode 100644
index 0000000000..d91586ee29
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
@@ -0,0 +1,429 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements the "4-bit" GCM GHASH function and the
# underlying single multiplication operation in GF(2^128). "4-bit"
# means that it uses a 256-byte per-key table [+32 bytes of shared
# table]. There is no experimental performance data available yet.
# The only approximation that can be made at this point is based on
# code size. The inner loop is 32 instructions long and on a
# single-issue core should execute in <40 cycles. Having verified that
# gcc 3.4 didn't unroll the corresponding loop, this assembler loop
# body was found to be ~3x smaller than the compiler-generated one...
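#
# For orientation, a rough sketch of the per-block algorithm in C-like
# pseudocode (simplified, not the exact gcm128.c code; Htbl[i] holds
# H times the 4-bit value i, and rem_4bit folds the nibble shifted out
# at the low end back into the top of Z):
#
#	Z = Htbl[Xi[15] & 0xf];			/* 128-bit */
#	for (each remaining nibble, last byte to first) {
#		rem   = Z & 0xf;
#		Z   >>= 4;
#		Z.hi ^= (u64)rem_4bit[rem] << 48;	/* reduce */
#		Z    ^= Htbl[nibble];
#	}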
#
# July 2010
#
# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.

# ====================================================================
# Note about "528B" variant. In the ARM case it makes less sense to
# implement it, for the following reasons:
#
# - the performance improvement won't be anywhere near 50%, because
#   the 128-bit shift operation is neatly fused with the 128-bit xor
#   here, and the "528B" variant would eliminate only 4-5 instructions
#   out of 32 in the inner loop (meaning that the estimated
#   improvement is ~15%);
# - ARM-based systems are often embedded ones and the extra memory
#   consumption might be unappreciated (for so little improvement);
#
# Byte order [in]dependence. =========================================
#
# The caller is expected to maintain a specific *dword* order in
# Htable, namely with the *least* significant dword of the 128-bit
# value at the *lower* address. This differs completely from the C
# code and has everything to do with the ldm instruction and the order
# in which dwords are "consumed" by the algorithm. The *byte* order
# within these dwords is in turn whatever the *native* byte order is
# on the current platform. See gcm128.c for a working example...
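#
# To illustrate with a made-up value: an Htable entry whose 128-bit
# value is 0x0123456789abcdef_fedcba9876543210 is expected as
#
#	entry+0:	0xfedcba9876543210	@ least significant dword
#	entry+8:	0x0123456789abcdef	@ most significant dword
#
# with each dword stored in native byte order.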
59 | |||
60 | while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} | ||
61 | open STDOUT,">$output"; | ||
62 | |||
63 | $Xi="r0"; # argument block | ||
64 | $Htbl="r1"; | ||
65 | $inp="r2"; | ||
66 | $len="r3"; | ||
67 | |||
68 | $Zll="r4"; # variables | ||
69 | $Zlh="r5"; | ||
70 | $Zhl="r6"; | ||
71 | $Zhh="r7"; | ||
72 | $Tll="r8"; | ||
73 | $Tlh="r9"; | ||
74 | $Thl="r10"; | ||
75 | $Thh="r11"; | ||
76 | $nlo="r12"; | ||
77 | ################# r13 is stack pointer | ||
78 | $nhi="r14"; | ||
79 | ################# r15 is program counter | ||
80 | |||
81 | $rem_4bit=$inp; # used in gcm_gmult_4bit | ||
82 | $cnt=$len; | ||
83 | |||
sub Zsmash() {
  my $i=12;
  my @args=@_;
  for ($Zll,$Zlh,$Zhl,$Zhh) {
    $code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$_,$_
	str	$_,[$Xi,#$i]
#elif defined(__ARMEB__)
	str	$_,[$Xi,#$i]
#else
	mov	$Tlh,$_,lsr#8
	strb	$_,[$Xi,#$i+3]
	mov	$Thl,$_,lsr#16
	strb	$Tlh,[$Xi,#$i+2]
	mov	$Thh,$_,lsr#24
	strb	$Thl,[$Xi,#$i+1]
	strb	$Thh,[$Xi,#$i]
#endif
___
    $code.="\t".shift(@args)."\n";
    $i-=4;
  }
}
108 | |||
109 | $code=<<___; | ||
110 | #include "arm_arch.h" | ||
111 | |||
112 | .text | ||
113 | .code 32 | ||
114 | |||
115 | .type rem_4bit,%object | ||
116 | .align 5 | ||
117 | rem_4bit: | ||
118 | .short 0x0000,0x1C20,0x3840,0x2460 | ||
119 | .short 0x7080,0x6CA0,0x48C0,0x54E0 | ||
120 | .short 0xE100,0xFD20,0xD940,0xC560 | ||
121 | .short 0x9180,0x8DA0,0xA9C0,0xB5E0 | ||
122 | .size rem_4bit,.-rem_4bit | ||
123 | |||
124 | .type rem_4bit_get,%function | ||
125 | rem_4bit_get: | ||
126 | sub $rem_4bit,pc,#8 | ||
127 | sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit | ||
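	@ pc reads 8 bytes ahead, so pc-8 is rem_4bit_get itself and
	@ the further -32 steps back over the 32-byte table above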
	b	.Lrem_4bit_got
	nop
.size	rem_4bit_get,.-rem_4bit_get
131 | |||
132 | .global gcm_ghash_4bit | ||
133 | .type gcm_ghash_4bit,%function | ||
134 | gcm_ghash_4bit: | ||
135 | sub r12,pc,#8 | ||
136 | add $len,$inp,$len @ $len to point at the end | ||
137 | stmdb sp!,{r3-r11,lr} @ save $len/end too | ||
138 | sub r12,r12,#48 @ &rem_4bit | ||
139 | |||
140 | ldmia r12,{r4-r11} @ copy rem_4bit ... | ||
141 | stmdb sp!,{r4-r11} @ ... to stack | ||
142 | |||
143 | ldrb $nlo,[$inp,#15] | ||
144 | ldrb $nhi,[$Xi,#15] | ||
145 | .Louter: | ||
146 | eor $nlo,$nlo,$nhi | ||
147 | and $nhi,$nlo,#0xf0 | ||
148 | and $nlo,$nlo,#0x0f | ||
149 | mov $cnt,#14 | ||
150 | |||
151 | add $Zhh,$Htbl,$nlo,lsl#4 | ||
152 | ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] | ||
153 | add $Thh,$Htbl,$nhi | ||
154 | ldrb $nlo,[$inp,#14] | ||
155 | |||
156 | and $nhi,$Zll,#0xf @ rem | ||
157 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
158 | add $nhi,$nhi,$nhi | ||
159 | eor $Zll,$Tll,$Zll,lsr#4 | ||
160 | ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] | ||
161 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
162 | ldrb $nhi,[$Xi,#14] | ||
163 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
164 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
165 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
166 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
167 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
168 | eor $nlo,$nlo,$nhi | ||
169 | and $nhi,$nlo,#0xf0 | ||
170 | and $nlo,$nlo,#0x0f | ||
171 | eor $Zhh,$Zhh,$Tll,lsl#16 | ||
172 | |||
173 | .Linner: | ||
174 | add $Thh,$Htbl,$nlo,lsl#4 | ||
175 | and $nlo,$Zll,#0xf @ rem | ||
176 | subs $cnt,$cnt,#1 | ||
177 | add $nlo,$nlo,$nlo | ||
178 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] | ||
179 | eor $Zll,$Tll,$Zll,lsr#4 | ||
180 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
181 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
182 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
183 | ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] | ||
184 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
185 | ldrplb $nlo,[$inp,$cnt] | ||
186 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
187 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
188 | |||
189 | add $Thh,$Htbl,$nhi | ||
190 | and $nhi,$Zll,#0xf @ rem | ||
191 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
192 | add $nhi,$nhi,$nhi | ||
193 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
194 | eor $Zll,$Tll,$Zll,lsr#4 | ||
195 | ldrplb $Tll,[$Xi,$cnt] | ||
196 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
197 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
198 | ldrh $Tlh,[sp,$nhi] | ||
199 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
200 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
201 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
202 | eorpl $nlo,$nlo,$Tll | ||
203 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
204 | andpl $nhi,$nlo,#0xf0 | ||
205 | andpl $nlo,$nlo,#0x0f | ||
206 | eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] | ||
207 | bpl .Linner | ||
208 | |||
209 | ldr $len,[sp,#32] @ re-load $len/end | ||
210 | add $inp,$inp,#16 | ||
211 | mov $nhi,$Zll | ||
212 | ___ | ||
&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
$code.=<<___;
	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit
227 | |||
228 | .global gcm_gmult_4bit | ||
229 | .type gcm_gmult_4bit,%function | ||
230 | gcm_gmult_4bit: | ||
231 | stmdb sp!,{r4-r11,lr} | ||
232 | ldrb $nlo,[$Xi,#15] | ||
233 | b rem_4bit_get | ||
234 | .Lrem_4bit_got: | ||
235 | and $nhi,$nlo,#0xf0 | ||
236 | and $nlo,$nlo,#0x0f | ||
237 | mov $cnt,#14 | ||
238 | |||
239 | add $Zhh,$Htbl,$nlo,lsl#4 | ||
240 | ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] | ||
241 | ldrb $nlo,[$Xi,#14] | ||
242 | |||
243 | add $Thh,$Htbl,$nhi | ||
244 | and $nhi,$Zll,#0xf @ rem | ||
245 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
246 | add $nhi,$nhi,$nhi | ||
247 | eor $Zll,$Tll,$Zll,lsr#4 | ||
248 | ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] | ||
249 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
250 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
251 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
252 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
253 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
254 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
255 | and $nhi,$nlo,#0xf0 | ||
256 | eor $Zhh,$Zhh,$Tll,lsl#16 | ||
257 | and $nlo,$nlo,#0x0f | ||
258 | |||
259 | .Loop: | ||
260 | add $Thh,$Htbl,$nlo,lsl#4 | ||
261 | and $nlo,$Zll,#0xf @ rem | ||
262 | subs $cnt,$cnt,#1 | ||
263 | add $nlo,$nlo,$nlo | ||
264 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] | ||
265 | eor $Zll,$Tll,$Zll,lsr#4 | ||
266 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
267 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
268 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
269 | ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] | ||
270 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
271 | ldrplb $nlo,[$Xi,$cnt] | ||
272 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
273 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
274 | |||
275 | add $Thh,$Htbl,$nhi | ||
276 | and $nhi,$Zll,#0xf @ rem | ||
277 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
278 | add $nhi,$nhi,$nhi | ||
279 | ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] | ||
280 | eor $Zll,$Tll,$Zll,lsr#4 | ||
281 | eor $Zll,$Zll,$Zlh,lsl#28 | ||
282 | eor $Zlh,$Tlh,$Zlh,lsr#4 | ||
283 | ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] | ||
284 | eor $Zlh,$Zlh,$Zhl,lsl#28 | ||
285 | eor $Zhl,$Thl,$Zhl,lsr#4 | ||
286 | eor $Zhl,$Zhl,$Zhh,lsl#28 | ||
287 | eor $Zhh,$Thh,$Zhh,lsr#4 | ||
288 | andpl $nhi,$nlo,#0xf0 | ||
289 | andpl $nlo,$nlo,#0x0f | ||
290 | eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] | ||
291 | bpl .Loop | ||
292 | ___ | ||
293 | &Zsmash(); | ||
$code.=<<___;
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
___
{
my $cnt=$Htbl;	# $Htbl is used once in the very beginning

my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));

# Z:Zo keeps the 128-bit result shifted by 1 to the right, with the
# bottom bit in Zo. Or should I say "top bit", because GHASH is
# specified in reverse bit order? Otherwise it is a straightforward
# 128-bit H by one-input-byte multiplication and modulo-reduction,
# times 16.
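#
# In outline, each of the 16 iterations multiplies H by one byte of
# the input with vmull.p8, folds the product into Z, shifts Z right
# by one byte, and reduces the byte shifted out by multiplying it
# with 0xe1, the GCM reduction polynomial; the reduction is
# modulo-scheduled across iterations.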
315 | |||
316 | sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } | ||
317 | sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } | ||
318 | sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } | ||
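# Dlo/Dhi map a quad register to its low/high double half (e.g. q8 ->
# d16/d17); Q maps an even-numbered double register back to its quad
# (e.g. d16 -> q8).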
319 | |||
320 | $code.=<<___; | ||
321 | #if __ARM_ARCH__>=7 | ||
322 | .fpu neon | ||
323 | |||
324 | .global gcm_gmult_neon | ||
325 | .type gcm_gmult_neon,%function | ||
326 | .align 4 | ||
327 | gcm_gmult_neon: | ||
328 | sub $Htbl,#16 @ point at H in GCM128_CTX | ||
329 | vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi | ||
330 | vmov.i32 $mod,#0xe1 @ our irreducible polynomial | ||
331 | vld1.64 `&Dlo("$IN")`,[$Xi,:64]! | ||
332 | vshr.u64 $mod,#32 | ||
333 | vldmia $Htbl,{$Hhi-$Hlo} @ load H | ||
334 | veor $zero,$zero | ||
335 | #ifdef __ARMEL__ | ||
336 | vrev64.8 $IN,$IN | ||
337 | #endif | ||
338 | veor $Qpost,$Qpost | ||
339 | veor $R,$R | ||
340 | mov $cnt,#16 | ||
341 | veor $Z,$Z | ||
342 | mov $len,#16 | ||
343 | veor $Zo,$Zo | ||
344 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
345 | b .Linner_neon | ||
346 | .size gcm_gmult_neon,.-gcm_gmult_neon | ||
347 | |||
348 | .global gcm_ghash_neon | ||
349 | .type gcm_ghash_neon,%function | ||
350 | .align 4 | ||
351 | gcm_ghash_neon: | ||
352 | vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi | ||
353 | vmov.i32 $mod,#0xe1 @ our irreducible polynomial | ||
354 | vld1.64 `&Dlo("$Z")`,[$Xi,:64]! | ||
355 | vshr.u64 $mod,#32 | ||
356 | vldmia $Xi,{$Hhi-$Hlo} @ load H | ||
357 | veor $zero,$zero | ||
358 | nop | ||
359 | #ifdef __ARMEL__ | ||
360 | vrev64.8 $Z,$Z | ||
361 | #endif | ||
362 | .Louter_neon: | ||
363 | vld1.64 `&Dhi($IN)`,[$inp]! @ load inp | ||
364 | veor $Qpost,$Qpost | ||
365 | vld1.64 `&Dlo($IN)`,[$inp]! | ||
366 | veor $R,$R | ||
367 | mov $cnt,#16 | ||
368 | #ifdef __ARMEL__ | ||
369 | vrev64.8 $IN,$IN | ||
370 | #endif | ||
371 | veor $Zo,$Zo | ||
372 | veor $IN,$Z @ inp^=Xi | ||
373 | veor $Z,$Z | ||
374 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
375 | .Linner_neon: | ||
376 | subs $cnt,$cnt,#1 | ||
377 | vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i] | ||
378 | vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i] | ||
379 | vext.8 $IN,$zero,#1 @ IN>>=8 | ||
380 | |||
381 | veor $Z,$Qpost @ modulo-scheduled part | ||
382 | vshl.i64 `&Dlo("$R")`,#48 | ||
383 | vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte | ||
384 | veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")` | ||
385 | |||
386 | veor `&Dhi("$Z")`,`&Dlo("$R")` | ||
387 | vuzp.8 $Qlo,$Qhi | ||
388 | vsli.8 $Zo,$T,#1 @ compose the "carry" byte | ||
389 | vext.8 $Z,$zero,#1 @ Z>>=8 | ||
390 | |||
391 | vmull.p8 $R,$Zo,$mod @ "carry"·0xe1 | ||
392 | vshr.u8 $Zo,$T,#7 @ save Z's bottom bit | ||
393 | vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8 | ||
394 | veor $Z,$Qhi | ||
395 | bne .Linner_neon | ||
396 | |||
397 | veor $Z,$Qpost @ modulo-scheduled artefact | ||
398 | vshl.i64 `&Dlo("$R")`,#48 | ||
399 | veor `&Dhi("$Z")`,`&Dlo("$R")` | ||
400 | |||
401 | @ finalization, normalize Z:Zo | ||
402 | vand $Zo,$mod @ suffices to mask the bit | ||
403 | vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 | ||
404 | vshl.i64 $Z,#1 | ||
405 | subs $len,#16 | ||
406 | vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1 | ||
407 | bne .Louter_neon | ||
408 | |||
409 | #ifdef __ARMEL__ | ||
410 | vrev64.8 $Z,$Z | ||
411 | #endif | ||
412 | sub $Xi,#16 | ||
413 | vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi | ||
414 | vst1.64 `&Dlo("$Z")`,[$Xi,:64] | ||
415 | |||
416 | bx lr | ||
417 | .size gcm_ghash_neon,.-gcm_ghash_neon | ||
418 | #endif | ||
419 | ___ | ||
}
$code.=<<___;
.asciz	"GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
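						# 0xe12fff1e is the ARM
						# encoding of "bx lr"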
print $code;
close STDOUT;		# enforce flush