path: root/src/lib/libcrypto/modes
Diffstat (limited to 'src/lib/libcrypto/modes')
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-alpha.pl     455
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-armv4.pl     429
-rwxr-xr-x  src/lib/libcrypto/modes/asm/ghash-ia64.pl      463
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-parisc.pl    741
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-s390x.pl     262
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-sparcv9.pl   330
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86.pl      1342
-rw-r--r--  src/lib/libcrypto/modes/asm/ghash-x86_64.pl    806
-rw-r--r--  src/lib/libcrypto/modes/cbc128.c               202
-rw-r--r--  src/lib/libcrypto/modes/ccm128.c               441
-rw-r--r--  src/lib/libcrypto/modes/cfb128.c               234
-rw-r--r--  src/lib/libcrypto/modes/ctr128.c               252
-rw-r--r--  src/lib/libcrypto/modes/cts128.c               267
-rw-r--r--  src/lib/libcrypto/modes/gcm128.c              1539
-rw-r--r--  src/lib/libcrypto/modes/modes.h                136
-rw-r--r--  src/lib/libcrypto/modes/modes_lcl.h            108
-rw-r--r--  src/lib/libcrypto/modes/ofb128.c               119
-rw-r--r--  src/lib/libcrypto/modes/xts128.c               187
18 files changed, 0 insertions, 8313 deletions
diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl
deleted file mode 100644
index b6d6ea5a62..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-alpha.pl
+++ /dev/null
@@ -1,455 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Even though
15# loops are aggressively modulo-scheduled with respect to references to
16# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
17# ~12 cycles per processed byte on a 21264 CPU. It seems to be a dynamic
18# scheduling "glitch," because uprofile(1) indicates uniform sample
19# distribution, as if all instruction bundles execute in 1.5 cycles.
20# Meaning that it could have been even faster, yet 12 cycles is ~60%
21# better than gcc-generated code and ~80% better than code generated by
22# the vendor compiler.
23
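For reference, the table-driven scheme this module (and the other assembler modules below) implements mirrors the portable "4-bit" code in gcm128.c, removed by this same commit: Htable[i] holds H multiplied by the nibble value i, and each input byte costs two table lookups plus a 4-bit shift-and-reduce step driven by the rem_4bit constants seen further down. A minimal C sketch, with illustrative names and layout (u128, gmult_4bit) rather than the exact gcm128.c declarations:

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;   /* illustrative 128-bit pair */

/* rem_4bit[i] folds the 4 bits shifted out of Z back into the field;
 * these are the same constants as in the rem_4bit tables emitted below. */
static const uint64_t rem_4bit[16] = {
    0x0000ULL << 48, 0x1C20ULL << 48, 0x3840ULL << 48, 0x2460ULL << 48,
    0x7080ULL << 48, 0x6CA0ULL << 48, 0x48C0ULL << 48, 0x54E0ULL << 48,
    0xE100ULL << 48, 0xFD20ULL << 48, 0xD940ULL << 48, 0xC560ULL << 48,
    0x9180ULL << 48, 0x8DA0ULL << 48, 0xA9C0ULL << 48, 0xB5E0ULL << 48
};

/* Xi is the 16-byte GHASH accumulator (big-endian bytes); Htable[i] holds
 * H multiplied by the 4-bit value i, as prepared by the table setup in
 * gcm128.c. */
static void gmult_4bit(uint8_t Xi[16], const u128 Htable[16])
{
    u128 Z;
    unsigned int nlo, nhi, rem;
    int cnt = 15;

    nlo = Xi[15];
    nhi = nlo >> 4;
    nlo &= 0x0f;
    Z = Htable[nlo];                            /* start with H * (low nibble) */

    for (;;) {
        rem  = (unsigned int)Z.lo & 0x0f;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);      /* Z *= x^4 ...           */
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];     /* ... with reduction     */
        Z.hi ^= Htable[nhi].hi;                 /* add H * (high nibble)  */
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = Xi[cnt];
        nhi = nlo >> 4;
        nlo &= 0x0f;

        rem  = (unsigned int)Z.lo & 0x0f;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nlo].hi;                 /* add H * (low nibble)   */
        Z.lo ^= Htable[nlo].lo;
    }

    for (cnt = 0; cnt < 8; cnt++) {             /* store Z back big-endian */
        Xi[7 - cnt]  = (uint8_t)(Z.hi >> (8 * cnt));
        Xi[15 - cnt] = (uint8_t)(Z.lo >> (8 * cnt));
    }
}

The assembly below is essentially this loop, modulo-scheduled, with the final byte swap done with native instructions (extbl/zapnot on Alpha).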
24$cnt="v0"; # $0
25$t0="t0";
26$t1="t1";
27$t2="t2";
28$Thi0="t3"; # $4
29$Tlo0="t4";
30$Thi1="t5";
31$Tlo1="t6";
32$rem="t7"; # $8
33#################
34$Xi="a0"; # $16, input argument block
35$Htbl="a1";
36$inp="a2";
37$len="a3";
38$nlo="a4"; # $20
39$nhi="a5";
40$Zhi="t8";
41$Zlo="t9";
42$Xhi="t10"; # $24
43$Xlo="t11";
44$remp="t12";
45$rem_4bit="AT"; # $28
46
47{ my $N;
48 sub loop() {
49
50 $N++;
51$code.=<<___;
52.align 4
53 extbl $Xlo,7,$nlo
54 and $nlo,0xf0,$nhi
55 sll $nlo,4,$nlo
56 and $nlo,0xf0,$nlo
57
58 addq $nlo,$Htbl,$nlo
59 ldq $Zlo,8($nlo)
60 addq $nhi,$Htbl,$nhi
61 ldq $Zhi,0($nlo)
62
63 and $Zlo,0x0f,$remp
64 sll $Zhi,60,$t0
65 lda $cnt,6(zero)
66 extbl $Xlo,6,$nlo
67
68 ldq $Tlo1,8($nhi)
69 s8addq $remp,$rem_4bit,$remp
70 ldq $Thi1,0($nhi)
71 srl $Zlo,4,$Zlo
72
73 ldq $rem,0($remp)
74 srl $Zhi,4,$Zhi
75 xor $t0,$Zlo,$Zlo
76 and $nlo,0xf0,$nhi
77
78 xor $Tlo1,$Zlo,$Zlo
79 sll $nlo,4,$nlo
80 xor $Thi1,$Zhi,$Zhi
81 and $nlo,0xf0,$nlo
82
83 addq $nlo,$Htbl,$nlo
84 ldq $Tlo0,8($nlo)
85 addq $nhi,$Htbl,$nhi
86 ldq $Thi0,0($nlo)
87
88.Looplo$N:
89 and $Zlo,0x0f,$remp
90 sll $Zhi,60,$t0
91 subq $cnt,1,$cnt
92 srl $Zlo,4,$Zlo
93
94 ldq $Tlo1,8($nhi)
95 xor $rem,$Zhi,$Zhi
96 ldq $Thi1,0($nhi)
97 s8addq $remp,$rem_4bit,$remp
98
99 ldq $rem,0($remp)
100 srl $Zhi,4,$Zhi
101 xor $t0,$Zlo,$Zlo
102 extbl $Xlo,$cnt,$nlo
103
104 and $nlo,0xf0,$nhi
105 xor $Thi0,$Zhi,$Zhi
106 xor $Tlo0,$Zlo,$Zlo
107 sll $nlo,4,$nlo
108
109
110 and $Zlo,0x0f,$remp
111 sll $Zhi,60,$t0
112 and $nlo,0xf0,$nlo
113 srl $Zlo,4,$Zlo
114
115 s8addq $remp,$rem_4bit,$remp
116 xor $rem,$Zhi,$Zhi
117 addq $nlo,$Htbl,$nlo
118 addq $nhi,$Htbl,$nhi
119
120 ldq $rem,0($remp)
121 srl $Zhi,4,$Zhi
122 ldq $Tlo0,8($nlo)
123 xor $t0,$Zlo,$Zlo
124
125 xor $Tlo1,$Zlo,$Zlo
126 xor $Thi1,$Zhi,$Zhi
127 ldq $Thi0,0($nlo)
128 bne $cnt,.Looplo$N
129
130
131 and $Zlo,0x0f,$remp
132 sll $Zhi,60,$t0
133 lda $cnt,7(zero)
134 srl $Zlo,4,$Zlo
135
136 ldq $Tlo1,8($nhi)
137 xor $rem,$Zhi,$Zhi
138 ldq $Thi1,0($nhi)
139 s8addq $remp,$rem_4bit,$remp
140
141 ldq $rem,0($remp)
142 srl $Zhi,4,$Zhi
143 xor $t0,$Zlo,$Zlo
144 extbl $Xhi,$cnt,$nlo
145
146 and $nlo,0xf0,$nhi
147 xor $Thi0,$Zhi,$Zhi
148 xor $Tlo0,$Zlo,$Zlo
149 sll $nlo,4,$nlo
150
151 and $Zlo,0x0f,$remp
152 sll $Zhi,60,$t0
153 and $nlo,0xf0,$nlo
154 srl $Zlo,4,$Zlo
155
156 s8addq $remp,$rem_4bit,$remp
157 xor $rem,$Zhi,$Zhi
158 addq $nlo,$Htbl,$nlo
159 addq $nhi,$Htbl,$nhi
160
161 ldq $rem,0($remp)
162 srl $Zhi,4,$Zhi
163 ldq $Tlo0,8($nlo)
164 xor $t0,$Zlo,$Zlo
165
166 xor $Tlo1,$Zlo,$Zlo
167 xor $Thi1,$Zhi,$Zhi
168 ldq $Thi0,0($nlo)
169 unop
170
171
172.Loophi$N:
173 and $Zlo,0x0f,$remp
174 sll $Zhi,60,$t0
175 subq $cnt,1,$cnt
176 srl $Zlo,4,$Zlo
177
178 ldq $Tlo1,8($nhi)
179 xor $rem,$Zhi,$Zhi
180 ldq $Thi1,0($nhi)
181 s8addq $remp,$rem_4bit,$remp
182
183 ldq $rem,0($remp)
184 srl $Zhi,4,$Zhi
185 xor $t0,$Zlo,$Zlo
186 extbl $Xhi,$cnt,$nlo
187
188 and $nlo,0xf0,$nhi
189 xor $Thi0,$Zhi,$Zhi
190 xor $Tlo0,$Zlo,$Zlo
191 sll $nlo,4,$nlo
192
193
194 and $Zlo,0x0f,$remp
195 sll $Zhi,60,$t0
196 and $nlo,0xf0,$nlo
197 srl $Zlo,4,$Zlo
198
199 s8addq $remp,$rem_4bit,$remp
200 xor $rem,$Zhi,$Zhi
201 addq $nlo,$Htbl,$nlo
202 addq $nhi,$Htbl,$nhi
203
204 ldq $rem,0($remp)
205 srl $Zhi,4,$Zhi
206 ldq $Tlo0,8($nlo)
207 xor $t0,$Zlo,$Zlo
208
209 xor $Tlo1,$Zlo,$Zlo
210 xor $Thi1,$Zhi,$Zhi
211 ldq $Thi0,0($nlo)
212 bne $cnt,.Loophi$N
213
214
215 and $Zlo,0x0f,$remp
216 sll $Zhi,60,$t0
217 srl $Zlo,4,$Zlo
218
219 ldq $Tlo1,8($nhi)
220 xor $rem,$Zhi,$Zhi
221 ldq $Thi1,0($nhi)
222 s8addq $remp,$rem_4bit,$remp
223
224 ldq $rem,0($remp)
225 srl $Zhi,4,$Zhi
226 xor $t0,$Zlo,$Zlo
227
228 xor $Tlo0,$Zlo,$Zlo
229 xor $Thi0,$Zhi,$Zhi
230
231 and $Zlo,0x0f,$remp
232 sll $Zhi,60,$t0
233 srl $Zlo,4,$Zlo
234
235 s8addq $remp,$rem_4bit,$remp
236 xor $rem,$Zhi,$Zhi
237
238 ldq $rem,0($remp)
239 srl $Zhi,4,$Zhi
240 xor $Tlo1,$Zlo,$Zlo
241 xor $Thi1,$Zhi,$Zhi
242 xor $t0,$Zlo,$Zlo
243 xor $rem,$Zhi,$Zhi
244___
245}}
246
247$code=<<___;
248#include <machine/asm.h>
249
250.text
251
252.set noat
253.set noreorder
254.globl gcm_gmult_4bit
255.align 4
256.ent gcm_gmult_4bit
257gcm_gmult_4bit:
258 .frame sp,0,ra
259 .prologue 0
260
261 ldq $Xlo,8($Xi)
262 ldq $Xhi,0($Xi)
263
264 bsr $t0,picmeup
265 nop
266___
267
268 &loop();
269
270$code.=<<___;
271 srl $Zlo,24,$t0 # byte swap
272 srl $Zlo,8,$t1
273
274 sll $Zlo,8,$t2
275 sll $Zlo,24,$Zlo
276 zapnot $t0,0x11,$t0
277 zapnot $t1,0x22,$t1
278
279 zapnot $Zlo,0x88,$Zlo
280 or $t0,$t1,$t0
281 zapnot $t2,0x44,$t2
282
283 or $Zlo,$t0,$Zlo
284 srl $Zhi,24,$t0
285 srl $Zhi,8,$t1
286
287 or $Zlo,$t2,$Zlo
288 sll $Zhi,8,$t2
289 sll $Zhi,24,$Zhi
290
291 srl $Zlo,32,$Xlo
292 sll $Zlo,32,$Zlo
293
294 zapnot $t0,0x11,$t0
295 zapnot $t1,0x22,$t1
296 or $Zlo,$Xlo,$Xlo
297
298 zapnot $Zhi,0x88,$Zhi
299 or $t0,$t1,$t0
300 zapnot $t2,0x44,$t2
301
302 or $Zhi,$t0,$Zhi
303 or $Zhi,$t2,$Zhi
304
305 srl $Zhi,32,$Xhi
306 sll $Zhi,32,$Zhi
307
308 or $Zhi,$Xhi,$Xhi
309 stq $Xlo,8($Xi)
310 stq $Xhi,0($Xi)
311
312 ret (ra)
313.end gcm_gmult_4bit
314___
315
316$inhi="s0";
317$inlo="s1";
318
319$code.=<<___;
320.globl gcm_ghash_4bit
321.align 4
322.ent gcm_ghash_4bit
323gcm_ghash_4bit:
324 lda sp,-32(sp)
325 stq ra,0(sp)
326 stq s0,8(sp)
327 stq s1,16(sp)
328 .mask 0x04000600,-32
329 .frame sp,32,ra
330 .prologue 0
331
332 ldq_u $inhi,0($inp)
333 ldq_u $Thi0,7($inp)
334 ldq_u $inlo,8($inp)
335 ldq_u $Tlo0,15($inp)
336 ldq $Xhi,0($Xi)
337 ldq $Xlo,8($Xi)
338
339 bsr $t0,picmeup
340 nop
341
342.Louter:
343 extql $inhi,$inp,$inhi
344 extqh $Thi0,$inp,$Thi0
345 or $inhi,$Thi0,$inhi
346 lda $inp,16($inp)
347
348 extql $inlo,$inp,$inlo
349 extqh $Tlo0,$inp,$Tlo0
350 or $inlo,$Tlo0,$inlo
351 subq $len,16,$len
352
353 xor $Xlo,$inlo,$Xlo
354 xor $Xhi,$inhi,$Xhi
355___
356
357 &loop();
358
359$code.=<<___;
360 srl $Zlo,24,$t0 # byte swap
361 srl $Zlo,8,$t1
362
363 sll $Zlo,8,$t2
364 sll $Zlo,24,$Zlo
365 zapnot $t0,0x11,$t0
366 zapnot $t1,0x22,$t1
367
368 zapnot $Zlo,0x88,$Zlo
369 or $t0,$t1,$t0
370 zapnot $t2,0x44,$t2
371
372 or $Zlo,$t0,$Zlo
373 srl $Zhi,24,$t0
374 srl $Zhi,8,$t1
375
376 or $Zlo,$t2,$Zlo
377 sll $Zhi,8,$t2
378 sll $Zhi,24,$Zhi
379
380 srl $Zlo,32,$Xlo
381 sll $Zlo,32,$Zlo
382 beq $len,.Ldone
383
384 zapnot $t0,0x11,$t0
385 zapnot $t1,0x22,$t1
386 or $Zlo,$Xlo,$Xlo
387 ldq_u $inhi,0($inp)
388
389 zapnot $Zhi,0x88,$Zhi
390 or $t0,$t1,$t0
391 zapnot $t2,0x44,$t2
392 ldq_u $Thi0,7($inp)
393
394 or $Zhi,$t0,$Zhi
395 or $Zhi,$t2,$Zhi
396 ldq_u $inlo,8($inp)
397 ldq_u $Tlo0,15($inp)
398
399 srl $Zhi,32,$Xhi
400 sll $Zhi,32,$Zhi
401
402 or $Zhi,$Xhi,$Xhi
403 br zero,.Louter
404
405.Ldone:
406 zapnot $t0,0x11,$t0
407 zapnot $t1,0x22,$t1
408 or $Zlo,$Xlo,$Xlo
409
410 zapnot $Zhi,0x88,$Zhi
411 or $t0,$t1,$t0
412 zapnot $t2,0x44,$t2
413
414 or $Zhi,$t0,$Zhi
415 or $Zhi,$t2,$Zhi
416
417 srl $Zhi,32,$Xhi
418 sll $Zhi,32,$Zhi
419
420 or $Zhi,$Xhi,$Xhi
421
422 stq $Xlo,8($Xi)
423 stq $Xhi,0($Xi)
424
425 .set noreorder
426 /*ldq ra,0(sp)*/
427 ldq s0,8(sp)
428 ldq s1,16(sp)
429 lda sp,32(sp)
430 ret (ra)
431.end gcm_ghash_4bit
432
433.align 4
434.ent picmeup
435picmeup:
436 .frame sp,0,$t0
437 .prologue 0
438 br $rem_4bit,.Lpic
439.Lpic: lda $rem_4bit,12($rem_4bit)
440 ret ($t0)
441.end picmeup
442 nop
443rem_4bit:
444 .long 0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
445 .long 0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
446 .long 0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
447 .long 0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
448.ascii "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
449.align 4
450
451___
452$output=shift and open STDOUT,">$output";
453print $code;
454close STDOUT;
455
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
deleted file mode 100644
index d91586ee29..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-armv4.pl
+++ /dev/null
@@ -1,429 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+32 bytes shared table]. There is no
15# experimental performance data available yet. The only approximation
16# that can be made at this point is based on code size. The inner loop is
17# 32 instructions long and on a single-issue core should execute in <40
18# cycles. Having verified that gcc 3.4 didn't unroll the corresponding
19# loop, this assembler loop body was found to be ~3x smaller than the
20# compiler-generated one...
21#
22# July 2010
23#
24# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
25# Cortex A8 core and ~25 cycles per processed byte (which was observed
26# to be ~3 times faster than gcc-generated code:-)
27#
28# February 2011
29#
30# Profiler-assisted and platform-specific optimization resulted in 7%
31# improvement on Cortex A8 core and ~23.5 cycles per byte.
32#
33# March 2011
34#
35# Add NEON implementation featuring polynomial multiplication, i.e. no
36# lookup tables involved. On Cortex A8 it was measured to process one
37# byte in 15 cycles or 55% faster than integer-only code.
38
39# ====================================================================
40# Note about the "528B" variant. In the ARM case it makes less sense to
41# implement it, for the following reasons:
42#
43# - performance improvement won't be anywhere near 50%, because 128-
44# bit shift operation is neatly fused with 128-bit xor here, and
45# "538B" variant would eliminate only 4-5 instructions out of 32
46# in the inner loop (meaning that estimated improvement is ~15%);
47# - ARM-based systems are often embedded ones and extra memory
48# consumption might be unappreciated (for so little improvement);
49#
50# Byte order [in]dependence. =========================================
51#
52# The caller is expected to maintain a specific *dword* order in Htable,
53# namely with the *least* significant dword of the 128-bit value at the
54# *lower* address. This differs completely from the C code and has
55# everything to do with the ldm instruction and the order in which dwords
56# are "consumed" by the algorithm. *Byte* order within these dwords is in
57# turn whatever the *native* byte order is on the current platform. See
58# gcm128.c for a working example...
59
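A small C sketch of the convention described above (the u128 pair is an illustrative type, not the declaration used by gcm128.c): each 128-bit Htable entry is stored with its low 64-bit half first, with no per-byte swapping.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Lay out one Htable entry the way this module's ldm-based loops expect:
 * least-significant dword at the lower address, native byte order within
 * each dword. */
static void
store_htable_entry(uint64_t dst[2], const u128 *h)
{
    dst[0] = h->lo;
    dst[1] = h->hi;
}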
60while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
61open STDOUT,">$output";
62
63$Xi="r0"; # argument block
64$Htbl="r1";
65$inp="r2";
66$len="r3";
67
68$Zll="r4"; # variables
69$Zlh="r5";
70$Zhl="r6";
71$Zhh="r7";
72$Tll="r8";
73$Tlh="r9";
74$Thl="r10";
75$Thh="r11";
76$nlo="r12";
77################# r13 is stack pointer
78$nhi="r14";
79################# r15 is program counter
80
81$rem_4bit=$inp; # used in gcm_gmult_4bit
82$cnt=$len;
83
84sub Zsmash() {
85 my $i=12;
86 my @args=@_;
87 for ($Zll,$Zlh,$Zhl,$Zhh) {
88 $code.=<<___;
89#if __ARM_ARCH__>=7 && defined(__ARMEL__)
90 rev $_,$_
91 str $_,[$Xi,#$i]
92#elif defined(__ARMEB__)
93 str $_,[$Xi,#$i]
94#else
95 mov $Tlh,$_,lsr#8
96 strb $_,[$Xi,#$i+3]
97 mov $Thl,$_,lsr#16
98 strb $Tlh,[$Xi,#$i+2]
99 mov $Thh,$_,lsr#24
100 strb $Thl,[$Xi,#$i+1]
101 strb $Thh,[$Xi,#$i]
102#endif
103___
104 $code.="\t".shift(@args)."\n";
105 $i-=4;
106 }
107}
108
109$code=<<___;
110#include "arm_arch.h"
111
112.text
113.code 32
114
115.type rem_4bit,%object
116.align 5
117rem_4bit:
118.short 0x0000,0x1C20,0x3840,0x2460
119.short 0x7080,0x6CA0,0x48C0,0x54E0
120.short 0xE100,0xFD20,0xD940,0xC560
121.short 0x9180,0x8DA0,0xA9C0,0xB5E0
122.size rem_4bit,.-rem_4bit
123
124.type rem_4bit_get,%function
125rem_4bit_get:
126 sub $rem_4bit,pc,#8
127 sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit
128 b .Lrem_4bit_got
129 nop
130.size rem_4bit_get,.-rem_4bit_get
131
132.global gcm_ghash_4bit
133.type gcm_ghash_4bit,%function
134gcm_ghash_4bit:
135 sub r12,pc,#8
136 add $len,$inp,$len @ $len to point at the end
137 stmdb sp!,{r3-r11,lr} @ save $len/end too
138 sub r12,r12,#48 @ &rem_4bit
139
140 ldmia r12,{r4-r11} @ copy rem_4bit ...
141 stmdb sp!,{r4-r11} @ ... to stack
142
143 ldrb $nlo,[$inp,#15]
144 ldrb $nhi,[$Xi,#15]
145.Louter:
146 eor $nlo,$nlo,$nhi
147 and $nhi,$nlo,#0xf0
148 and $nlo,$nlo,#0x0f
149 mov $cnt,#14
150
151 add $Zhh,$Htbl,$nlo,lsl#4
152 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
153 add $Thh,$Htbl,$nhi
154 ldrb $nlo,[$inp,#14]
155
156 and $nhi,$Zll,#0xf @ rem
157 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
158 add $nhi,$nhi,$nhi
159 eor $Zll,$Tll,$Zll,lsr#4
160 ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
161 eor $Zll,$Zll,$Zlh,lsl#28
162 ldrb $nhi,[$Xi,#14]
163 eor $Zlh,$Tlh,$Zlh,lsr#4
164 eor $Zlh,$Zlh,$Zhl,lsl#28
165 eor $Zhl,$Thl,$Zhl,lsr#4
166 eor $Zhl,$Zhl,$Zhh,lsl#28
167 eor $Zhh,$Thh,$Zhh,lsr#4
168 eor $nlo,$nlo,$nhi
169 and $nhi,$nlo,#0xf0
170 and $nlo,$nlo,#0x0f
171 eor $Zhh,$Zhh,$Tll,lsl#16
172
173.Linner:
174 add $Thh,$Htbl,$nlo,lsl#4
175 and $nlo,$Zll,#0xf @ rem
176 subs $cnt,$cnt,#1
177 add $nlo,$nlo,$nlo
178 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
179 eor $Zll,$Tll,$Zll,lsr#4
180 eor $Zll,$Zll,$Zlh,lsl#28
181 eor $Zlh,$Tlh,$Zlh,lsr#4
182 eor $Zlh,$Zlh,$Zhl,lsl#28
183 ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
184 eor $Zhl,$Thl,$Zhl,lsr#4
185 ldrplb $nlo,[$inp,$cnt]
186 eor $Zhl,$Zhl,$Zhh,lsl#28
187 eor $Zhh,$Thh,$Zhh,lsr#4
188
189 add $Thh,$Htbl,$nhi
190 and $nhi,$Zll,#0xf @ rem
191 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
192 add $nhi,$nhi,$nhi
193 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
194 eor $Zll,$Tll,$Zll,lsr#4
195 ldrplb $Tll,[$Xi,$cnt]
196 eor $Zll,$Zll,$Zlh,lsl#28
197 eor $Zlh,$Tlh,$Zlh,lsr#4
198 ldrh $Tlh,[sp,$nhi]
199 eor $Zlh,$Zlh,$Zhl,lsl#28
200 eor $Zhl,$Thl,$Zhl,lsr#4
201 eor $Zhl,$Zhl,$Zhh,lsl#28
202 eorpl $nlo,$nlo,$Tll
203 eor $Zhh,$Thh,$Zhh,lsr#4
204 andpl $nhi,$nlo,#0xf0
205 andpl $nlo,$nlo,#0x0f
206 eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
207 bpl .Linner
208
209 ldr $len,[sp,#32] @ re-load $len/end
210 add $inp,$inp,#16
211 mov $nhi,$Zll
212___
213 &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
214$code.=<<___;
215 bne .Louter
216
217 add sp,sp,#36
218#if __ARM_ARCH__>=5
219 ldmia sp!,{r4-r11,pc}
220#else
221 ldmia sp!,{r4-r11,lr}
222 tst lr,#1
223 moveq pc,lr @ be binary compatible with V4, yet
224 bx lr @ interoperable with Thumb ISA:-)
225#endif
226.size gcm_ghash_4bit,.-gcm_ghash_4bit
227
228.global gcm_gmult_4bit
229.type gcm_gmult_4bit,%function
230gcm_gmult_4bit:
231 stmdb sp!,{r4-r11,lr}
232 ldrb $nlo,[$Xi,#15]
233 b rem_4bit_get
234.Lrem_4bit_got:
235 and $nhi,$nlo,#0xf0
236 and $nlo,$nlo,#0x0f
237 mov $cnt,#14
238
239 add $Zhh,$Htbl,$nlo,lsl#4
240 ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo]
241 ldrb $nlo,[$Xi,#14]
242
243 add $Thh,$Htbl,$nhi
244 and $nhi,$Zll,#0xf @ rem
245 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
246 add $nhi,$nhi,$nhi
247 eor $Zll,$Tll,$Zll,lsr#4
248 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
249 eor $Zll,$Zll,$Zlh,lsl#28
250 eor $Zlh,$Tlh,$Zlh,lsr#4
251 eor $Zlh,$Zlh,$Zhl,lsl#28
252 eor $Zhl,$Thl,$Zhl,lsr#4
253 eor $Zhl,$Zhl,$Zhh,lsl#28
254 eor $Zhh,$Thh,$Zhh,lsr#4
255 and $nhi,$nlo,#0xf0
256 eor $Zhh,$Zhh,$Tll,lsl#16
257 and $nlo,$nlo,#0x0f
258
259.Loop:
260 add $Thh,$Htbl,$nlo,lsl#4
261 and $nlo,$Zll,#0xf @ rem
262 subs $cnt,$cnt,#1
263 add $nlo,$nlo,$nlo
264 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
265 eor $Zll,$Tll,$Zll,lsr#4
266 eor $Zll,$Zll,$Zlh,lsl#28
267 eor $Zlh,$Tlh,$Zlh,lsr#4
268 eor $Zlh,$Zlh,$Zhl,lsl#28
269 ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
270 eor $Zhl,$Thl,$Zhl,lsr#4
271 ldrplb $nlo,[$Xi,$cnt]
272 eor $Zhl,$Zhl,$Zhh,lsl#28
273 eor $Zhh,$Thh,$Zhh,lsr#4
274
275 add $Thh,$Htbl,$nhi
276 and $nhi,$Zll,#0xf @ rem
277 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
278 add $nhi,$nhi,$nhi
279 ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
280 eor $Zll,$Tll,$Zll,lsr#4
281 eor $Zll,$Zll,$Zlh,lsl#28
282 eor $Zlh,$Tlh,$Zlh,lsr#4
283 ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
284 eor $Zlh,$Zlh,$Zhl,lsl#28
285 eor $Zhl,$Thl,$Zhl,lsr#4
286 eor $Zhl,$Zhl,$Zhh,lsl#28
287 eor $Zhh,$Thh,$Zhh,lsr#4
288 andpl $nhi,$nlo,#0xf0
289 andpl $nlo,$nlo,#0x0f
290 eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
291 bpl .Loop
292___
293 &Zsmash();
294$code.=<<___;
295#if __ARM_ARCH__>=5
296 ldmia sp!,{r4-r11,pc}
297#else
298 ldmia sp!,{r4-r11,lr}
299 tst lr,#1
300 moveq pc,lr @ be binary compatible with V4, yet
301 bx lr @ interoperable with Thumb ISA:-)
302#endif
303.size gcm_gmult_4bit,.-gcm_gmult_4bit
304___
305{
306my $cnt=$Htbl; # $Htbl is used once in the very beginning
307
308my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
309my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
310
311# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
312# in Zo. Or should I say "top bit", because GHASH is specified in
313# reverse bit order? Otherwise straightforward 128-bit H by one input
314# byte multiplication and modulo-reduction, times 16.
315
316sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
317sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
318sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
319
320$code.=<<___;
321#if __ARM_ARCH__>=7
322.fpu neon
323
324.global gcm_gmult_neon
325.type gcm_gmult_neon,%function
326.align 4
327gcm_gmult_neon:
328 sub $Htbl,#16 @ point at H in GCM128_CTX
329 vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
330 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
331 vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
332 vshr.u64 $mod,#32
333 vldmia $Htbl,{$Hhi-$Hlo} @ load H
334 veor $zero,$zero
335#ifdef __ARMEL__
336 vrev64.8 $IN,$IN
337#endif
338 veor $Qpost,$Qpost
339 veor $R,$R
340 mov $cnt,#16
341 veor $Z,$Z
342 mov $len,#16
343 veor $Zo,$Zo
344 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
345 b .Linner_neon
346.size gcm_gmult_neon,.-gcm_gmult_neon
347
348.global gcm_ghash_neon
349.type gcm_ghash_neon,%function
350.align 4
351gcm_ghash_neon:
352 vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
353 vmov.i32 $mod,#0xe1 @ our irreducible polynomial
354 vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
355 vshr.u64 $mod,#32
356 vldmia $Xi,{$Hhi-$Hlo} @ load H
357 veor $zero,$zero
358 nop
359#ifdef __ARMEL__
360 vrev64.8 $Z,$Z
361#endif
362.Louter_neon:
363 vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
364 veor $Qpost,$Qpost
365 vld1.64 `&Dlo($IN)`,[$inp]!
366 veor $R,$R
367 mov $cnt,#16
368#ifdef __ARMEL__
369 vrev64.8 $IN,$IN
370#endif
371 veor $Zo,$Zo
372 veor $IN,$Z @ inp^=Xi
373 veor $Z,$Z
374 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
375.Linner_neon:
376 subs $cnt,$cnt,#1
377 vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
378 vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
379 vext.8 $IN,$zero,#1 @ IN>>=8
380
381 veor $Z,$Qpost @ modulo-scheduled part
382 vshl.i64 `&Dlo("$R")`,#48
383 vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
384 veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
385
386 veor `&Dhi("$Z")`,`&Dlo("$R")`
387 vuzp.8 $Qlo,$Qhi
388 vsli.8 $Zo,$T,#1 @ compose the "carry" byte
389 vext.8 $Z,$zero,#1 @ Z>>=8
390
391 vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
392 vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
393 vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
394 veor $Z,$Qhi
395 bne .Linner_neon
396
397 veor $Z,$Qpost @ modulo-scheduled artefact
398 vshl.i64 `&Dlo("$R")`,#48
399 veor `&Dhi("$Z")`,`&Dlo("$R")`
400
401 @ finalization, normalize Z:Zo
402 vand $Zo,$mod @ suffices to mask the bit
403 vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
404 vshl.i64 $Z,#1
405 subs $len,#16
406 vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
407 bne .Louter_neon
408
409#ifdef __ARMEL__
410 vrev64.8 $Z,$Z
411#endif
412 sub $Xi,#16
413 vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
414 vst1.64 `&Dlo("$Z")`,[$Xi,:64]
415
416 bx lr
417.size gcm_ghash_neon,.-gcm_ghash_neon
418#endif
419___
420}
421$code.=<<___;
422.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
423.align 2
424___
425
426$code =~ s/\`([^\`]*)\`/eval $1/gem;
427$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
428print $code;
429close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl
deleted file mode 100755
index 0354c95444..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-ia64.pl
+++ /dev/null
@@ -1,463 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
15# GHASH performance was measured to be 6.67 cycles per processed byte
16# on Itanium 2, which is >90% better than Microsoft compiler-generated
17# code. To anchor to something else, the sha1-ia64.pl module processes one
18# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
19# byte.
20
21# September 2010
22#
23# It was originally thought that it made less sense to implement the
24# "528B" variant on Itanium 2, for the following reason: because the number
25# of functional units is naturally limited, it appeared impossible to
26# implement the "528B" loop in 4 cycles, only in 5. This would mean that
27# theoretically performance improvement couldn't be more than 20%.
28# But occasionally you prove yourself wrong:-) I figured out a way to
29# fold a couple of instructions and free yet another instruction
30# slot by unrolling the loop... The resulting performance is 4.45 cycles
31# per processed byte, 50% better than the "256B" version. On the original
32# Itanium, performance should remain the same as for the "256B" version,
33# i.e. ~8.5 cycles.
34
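The "528B" streamed variant described above keeps, next to Htable, a second 16-entry table of the same values pre-shifted right by 4 bits (called Hshr4 in the comments below), so the inner loop can consume a whole input byte per step; the bits shifted out are folded back through the rem_8bit table at the end of the file. A sketch of that auxiliary table, using the same illustrative u128 pair as in the earlier C fragment:

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Hshr4[i] = Htable[i] >> 4 as a 128-bit shift; gcm_ghash_4bit below
 * builds this (together with a copy of Htable) in a 512-byte stack area
 * before entering its main loop. */
static void
build_hshr4(u128 Hshr4[16], const u128 Htable[16])
{
    int i;

    for (i = 0; i < 16; i++) {
        Hshr4[i].lo = (Htable[i].hi << 60) | (Htable[i].lo >> 4);
        Hshr4[i].hi =  Htable[i].hi >> 4;
    }
}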
35$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
36
37if ($^O eq "hpux") {
38 $ADDP="addp4";
39 for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
40} else { $ADDP="add"; }
41for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/);
42 $big_endian=0 if (/\-DL_ENDIAN/); }
43if (!defined($big_endian))
44 { $big_endian=(unpack('L',pack('N',1))==1); }
45
46sub loop() {
47my $label=shift;
48my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
49
50# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
51# in scalable manner;-) Naturally assuming data in L1 cache...
52# Special note about 'dep' instruction, which is used to construct
53# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
54# bytes boundary and lower 7 bits of its address are guaranteed to
55# be zero.
56$code.=<<___;
57$label:
58{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
59 (p19) dep rem=Zlo,rem_4bitp,3,4 }
60{ .mfi; (p19) xor Zhi=Zhi,Hhi
61 ($p17) xor xi[1]=xi[1],in[1] };;
62{ .mfi; (p18) ld8 Hhi=[Hi[1]]
63 (p19) shrp Zlo=Zhi,Zlo,4 }
64{ .mfi; (p19) ld8 rem=[rem]
65 (p18) and Hi[1]=mask0xf0,xi[2] };;
66{ .mmi; ($p16) ld1 in[0]=[inp],-1
67 (p18) xor Zlo=Zlo,Hlo
68 (p19) shr.u Zhi=Zhi,4 }
69{ .mib; (p19) xor Hhi=Hhi,rem
70 (p18) add Hi[1]=Htbl,Hi[1] };;
71
72{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8
73 (p18) dep rem=Zlo,rem_4bitp,3,4 }
74{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0
75 (p18) xor Zhi=Zhi,Hhi };;
76{ .mfi; (p18) ld8 Hhi=[Hi[1]]
77 (p18) shrp Zlo=Zhi,Zlo,4 }
78{ .mfi; (p18) ld8 rem=[rem]
79 (p17) and Hi[0]=mask0xf0,Hi[0] };;
80{ .mmi; (p16) ld1 xi[0]=[Xi],-1
81 (p18) xor Zlo=Zlo,Hlo
82 (p18) shr.u Zhi=Zhi,4 }
83{ .mib; (p18) xor Hhi=Hhi,rem
84 (p17) add Hi[0]=Htbl,Hi[0]
85 br.ctop.sptk $label };;
86___
87}
88
89$code=<<___;
90.explicit
91.text
92
93prevfs=r2; prevlc=r3; prevpr=r8;
94mask0xf0=r21;
95rem=r22; rem_4bitp=r23;
96Xi=r24; Htbl=r25;
97inp=r26; end=r27;
98Hhi=r28; Hlo=r29;
99Zhi=r30; Zlo=r31;
100
101.align 128
102.skip 16 // aligns loop body
103.global gcm_gmult_4bit#
104.proc gcm_gmult_4bit#
105gcm_gmult_4bit:
106 .prologue
107{ .mmi; .save ar.pfs,prevfs
108 alloc prevfs=ar.pfs,2,6,0,8
109 $ADDP Xi=15,in0 // &Xi[15]
110 mov rem_4bitp=ip }
111{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo
112 .save ar.lc,prevlc
113 mov prevlc=ar.lc
114 .save pr,prevpr
115 mov prevpr=pr };;
116
117 .body
118 .rotr in[3],xi[3],Hi[2]
119
120{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15]
121 mov mask0xf0=0xf0
122 brp.loop.imp .Loop1,.Lend1-16};;
123{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14]
124 };;
125{ .mii; shladd Hi[1]=xi[2],4,r0
126 mov pr.rot=0x7<<16
127 mov ar.lc=13 };;
128{ .mii; and Hi[1]=mask0xf0,Hi[1]
129 mov ar.ec=3
130 xor Zlo=Zlo,Zlo };;
131{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo
132 add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
133 xor Zhi=Zhi,Zhi };;
134___
135 &loop (".Loop1",1);
136$code.=<<___;
137.Lend1:
138{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact
139{ .mib; mux1 Zlo=Zlo,\@rev };;
140{ .mib; mux1 Zhi=Zhi,\@rev };;
141{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent
142 add Hhi=1,Xi };; // pipeline flush on Itanium
143{ .mib; st8 [Hlo]=Zlo
144 mov pr=prevpr,0x1ffff };;
145{ .mib; st8 [Hhi]=Zhi
146 mov ar.lc=prevlc
147 br.ret.sptk.many b0 };;
148.endp gcm_gmult_4bit#
149___
150
151######################################################################
152# "528B" (well, "512B" actualy) streamed GHASH
153#
154$Xip="in0";
155$Htbl="in1";
156$inp="in2";
157$len="in3";
158$rem_8bit="loc0";
159$mask0xff="loc1";
160($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
161
162sub load_htable() {
163 for (my $i=0;$i<8;$i++) {
164 $code.=<<___;
165{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi
166 ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo
167{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi
168 ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo
169___
170 $code.=shift if (($i+$#_)==7);
171 $code.="\t};;\n"
172 }
173}
174
175$code.=<<___;
176prevsp=r3;
177
178.align 32
179.skip 16 // aligns loop body
180.global gcm_ghash_4bit#
181.proc gcm_ghash_4bit#
182gcm_ghash_4bit:
183 .prologue
184{ .mmi; .save ar.pfs,prevfs
185 alloc prevfs=ar.pfs,4,2,0,0
186 .vframe prevsp
187 mov prevsp=sp
188 mov $rem_8bit=ip };;
189 .body
190{ .mfi; $ADDP r8=0+0,$Htbl
191 $ADDP r9=0+8,$Htbl }
192{ .mfi; $ADDP r10=128+0,$Htbl
193 $ADDP r11=128+8,$Htbl };;
194___
195 &load_htable(
196 " $ADDP $Xip=15,$Xip", # &Xi[15]
197 " $ADDP $len=$len,$inp", # &inp[len]
198 " $ADDP $inp=15,$inp", # &inp[15]
199 " mov $mask0xff=0xff",
200 " add sp=-512,sp",
201 " andcm sp=sp,$mask0xff", # align stack frame
202 " add r14=0,sp",
203 " add r15=8,sp");
204$code.=<<___;
205{ .mmi; $sum 1<<1 // go big-endian
206 add r8=256+0,sp
207 add r9=256+8,sp }
208{ .mmi; add r10=256+128+0,sp
209 add r11=256+128+8,sp
210 add $len=-17,$len };;
211___
212for($i=0;$i<8;$i++) { # generate first half of Hshr4[]
213my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
214$code.=<<___;
215{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo
216 st8 [r9]=$rhi,16 // Htable[$i].hi
217 shrp $rlo=$rhi,$rlo,4 }//;;
218{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo
219 stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi
220 shr.u $rhi=$rhi,4 };;
221{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4
222 st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4
223___
224}
225$code.=<<___;
226{ .mmi; ld8 r16=[r8],16 // Htable[8].lo
227 ld8 r17=[r9],16 };; // Htable[8].hi
228{ .mmi; ld8 r18=[r8],16 // Htable[9].lo
229 ld8 r19=[r9],16 } // Htable[9].hi
230{ .mmi; rum 1<<5 // clear um.mfh
231 shrp r16=r17,r16,4 };;
232___
233for($i=0;$i<6;$i++) { # generate second half of Hshr4[]
234$code.=<<___;
235{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo
236 ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi
237 shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
238{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
239 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
240 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
241___
242}
243$code.=<<___;
244{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };;
245{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4
246 st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4
247 shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 }
248{ .mmi; add $Htbl=256,sp // &Htable[0]
249 add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
250 shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };;
251{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4
252 st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4
253___
254
255$in="r15";
256@xi=("r16","r17");
257@rem=("r18","r19");
258($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
259($Atbl,$Btbl)=("r26","r27");
260
261$code.=<<___; # (p16)
262{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp--
263 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
264 cmp.eq p0,p6=r0,r0 };; // clear p6
265___
266push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
267
268$code.=<<___; # (p16),(p17)
269{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
270 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
271{ .mii; ld1 $in=[$inp],-1 //(p16) *inp--
272 dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo
273 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
274.align 32
275.LOOP:
276{ .mmi;
277(p6) st8 [$Xip]=$Zhi,13
278 xor $Zlo=$Zlo,$Zlo
279 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo
280___
281push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
282
283$code.=<<___; # (p16),(p17),(p18)
284{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
285 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
286 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
287{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
288 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
289{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
290 xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo
291{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
292 ld1 $in=[$inp],-1 } //(p16) *inp--
293{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
294 mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi
295 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
296{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
297 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
298 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
299{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
300 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
301___
302push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
303
304for ($i=1;$i<14;$i++) {
305# Above and below fragments are derived from this one by removing
306# unsuitable (p??) instructions.
307$code.=<<___; # (p16),(p17),(p18),(p19)
308{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
309 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
310 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
311{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
312 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
313 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
314{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
315 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
316 dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo
317{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
318 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
319 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
320{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
321 ld1 $in=[$inp],-1 //(p16) *inp--
322 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
323{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
324 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
325 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
326{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
327 ld1 $xi[0]=[$Xip],-1 //(p16) *Xi--
328 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
329{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
330 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
331 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
332___
333push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
334}
335
336$code.=<<___; # (p17),(p18),(p19)
337{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
338 ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo
339 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
340{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
341 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
342 xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i]
343{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
344 ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
345 dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo
346{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4
347 xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo
348 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
349{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
350 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
351{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4)
352 xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi
353 and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0
354{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi
355 shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
356{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
357 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
358 add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi]
359___
360push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
361
362$code.=<<___; # (p18),(p19)
363{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi
364 shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8
365{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
366 xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo
367{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi
368 xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo
369{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
370 xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi
371{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi
372 shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48
373{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4
374 xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi
375{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi
376 shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
377{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
378 xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48
379___
380push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers
381
382$code.=<<___; # (p19)
383{ .mmi; cmp.ltu p6,p0=$inp,$len
384 add $inp=32,$inp
385 shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4
386{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem]
387 xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo
388 add $Xip=9,$Xip };; // &Xi.lo
389{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem]
390(p6) ld1 $in=[$inp],-1 //[p16] *inp--
391(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14]
392{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi
393(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15]
394{ .mmi; st8 [$Xip]=$Zlo,-8
395(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i]
396 shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48
397{ .mmi;
398(p6) ld1 $in=[$inp],-1 //[p16] *inp--
399 xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48
400(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo
401{ .mib;
402(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0
403(p6) br.cond.dptk.many .LOOP };;
404
405{ .mib; st8 [$Xip]=$Zhi };;
406{ .mib; $rum 1<<1 // return to little-endian
407 .restore sp
408 mov sp=prevsp
409 br.ret.sptk.many b0 };;
410.endp gcm_ghash_4bit#
411___
412$code.=<<___;
413.align 128
414.type rem_4bit#,\@object
415rem_4bit:
416 data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
417 data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
418 data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
419 data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
420.size rem_4bit#,128
421.type rem_8bit#,\@object
422rem_8bit:
423 data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
424 data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
425 data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
426 data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
427 data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
428 data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
429 data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
430 data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
431 data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
432 data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
433 data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
434 data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
435 data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
436 data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
437 data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
438 data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
439 data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
440 data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
441 data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
442 data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
443 data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
444 data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
445 data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
446 data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
447 data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
448 data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
449 data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
450 data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
451 data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
452 data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
453 data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
454 data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
455.size rem_8bit#,512
456stringz "GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
457___
458
459$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian);
460$code =~ s/\`([^\`]*)\`/eval $1/gem;
461
462print $code;
463close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl
deleted file mode 100644
index 965802d3fa..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-parisc.pl
+++ /dev/null
@@ -1,741 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# April 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
15# it processes one byte in 19.6 cycles, which is more than twice as
16# fast as code generated by gcc 3.2. The PA-RISC 2.0 loop is scheduled for
17# 8 cycles, but measured performance on a PA-8600 system is ~9 cycles per
18# processed byte. This is ~2.2x faster than 64-bit code generated by the
19# vendor compiler (which used to be very hard to beat:-).
20#
21# Special thanks to polarhome.com for providing HP-UX account.
22
23$flavour = shift;
24$output = shift;
25open STDOUT,">$output";
26
27if ($flavour =~ /64/) {
28 $LEVEL ="2.0W";
29 $SIZE_T =8;
30 $FRAME_MARKER =80;
31 $SAVED_RP =16;
32 $PUSH ="std";
33 $PUSHMA ="std,ma";
34 $POP ="ldd";
35 $POPMB ="ldd,mb";
36 $NREGS =6;
37} else {
38 $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0";
39 $SIZE_T =4;
40 $FRAME_MARKER =48;
41 $SAVED_RP =20;
42 $PUSH ="stw";
43 $PUSHMA ="stwm";
44 $POP ="ldw";
45 $POPMB ="ldwm";
46 $NREGS =11;
47}
48
49$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
50 # [+ argument transfer]
51
52################# volatile registers
53$Xi="%r26"; # argument block
54$Htbl="%r25";
55$inp="%r24";
56$len="%r23";
57$Hhh=$Htbl; # variables
58$Hll="%r22";
59$Zhh="%r21";
60$Zll="%r20";
61$cnt="%r19";
62$rem_4bit="%r28";
63$rem="%r29";
64$mask0xf0="%r31";
65
66################# preserved registers
67$Thh="%r1";
68$Tll="%r2";
69$nlo="%r3";
70$nhi="%r4";
71$byte="%r5";
72if ($SIZE_T==4) {
73 $Zhl="%r6";
74 $Zlh="%r7";
75 $Hhl="%r8";
76 $Hlh="%r9";
77 $Thl="%r10";
78 $Tlh="%r11";
79}
80$rem2="%r6"; # used in PA-RISC 2.0 code
81
82$code.=<<___;
83 .LEVEL $LEVEL
84#if 0
85 .SPACE \$TEXT\$
86 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
87#else
88 .text
89#endif
90
91 .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
92 .ALIGN 64
93gcm_gmult_4bit
94 .PROC
95 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
96 .ENTRY
97 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
98 $PUSHMA %r3,$FRAME(%sp)
99 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
100 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
101 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
102___
103$code.=<<___ if ($SIZE_T==4);
104 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
105 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
106 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
107 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
108 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
109___
110$code.=<<___;
111 blr %r0,$rem_4bit
112 ldi 3,$rem
113L\$pic_gmult
114 andcm $rem_4bit,$rem,$rem_4bit
115 addl $inp,$len,$len
116 ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
117 ldi 0xf0,$mask0xf0
118___
119$code.=<<___ if ($SIZE_T==4);
120#ifndef __OpenBSD__
121 ldi 31,$rem
122 mtctl $rem,%cr11
123 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
124 b L\$parisc1_gmult
125 nop
126___
127
128$code.=<<___;
129 ldb 15($Xi),$nlo
130 ldo 8($Htbl),$Hll
131
132 and $mask0xf0,$nlo,$nhi
133 depd,z $nlo,59,4,$nlo
134
135 ldd $nlo($Hll),$Zll
136 ldd $nlo($Hhh),$Zhh
137
138 depd,z $Zll,60,4,$rem
139 shrpd $Zhh,$Zll,4,$Zll
140 extrd,u $Zhh,59,60,$Zhh
141 ldb 14($Xi),$nlo
142
143 ldd $nhi($Hll),$Tll
144 ldd $nhi($Hhh),$Thh
145 and $mask0xf0,$nlo,$nhi
146 depd,z $nlo,59,4,$nlo
147
148 xor $Tll,$Zll,$Zll
149 xor $Thh,$Zhh,$Zhh
150 ldd $rem($rem_4bit),$rem
151 b L\$oop_gmult_pa2
152 ldi 13,$cnt
153
154 .ALIGN 8
155L\$oop_gmult_pa2
156 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
157 depd,z $Zll,60,4,$rem
158
159 shrpd $Zhh,$Zll,4,$Zll
160 extrd,u $Zhh,59,60,$Zhh
161 ldd $nlo($Hll),$Tll
162 ldd $nlo($Hhh),$Thh
163
164 xor $Tll,$Zll,$Zll
165 xor $Thh,$Zhh,$Zhh
166 ldd $rem($rem_4bit),$rem
167
168 xor $rem,$Zhh,$Zhh
169 depd,z $Zll,60,4,$rem
170 ldbx $cnt($Xi),$nlo
171
172 shrpd $Zhh,$Zll,4,$Zll
173 extrd,u $Zhh,59,60,$Zhh
174 ldd $nhi($Hll),$Tll
175 ldd $nhi($Hhh),$Thh
176
177 and $mask0xf0,$nlo,$nhi
178 depd,z $nlo,59,4,$nlo
179 ldd $rem($rem_4bit),$rem
180
181 xor $Tll,$Zll,$Zll
182 addib,uv -1,$cnt,L\$oop_gmult_pa2
183 xor $Thh,$Zhh,$Zhh
184
185 xor $rem,$Zhh,$Zhh
186 depd,z $Zll,60,4,$rem
187
188 shrpd $Zhh,$Zll,4,$Zll
189 extrd,u $Zhh,59,60,$Zhh
190 ldd $nlo($Hll),$Tll
191 ldd $nlo($Hhh),$Thh
192
193 xor $Tll,$Zll,$Zll
194 xor $Thh,$Zhh,$Zhh
195 ldd $rem($rem_4bit),$rem
196
197 xor $rem,$Zhh,$Zhh
198 depd,z $Zll,60,4,$rem
199
200 shrpd $Zhh,$Zll,4,$Zll
201 extrd,u $Zhh,59,60,$Zhh
202 ldd $nhi($Hll),$Tll
203 ldd $nhi($Hhh),$Thh
204
205 xor $Tll,$Zll,$Zll
206 xor $Thh,$Zhh,$Zhh
207 ldd $rem($rem_4bit),$rem
208
209 xor $rem,$Zhh,$Zhh
210 std $Zll,8($Xi)
211 std $Zhh,0($Xi)
212___
213
214$code.=<<___ if ($SIZE_T==4);
215 b L\$done_gmult
216 nop
217
218L\$parisc1_gmult
219#endif
220 ldb 15($Xi),$nlo
221 ldo 12($Htbl),$Hll
222 ldo 8($Htbl),$Hlh
223 ldo 4($Htbl),$Hhl
224
225 and $mask0xf0,$nlo,$nhi
226 zdep $nlo,27,4,$nlo
227
228 ldwx $nlo($Hll),$Zll
229 ldwx $nlo($Hlh),$Zlh
230 ldwx $nlo($Hhl),$Zhl
231 ldwx $nlo($Hhh),$Zhh
232 zdep $Zll,28,4,$rem
233 ldb 14($Xi),$nlo
234 ldwx $rem($rem_4bit),$rem
235 shrpw $Zlh,$Zll,4,$Zll
236 ldwx $nhi($Hll),$Tll
237 shrpw $Zhl,$Zlh,4,$Zlh
238 ldwx $nhi($Hlh),$Tlh
239 shrpw $Zhh,$Zhl,4,$Zhl
240 ldwx $nhi($Hhl),$Thl
241 extru $Zhh,27,28,$Zhh
242 ldwx $nhi($Hhh),$Thh
243 xor $rem,$Zhh,$Zhh
244 and $mask0xf0,$nlo,$nhi
245 zdep $nlo,27,4,$nlo
246
247 xor $Tll,$Zll,$Zll
248 ldwx $nlo($Hll),$Tll
249 xor $Tlh,$Zlh,$Zlh
250 ldwx $nlo($Hlh),$Tlh
251 xor $Thl,$Zhl,$Zhl
252 b L\$oop_gmult_pa1
253 ldi 13,$cnt
254
255 .ALIGN 8
256L\$oop_gmult_pa1
257 zdep $Zll,28,4,$rem
258 ldwx $nlo($Hhl),$Thl
259 xor $Thh,$Zhh,$Zhh
260 ldwx $rem($rem_4bit),$rem
261 shrpw $Zlh,$Zll,4,$Zll
262 ldwx $nlo($Hhh),$Thh
263 shrpw $Zhl,$Zlh,4,$Zlh
264 ldbx $cnt($Xi),$nlo
265 xor $Tll,$Zll,$Zll
266 ldwx $nhi($Hll),$Tll
267 shrpw $Zhh,$Zhl,4,$Zhl
268 xor $Tlh,$Zlh,$Zlh
269 ldwx $nhi($Hlh),$Tlh
270 extru $Zhh,27,28,$Zhh
271 xor $Thl,$Zhl,$Zhl
272 ldwx $nhi($Hhl),$Thl
273 xor $rem,$Zhh,$Zhh
274 zdep $Zll,28,4,$rem
275 xor $Thh,$Zhh,$Zhh
276 ldwx $nhi($Hhh),$Thh
277 shrpw $Zlh,$Zll,4,$Zll
278 ldwx $rem($rem_4bit),$rem
279 shrpw $Zhl,$Zlh,4,$Zlh
280 shrpw $Zhh,$Zhl,4,$Zhl
281 and $mask0xf0,$nlo,$nhi
282 extru $Zhh,27,28,$Zhh
283 zdep $nlo,27,4,$nlo
284 xor $Tll,$Zll,$Zll
285 ldwx $nlo($Hll),$Tll
286 xor $Tlh,$Zlh,$Zlh
287 ldwx $nlo($Hlh),$Tlh
288 xor $rem,$Zhh,$Zhh
289 addib,uv -1,$cnt,L\$oop_gmult_pa1
290 xor $Thl,$Zhl,$Zhl
291
292 zdep $Zll,28,4,$rem
293 ldwx $nlo($Hhl),$Thl
294 xor $Thh,$Zhh,$Zhh
295 ldwx $rem($rem_4bit),$rem
296 shrpw $Zlh,$Zll,4,$Zll
297 ldwx $nlo($Hhh),$Thh
298 shrpw $Zhl,$Zlh,4,$Zlh
299 xor $Tll,$Zll,$Zll
300 ldwx $nhi($Hll),$Tll
301 shrpw $Zhh,$Zhl,4,$Zhl
302 xor $Tlh,$Zlh,$Zlh
303 ldwx $nhi($Hlh),$Tlh
304 extru $Zhh,27,28,$Zhh
305 xor $rem,$Zhh,$Zhh
306 xor $Thl,$Zhl,$Zhl
307 ldwx $nhi($Hhl),$Thl
308 xor $Thh,$Zhh,$Zhh
309 ldwx $nhi($Hhh),$Thh
310 zdep $Zll,28,4,$rem
311 ldwx $rem($rem_4bit),$rem
312 shrpw $Zlh,$Zll,4,$Zll
313 shrpw $Zhl,$Zlh,4,$Zlh
314 shrpw $Zhh,$Zhl,4,$Zhl
315 extru $Zhh,27,28,$Zhh
316 xor $Tll,$Zll,$Zll
317 xor $Tlh,$Zlh,$Zlh
318 xor $rem,$Zhh,$Zhh
319 stw $Zll,12($Xi)
320 xor $Thl,$Zhl,$Zhl
321 stw $Zlh,8($Xi)
322 xor $Thh,$Zhh,$Zhh
323 stw $Zhl,4($Xi)
324 stw $Zhh,0($Xi)
325___
326$code.=<<___;
327L\$done_gmult
328 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
329 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
330 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
331 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
332___
333$code.=<<___ if ($SIZE_T==4);
334 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
335 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
336 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
337 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
338 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
339___
340$code.=<<___;
341 bv (%r2)
342 .EXIT
343 $POPMB -$FRAME(%sp),%r3
344 .PROCEND
345
346 .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
347 .ALIGN 64
348gcm_ghash_4bit
349 .PROC
350 .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
351 .ENTRY
352 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
353 $PUSHMA %r3,$FRAME(%sp)
354 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
355 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
356 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
357___
358$code.=<<___ if ($SIZE_T==4);
359 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
360 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
361 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
362 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
363 $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
364___
365$code.=<<___;
366 blr %r0,$rem_4bit
367 ldi 3,$rem
368L\$pic_ghash
369 andcm $rem_4bit,$rem,$rem_4bit
370 addl $inp,$len,$len
371 ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
372 ldi 0xf0,$mask0xf0
373___
374$code.=<<___ if ($SIZE_T==4);
375#ifndef __OpenBSD__
376 ldi 31,$rem
377 mtctl $rem,%cr11
378 extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0
379 b L\$parisc1_ghash
380 nop
381___
382
383$code.=<<___;
384 ldb 15($Xi),$nlo
385 ldo 8($Htbl),$Hll
386
387L\$outer_ghash_pa2
388 ldb 15($inp),$nhi
389 xor $nhi,$nlo,$nlo
390 and $mask0xf0,$nlo,$nhi
391 depd,z $nlo,59,4,$nlo
392
393 ldd $nlo($Hll),$Zll
394 ldd $nlo($Hhh),$Zhh
395
396 depd,z $Zll,60,4,$rem
397 shrpd $Zhh,$Zll,4,$Zll
398 extrd,u $Zhh,59,60,$Zhh
399 ldb 14($Xi),$nlo
400 ldb 14($inp),$byte
401
402 ldd $nhi($Hll),$Tll
403 ldd $nhi($Hhh),$Thh
404 xor $byte,$nlo,$nlo
405 and $mask0xf0,$nlo,$nhi
406 depd,z $nlo,59,4,$nlo
407
408 xor $Tll,$Zll,$Zll
409 xor $Thh,$Zhh,$Zhh
410 ldd $rem($rem_4bit),$rem
411 b L\$oop_ghash_pa2
412 ldi 13,$cnt
413
414 .ALIGN 8
415L\$oop_ghash_pa2
416 xor $rem,$Zhh,$Zhh ; moved here to work around gas bug
417 depd,z $Zll,60,4,$rem2
418
419 shrpd $Zhh,$Zll,4,$Zll
420 extrd,u $Zhh,59,60,$Zhh
421 ldd $nlo($Hll),$Tll
422 ldd $nlo($Hhh),$Thh
423
424 xor $Tll,$Zll,$Zll
425 xor $Thh,$Zhh,$Zhh
426 ldbx $cnt($Xi),$nlo
427 ldbx $cnt($inp),$byte
428
429 depd,z $Zll,60,4,$rem
430 shrpd $Zhh,$Zll,4,$Zll
431 ldd $rem2($rem_4bit),$rem2
432
433 xor $rem2,$Zhh,$Zhh
434 xor $byte,$nlo,$nlo
435 ldd $nhi($Hll),$Tll
436 ldd $nhi($Hhh),$Thh
437
438 and $mask0xf0,$nlo,$nhi
439 depd,z $nlo,59,4,$nlo
440
441 extrd,u $Zhh,59,60,$Zhh
442 xor $Tll,$Zll,$Zll
443
444 ldd $rem($rem_4bit),$rem
445 addib,uv -1,$cnt,L\$oop_ghash_pa2
446 xor $Thh,$Zhh,$Zhh
447
448 xor $rem,$Zhh,$Zhh
449 depd,z $Zll,60,4,$rem2
450
451 shrpd $Zhh,$Zll,4,$Zll
452 extrd,u $Zhh,59,60,$Zhh
453 ldd $nlo($Hll),$Tll
454 ldd $nlo($Hhh),$Thh
455
456 xor $Tll,$Zll,$Zll
457 xor $Thh,$Zhh,$Zhh
458
459 depd,z $Zll,60,4,$rem
460 shrpd $Zhh,$Zll,4,$Zll
461 ldd $rem2($rem_4bit),$rem2
462
463 xor $rem2,$Zhh,$Zhh
464 ldd $nhi($Hll),$Tll
465 ldd $nhi($Hhh),$Thh
466
467 extrd,u $Zhh,59,60,$Zhh
468 xor $Tll,$Zll,$Zll
469 xor $Thh,$Zhh,$Zhh
470 ldd $rem($rem_4bit),$rem
471
472 xor $rem,$Zhh,$Zhh
473 std $Zll,8($Xi)
474 ldo 16($inp),$inp
475 std $Zhh,0($Xi)
476 cmpb,*<> $inp,$len,L\$outer_ghash_pa2
477 copy $Zll,$nlo
478___
479
480$code.=<<___ if ($SIZE_T==4);
481 b L\$done_ghash
482 nop
483
484L\$parisc1_ghash
485#endif
486 ldb 15($Xi),$nlo
487 ldo 12($Htbl),$Hll
488 ldo 8($Htbl),$Hlh
489 ldo 4($Htbl),$Hhl
490
491L\$outer_ghash_pa1
492 ldb 15($inp),$byte
493 xor $byte,$nlo,$nlo
494 and $mask0xf0,$nlo,$nhi
495 zdep $nlo,27,4,$nlo
496
497 ldwx $nlo($Hll),$Zll
498 ldwx $nlo($Hlh),$Zlh
499 ldwx $nlo($Hhl),$Zhl
500 ldwx $nlo($Hhh),$Zhh
501 zdep $Zll,28,4,$rem
502 ldb 14($Xi),$nlo
503 ldb 14($inp),$byte
504 ldwx $rem($rem_4bit),$rem
505 shrpw $Zlh,$Zll,4,$Zll
506 ldwx $nhi($Hll),$Tll
507 shrpw $Zhl,$Zlh,4,$Zlh
508 ldwx $nhi($Hlh),$Tlh
509 shrpw $Zhh,$Zhl,4,$Zhl
510 ldwx $nhi($Hhl),$Thl
511 extru $Zhh,27,28,$Zhh
512 ldwx $nhi($Hhh),$Thh
513 xor $byte,$nlo,$nlo
514 xor $rem,$Zhh,$Zhh
515 and $mask0xf0,$nlo,$nhi
516 zdep $nlo,27,4,$nlo
517
518 xor $Tll,$Zll,$Zll
519 ldwx $nlo($Hll),$Tll
520 xor $Tlh,$Zlh,$Zlh
521 ldwx $nlo($Hlh),$Tlh
522 xor $Thl,$Zhl,$Zhl
523 b L\$oop_ghash_pa1
524 ldi 13,$cnt
525
526 .ALIGN 8
527L\$oop_ghash_pa1
528 zdep $Zll,28,4,$rem
529 ldwx $nlo($Hhl),$Thl
530 xor $Thh,$Zhh,$Zhh
531 ldwx $rem($rem_4bit),$rem
532 shrpw $Zlh,$Zll,4,$Zll
533 ldwx $nlo($Hhh),$Thh
534 shrpw $Zhl,$Zlh,4,$Zlh
535 ldbx $cnt($Xi),$nlo
536 xor $Tll,$Zll,$Zll
537 ldwx $nhi($Hll),$Tll
538 shrpw $Zhh,$Zhl,4,$Zhl
539 ldbx $cnt($inp),$byte
540 xor $Tlh,$Zlh,$Zlh
541 ldwx $nhi($Hlh),$Tlh
542 extru $Zhh,27,28,$Zhh
543 xor $Thl,$Zhl,$Zhl
544 ldwx $nhi($Hhl),$Thl
545 xor $rem,$Zhh,$Zhh
546 zdep $Zll,28,4,$rem
547 xor $Thh,$Zhh,$Zhh
548 ldwx $nhi($Hhh),$Thh
549 shrpw $Zlh,$Zll,4,$Zll
550 ldwx $rem($rem_4bit),$rem
551 shrpw $Zhl,$Zlh,4,$Zlh
552 xor $byte,$nlo,$nlo
553 shrpw $Zhh,$Zhl,4,$Zhl
554 and $mask0xf0,$nlo,$nhi
555 extru $Zhh,27,28,$Zhh
556 zdep $nlo,27,4,$nlo
557 xor $Tll,$Zll,$Zll
558 ldwx $nlo($Hll),$Tll
559 xor $Tlh,$Zlh,$Zlh
560 ldwx $nlo($Hlh),$Tlh
561 xor $rem,$Zhh,$Zhh
562 addib,uv -1,$cnt,L\$oop_ghash_pa1
563 xor $Thl,$Zhl,$Zhl
564
565 zdep $Zll,28,4,$rem
566 ldwx $nlo($Hhl),$Thl
567 xor $Thh,$Zhh,$Zhh
568 ldwx $rem($rem_4bit),$rem
569 shrpw $Zlh,$Zll,4,$Zll
570 ldwx $nlo($Hhh),$Thh
571 shrpw $Zhl,$Zlh,4,$Zlh
572 xor $Tll,$Zll,$Zll
573 ldwx $nhi($Hll),$Tll
574 shrpw $Zhh,$Zhl,4,$Zhl
575 xor $Tlh,$Zlh,$Zlh
576 ldwx $nhi($Hlh),$Tlh
577 extru $Zhh,27,28,$Zhh
578 xor $rem,$Zhh,$Zhh
579 xor $Thl,$Zhl,$Zhl
580 ldwx $nhi($Hhl),$Thl
581 xor $Thh,$Zhh,$Zhh
582 ldwx $nhi($Hhh),$Thh
583 zdep $Zll,28,4,$rem
584 ldwx $rem($rem_4bit),$rem
585 shrpw $Zlh,$Zll,4,$Zll
586 shrpw $Zhl,$Zlh,4,$Zlh
587 shrpw $Zhh,$Zhl,4,$Zhl
588 extru $Zhh,27,28,$Zhh
589 xor $Tll,$Zll,$Zll
590 xor $Tlh,$Zlh,$Zlh
591 xor $rem,$Zhh,$Zhh
592 stw $Zll,12($Xi)
593 xor $Thl,$Zhl,$Zhl
594 stw $Zlh,8($Xi)
595 xor $Thh,$Zhh,$Zhh
596 stw $Zhl,4($Xi)
597 ldo 16($inp),$inp
598 stw $Zhh,0($Xi)
599 comb,<> $inp,$len,L\$outer_ghash_pa1
600 copy $Zll,$nlo
601___
602$code.=<<___;
603L\$done_ghash
604 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
605 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4
606 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5
607 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6
608___
609$code.=<<___ if ($SIZE_T==4);
610 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7
611 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8
612 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9
613 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10
614 $POP `-$FRAME+8*$SIZE_T`(%sp),%r11
615___
616$code.=<<___;
617 bv (%r2)
618 .EXIT
619 $POPMB -$FRAME(%sp),%r3
620 .PROCEND
621
622 .ALIGN 64
623L\$rem_4bit
624 .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
625 .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
626 .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
627 .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
628
629 .data
 630	.STRINGZ "GHASH for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
631 .ALIGN 64
632___
633
634# Explicitly encode PA-RISC 2.0 instructions used in this module, so
635# that it can be compiled with .LEVEL 1.0. It should be noted that I
636# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
637# directive...
638
639my $ldd = sub {
640 my ($mod,$args) = @_;
641 my $orig = "ldd$mod\t$args";
642
643 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
644 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
645 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
646 }
647 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
648 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
649 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
650 $opcode|=(1<<5) if ($mod =~ /^,m/);
651 $opcode|=(1<<13) if ($mod =~ /^,mb/);
652 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
653 }
654 else { "\t".$orig; }
655};
656
657my $std = sub {
658 my ($mod,$args) = @_;
659 my $orig = "std$mod\t$args";
660
661 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
662 { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
663 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
664 }
665 else { "\t".$orig; }
666};
667
668my $extrd = sub {
669 my ($mod,$args) = @_;
670 my $orig = "extrd$mod\t$args";
671
672 # I only have ",u" completer, it's implicitly encoded...
673 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
674 { my $opcode=(0x36<<26)|($1<<21)|($4<<16);
675 my $len=32-$3;
676 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
677 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
678 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
679 }
680 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
681 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
682 my $len=32-$2;
683 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
684 $opcode |= (1<<13) if ($mod =~ /,\**=/);
685 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
686 }
687 else { "\t".$orig; }
688};
689
690my $shrpd = sub {
691 my ($mod,$args) = @_;
692 my $orig = "shrpd$mod\t$args";
693
694 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
695 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
696 my $cpos=63-$3;
697 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
698 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
699 }
700 elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11
701 { sprintf "\t.WORD\t0x%08x\t; %s",
702 (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
703 }
704 else { "\t".$orig; }
705};
706
707my $depd = sub {
708 my ($mod,$args) = @_;
709 my $orig = "depd$mod\t$args";
710
711 # I only have ",z" completer, it's implicitly encoded...
712 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16
713 { my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
714 my $cpos=63-$2;
715 my $len=32-$3;
716 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos
717 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
718 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
719 }
720 else { "\t".$orig; }
721};
722
723sub assemble {
724 my ($mnemonic,$mod,$args)=@_;
725 my $opcode = eval("\$$mnemonic");
726
727 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
728}
729
730foreach (split("\n",$code)) {
731 s/\`([^\`]*)\`/eval $1/ge;
732 if ($SIZE_T==4) {
733 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
734 s/cmpb,\*/comb,/;
735 s/,\*/,/;
736 }
737 s/\bbv\b/bve/ if ($SIZE_T==8);
738 print $_,"\n";
739}
740
741close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-s390x.pl b/src/lib/libcrypto/modes/asm/ghash-s390x.pl
deleted file mode 100644
index 6a40d5d89c..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-s390x.pl
+++ /dev/null
@@ -1,262 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# September 2010.
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# was measured to be ~18 cycles per processed byte on z10, which is
16# almost 40% better than gcc-generated code. It should be noted that
 17# 18 cycles is a worse result than expected: the loop is scheduled for 12
 18# and the result should be close to 12. In the absence of instruction-
19# level profiling data it's impossible to tell why...
20
21# November 2010.
22#
23# Adapt for -m31 build. If kernel supports what's called "highgprs"
24# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
25# instructions and achieve "64-bit" performance even in 31-bit legacy
26# application context. The feature is not specific to any particular
27# processor, as long as it's "z-CPU". Latter implies that the code
28# remains z/Architecture specific. On z990 it was measured to perform
29# 2.8x better than 32-bit code generated by gcc 4.3.
30
31# March 2011.
32#
33# Support for hardware KIMD-GHASH is verified to produce correct
 34# results and is therefore engaged. On z196 it was measured to process
 35# an 8KB buffer ~7 times faster than the software implementation. It's not
 36# as impressive for smaller buffer sizes, and for the smallest 16-byte
 37# buffer it's actually almost 2 times slower, which is why
38# KIMD-GHASH is not used in gcm_gmult_4bit.
39
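# The bracketed table sizes quoted above follow from the 4-bit method's data
# layout: the per-key Htable holds the 16 nibble multiples of H at 16 bytes
# each (256 bytes), while the shared table is rem_4bit, 16 eight-byte
# reduction constants (128 bytes). The constants, defined at the bottom of
# this file, are the carry-less products of the index and the GCM constant
# 0x1C2, shifted into position (<<12 here, <<16 in the other ports in this
# diff). A standalone sketch reproducing them; the clmul helper name is
# purely illustrative and not part of the module:

use strict;

sub clmul {                             # carry-less multiplication in GF(2)[x]
        my ($a, $b) = @_;
        my $r = 0;
        for (my $i = 0; $b >> $i; $i++) {
                $r ^= $a << $i if (($b >> $i) & 1);
        }
        return $r;
}

my @rem_4bit = map { clmul($_, 0x1C2) << 4 } (0 .. 15);
printf "0x%04X\n", $_ for @rem_4bit;    # 0x0000, 0x1C20, 0x3840, 0x2460, ...
printf "per-key Htable: %d bytes, shared rem_4bit: %d bytes\n",
        16 * 16, 8 * @rem_4bit;         # 256 and 128
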
40$flavour = shift;
41
42if ($flavour =~ /3[12]/) {
43 $SIZE_T=4;
44 $g="";
45} else {
46 $SIZE_T=8;
47 $g="g";
48}
49
50while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
51open STDOUT,">$output";
52
53$softonly=0;
54
55$Zhi="%r0";
56$Zlo="%r1";
57
58$Xi="%r2"; # argument block
59$Htbl="%r3";
60$inp="%r4";
61$len="%r5";
62
63$rem0="%r6"; # variables
64$rem1="%r7";
65$nlo="%r8";
66$nhi="%r9";
67$xi="%r10";
68$cnt="%r11";
69$tmp="%r12";
70$x78="%r13";
71$rem_4bit="%r14";
72
73$sp="%r15";
74
75$code.=<<___;
76.text
77
78.globl gcm_gmult_4bit
79.align 32
80gcm_gmult_4bit:
81___
82$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
83 larl %r1,OPENSSL_s390xcap_P
84 lg %r0,0(%r1)
85 tmhl %r0,0x4000 # check for message-security-assist
86 jz .Lsoft_gmult
87 lghi %r0,0
88 la %r1,16($sp)
89 .long 0xb93e0004 # kimd %r0,%r4
90 lg %r1,24($sp)
91 tmhh %r1,0x4000 # check for function 65
92 jz .Lsoft_gmult
93 stg %r0,16($sp) # arrange 16 bytes of zero input
94 stg %r0,24($sp)
95 lghi %r0,65 # function 65
96 la %r1,0($Xi) # H lies right after Xi in gcm128_context
97 la $inp,16($sp)
98 lghi $len,16
99 .long 0xb93e0004 # kimd %r0,$inp
100 brc 1,.-4 # pay attention to "partial completion"
101 br %r14
102.align 32
103.Lsoft_gmult:
104___
105$code.=<<___;
106 stm${g} %r6,%r14,6*$SIZE_T($sp)
107
108 aghi $Xi,-1
109 lghi $len,1
110 lghi $x78,`0xf<<3`
111 larl $rem_4bit,rem_4bit
112
113 lg $Zlo,8+1($Xi) # Xi
114 j .Lgmult_shortcut
115.type gcm_gmult_4bit,\@function
116.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
117
118.globl gcm_ghash_4bit
119.align 32
120gcm_ghash_4bit:
121___
122$code.=<<___ if(!$softonly);
123 larl %r1,OPENSSL_s390xcap_P
124 lg %r0,0(%r1)
125 tmhl %r0,0x4000 # check for message-security-assist
126 jz .Lsoft_ghash
127 lghi %r0,0
128 la %r1,16($sp)
129 .long 0xb93e0004 # kimd %r0,%r4
130 lg %r1,24($sp)
131 tmhh %r1,0x4000 # check for function 65
132 jz .Lsoft_ghash
133 lghi %r0,65 # function 65
134 la %r1,0($Xi) # H lies right after Xi in gcm128_context
135 .long 0xb93e0004 # kimd %r0,$inp
136 brc 1,.-4 # pay attention to "partial completion"
137 br %r14
138.align 32
139.Lsoft_ghash:
140___
141$code.=<<___ if ($flavour =~ /3[12]/);
142 llgfr $len,$len
143___
144$code.=<<___;
145 stm${g} %r6,%r14,6*$SIZE_T($sp)
146
147 aghi $Xi,-1
148 srlg $len,$len,4
149 lghi $x78,`0xf<<3`
150 larl $rem_4bit,rem_4bit
151
152 lg $Zlo,8+1($Xi) # Xi
153 lg $Zhi,0+1($Xi)
154 lghi $tmp,0
155.Louter:
156 xg $Zhi,0($inp) # Xi ^= inp
157 xg $Zlo,8($inp)
158 xgr $Zhi,$tmp
159 stg $Zlo,8+1($Xi)
160 stg $Zhi,0+1($Xi)
161
162.Lgmult_shortcut:
163 lghi $tmp,0xf0
164 sllg $nlo,$Zlo,4
165 srlg $xi,$Zlo,8 # extract second byte
166 ngr $nlo,$tmp
167 lgr $nhi,$Zlo
168 lghi $cnt,14
169 ngr $nhi,$tmp
170
171 lg $Zlo,8($nlo,$Htbl)
172 lg $Zhi,0($nlo,$Htbl)
173
174 sllg $nlo,$xi,4
175 sllg $rem0,$Zlo,3
176 ngr $nlo,$tmp
177 ngr $rem0,$x78
178 ngr $xi,$tmp
179
180 sllg $tmp,$Zhi,60
181 srlg $Zlo,$Zlo,4
182 srlg $Zhi,$Zhi,4
183 xg $Zlo,8($nhi,$Htbl)
184 xg $Zhi,0($nhi,$Htbl)
185 lgr $nhi,$xi
186 sllg $rem1,$Zlo,3
187 xgr $Zlo,$tmp
188 ngr $rem1,$x78
189 j .Lghash_inner
190.align 16
191.Lghash_inner:
192 srlg $Zlo,$Zlo,4
193 sllg $tmp,$Zhi,60
194 xg $Zlo,8($nlo,$Htbl)
195 srlg $Zhi,$Zhi,4
196 llgc $xi,0($cnt,$Xi)
197 xg $Zhi,0($nlo,$Htbl)
198 sllg $nlo,$xi,4
199 xg $Zhi,0($rem0,$rem_4bit)
200 nill $nlo,0xf0
201 sllg $rem0,$Zlo,3
202 xgr $Zlo,$tmp
203 ngr $rem0,$x78
204 nill $xi,0xf0
205
206 sllg $tmp,$Zhi,60
207 srlg $Zlo,$Zlo,4
208 srlg $Zhi,$Zhi,4
209 xg $Zlo,8($nhi,$Htbl)
210 xg $Zhi,0($nhi,$Htbl)
211 lgr $nhi,$xi
212 xg $Zhi,0($rem1,$rem_4bit)
213 sllg $rem1,$Zlo,3
214 xgr $Zlo,$tmp
215 ngr $rem1,$x78
216 brct $cnt,.Lghash_inner
217
218 sllg $tmp,$Zhi,60
219 srlg $Zlo,$Zlo,4
220 srlg $Zhi,$Zhi,4
221 xg $Zlo,8($nlo,$Htbl)
222 xg $Zhi,0($nlo,$Htbl)
223 sllg $xi,$Zlo,3
224 xg $Zhi,0($rem0,$rem_4bit)
225 xgr $Zlo,$tmp
226 ngr $xi,$x78
227
228 sllg $tmp,$Zhi,60
229 srlg $Zlo,$Zlo,4
230 srlg $Zhi,$Zhi,4
231 xg $Zlo,8($nhi,$Htbl)
232 xg $Zhi,0($nhi,$Htbl)
233 xgr $Zlo,$tmp
234 xg $Zhi,0($rem1,$rem_4bit)
235
236 lg $tmp,0($xi,$rem_4bit)
237 la $inp,16($inp)
238 sllg $tmp,$tmp,4 # correct last rem_4bit[rem]
239 brctg $len,.Louter
240
241 xgr $Zhi,$tmp
242 stg $Zlo,8+1($Xi)
243 stg $Zhi,0+1($Xi)
244 lm${g} %r6,%r14,6*$SIZE_T($sp)
245 br %r14
246.type gcm_ghash_4bit,\@function
247.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
248
249.align 64
250rem_4bit:
251 .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
252 .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
253 .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
254 .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
255.type rem_4bit,\@object
256.size rem_4bit,(.-rem_4bit)
257.string "GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
258___
259
260$code =~ s/\`([^\`]*)\`/eval $1/gem;
261print $code;
262close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
deleted file mode 100644
index 70e7b044a3..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
+++ /dev/null
@@ -1,330 +0,0 @@
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# March 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+128 bytes shared table]. Performance
15# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
16# and are expressed in cycles per processed byte, less is better:
17#
18# gcc 3.3.x cc 5.2 this assembler
19#
20# 32-bit build 81.4 43.3 12.6 (+546%/+244%)
21# 64-bit build 20.2 21.2 12.6 (+60%/+68%)
22#
23# Here is data collected on UltraSPARC T1 system running Linux:
24#
25# gcc 4.4.1 this assembler
26#
27# 32-bit build 566 50 (+1000%)
28# 64-bit build 56 50 (+12%)
29#
 30# I don't quite understand why the difference between 32-bit and 64-bit
31# compiler-generated code is so big. Compilers *were* instructed to
32# generate code for UltraSPARC and should have used 64-bit registers
33# for Z vector (see C code) even in 32-bit build... Oh well, it only
34# means more impressive improvement coefficients for this assembler
35# module;-) Loops are aggressively modulo-scheduled in respect to
36# references to input data and Z.hi updates to achieve 12 cycles
37# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
38# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
39
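# The improvement coefficients in the tables above are simply the ratio of
# compiler-generated to assembler cycle counts, minus one. A throwaway check,
# not part of the module, with the numbers copied from the tables (hash keys
# are just labels):

use strict;

my %runs = (                            # [compiler c/b, this assembler c/b]
        'gcc 3.3.x, 32-bit'    => [81.4, 12.6],         # quoted as +546%
        'cc 5.2, 32-bit'       => [43.3, 12.6],         # quoted as +244%
        'gcc 3.3.x, 64-bit'    => [20.2, 12.6],         # quoted as +60%
        'gcc 4.4.1, T1 64-bit' => [56,   50],           # quoted as +12%
);
for my $k (sort keys %runs) {
        my ($c, $asm) = @{$runs{$k}};
        printf "%-22s +%.0f%%\n", $k, 100 * ($c / $asm - 1);
}
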
40$bits=32;
41for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
42if ($bits==64) { $bias=2047; $frame=192; }
43else { $bias=0; $frame=112; }
44
45$output=shift;
46open STDOUT,">$output";
47
48$Zhi="%o0"; # 64-bit values
49$Zlo="%o1";
50$Thi="%o2";
51$Tlo="%o3";
52$rem="%o4";
53$tmp="%o5";
54
55$nhi="%l0"; # small values and pointers
56$nlo="%l1";
57$xi0="%l2";
58$xi1="%l3";
59$rem_4bit="%l4";
60$remi="%l5";
61$Htblo="%l6";
62$cnt="%l7";
63
64$Xi="%i0"; # input argument block
65$Htbl="%i1";
66$inp="%i2";
67$len="%i3";
68
69$code.=<<___;
70.section ".text",#alloc,#execinstr
71
72.align 64
73rem_4bit:
74 .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
75 .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
76 .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
77 .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
78.type rem_4bit,#object
79.size rem_4bit,(.-rem_4bit)
80
81.globl gcm_ghash_4bit
82.align 32
83gcm_ghash_4bit:
84 save %sp,-$frame,%sp
85 ldub [$inp+15],$nlo
86 ldub [$Xi+15],$xi0
87 ldub [$Xi+14],$xi1
88 add $len,$inp,$len
89 add $Htbl,8,$Htblo
90
911: call .+8
92 add %o7,rem_4bit-1b,$rem_4bit
93
94.Louter:
95 xor $xi0,$nlo,$nlo
96 and $nlo,0xf0,$nhi
97 and $nlo,0x0f,$nlo
98 sll $nlo,4,$nlo
99 ldx [$Htblo+$nlo],$Zlo
100 ldx [$Htbl+$nlo],$Zhi
101
102 ldub [$inp+14],$nlo
103
104 ldx [$Htblo+$nhi],$Tlo
105 and $Zlo,0xf,$remi
106 ldx [$Htbl+$nhi],$Thi
107 sll $remi,3,$remi
108 ldx [$rem_4bit+$remi],$rem
109 srlx $Zlo,4,$Zlo
110 mov 13,$cnt
111 sllx $Zhi,60,$tmp
112 xor $Tlo,$Zlo,$Zlo
113 srlx $Zhi,4,$Zhi
114 xor $Zlo,$tmp,$Zlo
115
116 xor $xi1,$nlo,$nlo
117 and $Zlo,0xf,$remi
118 and $nlo,0xf0,$nhi
119 and $nlo,0x0f,$nlo
120 ba .Lghash_inner
121 sll $nlo,4,$nlo
122.align 32
123.Lghash_inner:
124 ldx [$Htblo+$nlo],$Tlo
125 sll $remi,3,$remi
126 xor $Thi,$Zhi,$Zhi
127 ldx [$Htbl+$nlo],$Thi
128 srlx $Zlo,4,$Zlo
129 xor $rem,$Zhi,$Zhi
130 ldx [$rem_4bit+$remi],$rem
131 sllx $Zhi,60,$tmp
132 xor $Tlo,$Zlo,$Zlo
133 ldub [$inp+$cnt],$nlo
134 srlx $Zhi,4,$Zhi
135 xor $Zlo,$tmp,$Zlo
136 ldub [$Xi+$cnt],$xi1
137 xor $Thi,$Zhi,$Zhi
138 and $Zlo,0xf,$remi
139
140 ldx [$Htblo+$nhi],$Tlo
141 sll $remi,3,$remi
142 xor $rem,$Zhi,$Zhi
143 ldx [$Htbl+$nhi],$Thi
144 srlx $Zlo,4,$Zlo
145 ldx [$rem_4bit+$remi],$rem
146 sllx $Zhi,60,$tmp
147 xor $xi1,$nlo,$nlo
148 srlx $Zhi,4,$Zhi
149 and $nlo,0xf0,$nhi
150 addcc $cnt,-1,$cnt
151 xor $Zlo,$tmp,$Zlo
152 and $nlo,0x0f,$nlo
153 xor $Tlo,$Zlo,$Zlo
154 sll $nlo,4,$nlo
155 blu .Lghash_inner
156 and $Zlo,0xf,$remi
157
158 ldx [$Htblo+$nlo],$Tlo
159 sll $remi,3,$remi
160 xor $Thi,$Zhi,$Zhi
161 ldx [$Htbl+$nlo],$Thi
162 srlx $Zlo,4,$Zlo
163 xor $rem,$Zhi,$Zhi
164 ldx [$rem_4bit+$remi],$rem
165 sllx $Zhi,60,$tmp
166 xor $Tlo,$Zlo,$Zlo
167 srlx $Zhi,4,$Zhi
168 xor $Zlo,$tmp,$Zlo
169 xor $Thi,$Zhi,$Zhi
170
171 add $inp,16,$inp
172 cmp $inp,$len
173 be,pn `$bits==64?"%xcc":"%icc"`,.Ldone
174 and $Zlo,0xf,$remi
175
176 ldx [$Htblo+$nhi],$Tlo
177 sll $remi,3,$remi
178 xor $rem,$Zhi,$Zhi
179 ldx [$Htbl+$nhi],$Thi
180 srlx $Zlo,4,$Zlo
181 ldx [$rem_4bit+$remi],$rem
182 sllx $Zhi,60,$tmp
183 xor $Tlo,$Zlo,$Zlo
184 ldub [$inp+15],$nlo
185 srlx $Zhi,4,$Zhi
186 xor $Zlo,$tmp,$Zlo
187 xor $Thi,$Zhi,$Zhi
188 stx $Zlo,[$Xi+8]
189 xor $rem,$Zhi,$Zhi
190 stx $Zhi,[$Xi]
191 srl $Zlo,8,$xi1
192 and $Zlo,0xff,$xi0
193 ba .Louter
194 and $xi1,0xff,$xi1
195.align 32
196.Ldone:
197 ldx [$Htblo+$nhi],$Tlo
198 sll $remi,3,$remi
199 xor $rem,$Zhi,$Zhi
200 ldx [$Htbl+$nhi],$Thi
201 srlx $Zlo,4,$Zlo
202 ldx [$rem_4bit+$remi],$rem
203 sllx $Zhi,60,$tmp
204 xor $Tlo,$Zlo,$Zlo
205 srlx $Zhi,4,$Zhi
206 xor $Zlo,$tmp,$Zlo
207 xor $Thi,$Zhi,$Zhi
208 stx $Zlo,[$Xi+8]
209 xor $rem,$Zhi,$Zhi
210 stx $Zhi,[$Xi]
211
212 ret
213 restore
214.type gcm_ghash_4bit,#function
215.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
216___
217
218undef $inp;
219undef $len;
220
221$code.=<<___;
222.globl gcm_gmult_4bit
223.align 32
224gcm_gmult_4bit:
225 save %sp,-$frame,%sp
226 ldub [$Xi+15],$nlo
227 add $Htbl,8,$Htblo
228
2291: call .+8
230 add %o7,rem_4bit-1b,$rem_4bit
231
232 and $nlo,0xf0,$nhi
233 and $nlo,0x0f,$nlo
234 sll $nlo,4,$nlo
235 ldx [$Htblo+$nlo],$Zlo
236 ldx [$Htbl+$nlo],$Zhi
237
238 ldub [$Xi+14],$nlo
239
240 ldx [$Htblo+$nhi],$Tlo
241 and $Zlo,0xf,$remi
242 ldx [$Htbl+$nhi],$Thi
243 sll $remi,3,$remi
244 ldx [$rem_4bit+$remi],$rem
245 srlx $Zlo,4,$Zlo
246 mov 13,$cnt
247 sllx $Zhi,60,$tmp
248 xor $Tlo,$Zlo,$Zlo
249 srlx $Zhi,4,$Zhi
250 xor $Zlo,$tmp,$Zlo
251
252 and $Zlo,0xf,$remi
253 and $nlo,0xf0,$nhi
254 and $nlo,0x0f,$nlo
255 ba .Lgmult_inner
256 sll $nlo,4,$nlo
257.align 32
258.Lgmult_inner:
259 ldx [$Htblo+$nlo],$Tlo
260 sll $remi,3,$remi
261 xor $Thi,$Zhi,$Zhi
262 ldx [$Htbl+$nlo],$Thi
263 srlx $Zlo,4,$Zlo
264 xor $rem,$Zhi,$Zhi
265 ldx [$rem_4bit+$remi],$rem
266 sllx $Zhi,60,$tmp
267 xor $Tlo,$Zlo,$Zlo
268 ldub [$Xi+$cnt],$nlo
269 srlx $Zhi,4,$Zhi
270 xor $Zlo,$tmp,$Zlo
271 xor $Thi,$Zhi,$Zhi
272 and $Zlo,0xf,$remi
273
274 ldx [$Htblo+$nhi],$Tlo
275 sll $remi,3,$remi
276 xor $rem,$Zhi,$Zhi
277 ldx [$Htbl+$nhi],$Thi
278 srlx $Zlo,4,$Zlo
279 ldx [$rem_4bit+$remi],$rem
280 sllx $Zhi,60,$tmp
281 srlx $Zhi,4,$Zhi
282 and $nlo,0xf0,$nhi
283 addcc $cnt,-1,$cnt
284 xor $Zlo,$tmp,$Zlo
285 and $nlo,0x0f,$nlo
286 xor $Tlo,$Zlo,$Zlo
287 sll $nlo,4,$nlo
288 blu .Lgmult_inner
289 and $Zlo,0xf,$remi
290
291 ldx [$Htblo+$nlo],$Tlo
292 sll $remi,3,$remi
293 xor $Thi,$Zhi,$Zhi
294 ldx [$Htbl+$nlo],$Thi
295 srlx $Zlo,4,$Zlo
296 xor $rem,$Zhi,$Zhi
297 ldx [$rem_4bit+$remi],$rem
298 sllx $Zhi,60,$tmp
299 xor $Tlo,$Zlo,$Zlo
300 srlx $Zhi,4,$Zhi
301 xor $Zlo,$tmp,$Zlo
302 xor $Thi,$Zhi,$Zhi
303 and $Zlo,0xf,$remi
304
305 ldx [$Htblo+$nhi],$Tlo
306 sll $remi,3,$remi
307 xor $rem,$Zhi,$Zhi
308 ldx [$Htbl+$nhi],$Thi
309 srlx $Zlo,4,$Zlo
310 ldx [$rem_4bit+$remi],$rem
311 sllx $Zhi,60,$tmp
312 xor $Tlo,$Zlo,$Zlo
313 srlx $Zhi,4,$Zhi
314 xor $Zlo,$tmp,$Zlo
315 xor $Thi,$Zhi,$Zhi
316 stx $Zlo,[$Xi+8]
317 xor $rem,$Zhi,$Zhi
318 stx $Zhi,[$Xi]
319
320 ret
321 restore
322.type gcm_gmult_4bit,#function
323.size gcm_gmult_4bit,(.-gcm_gmult_4bit)
324.asciz "GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
325.align 4
326___
327
328$code =~ s/\`([^\`]*)\`/eval $1/gem;
329print $code;
330close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl
deleted file mode 100644
index 83c727e07f..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86.pl
+++ /dev/null
@@ -1,1342 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, May, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that it
14# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
15# code paths: vanilla x86 and vanilla MMX. Former will be executed on
16# 486 and Pentium, latter on all others. MMX GHASH features so called
17# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
18# of per-key storage [+512 bytes shared table]. Performance results
19# are for streamed GHASH subroutine and are expressed in cycles per
20# processed byte, less is better:
21#
22# gcc 2.95.3(*) MMX assembler x86 assembler
23#
24# Pentium 105/111(**) - 50
25# PIII 68 /75 12.2 24
26# P4 125/125 17.8 84(***)
27# Opteron 66 /70 10.1 30
28# Core2 54 /67 8.4 18
29#
 30# (*)	gcc 3.4.x was observed to generate a few percent slower code,
 31#	which is one of the reasons why 2.95.3 results were chosen;
 32#	another reason is the lack of 3.4.x results for older CPUs;
33# comparison with MMX results is not completely fair, because C
34# results are for vanilla "256B" implementation, while
35# assembler results are for "528B";-)
36# (**) second number is result for code compiled with -fPIC flag,
37# which is actually more relevant, because assembler code is
38# position-independent;
39# (***) see comment in non-MMX routine for further details;
40#
41# To summarize, it's >2-5 times faster than gcc-generated code. To
42# anchor it to something else SHA1 assembler processes one byte in
43# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
44# particular, see comment at the end of the file...
45
46# May 2010
47#
48# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
49# The question is how close is it to theoretical limit? The pclmulqdq
50# instruction latency appears to be 14 cycles and there can't be more
51# than 2 of them executing at any given time. This means that single
52# Karatsuba multiplication would take 28 cycles *plus* few cycles for
53# pre- and post-processing. Then multiplication has to be followed by
54# modulo-reduction. Given that aggregated reduction method [see
55# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
56# white paper by Intel] allows you to perform reduction only once in
 57# a while, we can assume that asymptotic performance can be estimated
58# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
59# and Naggr is the aggregation factor.
60#
61# Before we proceed to this implementation let's have closer look at
62# the best-performing code suggested by Intel in their white paper.
63# By tracing inter-register dependencies Tmod is estimated as ~19
64# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
65# processed byte. As implied, this is quite optimistic estimate,
66# because it does not account for Karatsuba pre- and post-processing,
67# which for a single multiplication is ~5 cycles. Unfortunately Intel
68# does not provide performance data for GHASH alone. But benchmarking
69# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
 70# alone resulted in 2.46 cycles per byte out of a 16KB buffer. Note that
71# the result accounts even for pre-computing of degrees of the hash
72# key H, but its portion is negligible at 16KB buffer size.
73#
74# Moving on to the implementation in question. Tmod is estimated as
75# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
76# 2.16. How is it possible that measured performance is better than
77# optimistic theoretical estimate? There is one thing Intel failed
78# to recognize. By serializing GHASH with CTR in same subroutine
79# former's performance is really limited to above (Tmul + Tmod/Naggr)
80# equation. But if GHASH procedure is detached, the modulo-reduction
81# can be interleaved with Naggr-1 multiplications at instruction level
82# and under ideal conditions even disappear from the equation. So that
83# optimistic theoretical estimate for this implementation is ...
84# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
 85# at least for such small Naggr. I'd argue that (28+Tproc/Naggr)/16,
86# where Tproc is time required for Karatsuba pre- and post-processing,
87# is more realistic estimate. In this case it gives ... 1.91 cycles.
88# Or in other words, depending on how well we can interleave reduction
 89# and one of the two multiplications, the performance should be between
90# 1.91 and 2.16. As already mentioned, this implementation processes
91# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
92# - in 2.02. x86_64 performance is better, because larger register
93# bank allows to interleave reduction and multiplication better.
94#
95# Does it make sense to increase Naggr? To start with it's virtually
96# impossible in 32-bit mode, because of limited register bank
 97# capacity. Otherwise improvement has to be weighed against slower
98# setup, as well as code size and complexity increase. As even
99# optimistic estimate doesn't promise 30% performance improvement,
100# there are currently no plans to increase Naggr.
101#
102# Special thanks to David Woodhouse <dwmw2@infradead.org> for
103# providing access to a Westmere-based system on behalf of Intel
104# Open Source Technology Centre.
105
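# The asymptotic estimates above can be reproduced directly from the quoted
# latencies: cycles per byte = (Tmul + Tx/Naggr)/16, with Tmul=28 for one
# Karatsuba multiplication (three pclmulqdq at 14-cycle latency, at most two
# in flight). A small standalone sketch, not part of the module:

use strict;

sub cpb { my ($tmul, $tx, $naggr) = @_; return ($tmul + $tx / $naggr) / 16; }

printf "Intel: Tmod=19, Naggr=4 -> %.2f c/b\n", cpb(28, 19, 4);   # ~2.05
printf "here:  Tmod=13, Naggr=2 -> %.2f c/b\n", cpb(28, 13, 2);   # ~2.16
printf "here:  Tproc=5, Naggr=2 -> %.2f c/b\n", cpb(28,  5, 2);   # ~1.91
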
106# January 2010
107#
108# Tweaked to optimize transitions between integer and FP operations
109# on same XMM register, PCLMULQDQ subroutine was measured to process
110# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
111# The minor regression on Westmere is outweighed by ~15% improvement
112# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
113# similar manner resulted in almost 20% degradation on Sandy Bridge,
114# where original 64-bit code processes one byte in 1.95 cycles.
115
116$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
117push(@INC,"${dir}","${dir}../../perlasm");
118require "x86asm.pl";
119
120&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
121
122$sse2=0;
123for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
124
125($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
126$inp = "edi";
127$Htbl = "esi";
128
129$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
 130				# than unrolled, which has to be weighed against
131 # 2.5x x86-specific code size reduction.
132
133sub x86_loop {
134 my $off = shift;
135 my $rem = "eax";
136
137 &mov ($Zhh,&DWP(4,$Htbl,$Zll));
138 &mov ($Zhl,&DWP(0,$Htbl,$Zll));
139 &mov ($Zlh,&DWP(12,$Htbl,$Zll));
140 &mov ($Zll,&DWP(8,$Htbl,$Zll));
141 &xor ($rem,$rem); # avoid partial register stalls on PIII
142
143 # shrd practically kills P4, 2.5x deterioration, but P4 has
144 # MMX code-path to execute. shrd runs tad faster [than twice
145 # the shifts, move's and or's] on pre-MMX Pentium (as well as
146 # PIII and Core2), *but* minimizes code size, spares register
147 # and thus allows to fold the loop...
148 if (!$unroll) {
149 my $cnt = $inp;
150 &mov ($cnt,15);
151 &jmp (&label("x86_loop"));
152 &set_label("x86_loop",16);
153 for($i=1;$i<=2;$i++) {
154 &mov (&LB($rem),&LB($Zll));
155 &shrd ($Zll,$Zlh,4);
156 &and (&LB($rem),0xf);
157 &shrd ($Zlh,$Zhl,4);
158 &shrd ($Zhl,$Zhh,4);
159 &shr ($Zhh,4);
160 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
161
162 &mov (&LB($rem),&BP($off,"esp",$cnt));
163 if ($i&1) {
164 &and (&LB($rem),0xf0);
165 } else {
166 &shl (&LB($rem),4);
167 }
168
169 &xor ($Zll,&DWP(8,$Htbl,$rem));
170 &xor ($Zlh,&DWP(12,$Htbl,$rem));
171 &xor ($Zhl,&DWP(0,$Htbl,$rem));
172 &xor ($Zhh,&DWP(4,$Htbl,$rem));
173
174 if ($i&1) {
175 &dec ($cnt);
176 &js (&label("x86_break"));
177 } else {
178 &jmp (&label("x86_loop"));
179 }
180 }
181 &set_label("x86_break",16);
182 } else {
183 for($i=1;$i<32;$i++) {
184 &comment($i);
185 &mov (&LB($rem),&LB($Zll));
186 &shrd ($Zll,$Zlh,4);
187 &and (&LB($rem),0xf);
188 &shrd ($Zlh,$Zhl,4);
189 &shrd ($Zhl,$Zhh,4);
190 &shr ($Zhh,4);
191 &xor ($Zhh,&DWP($off+16,"esp",$rem,4));
192
193 if ($i&1) {
194 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
195 &and (&LB($rem),0xf0);
196 } else {
197 &mov (&LB($rem),&BP($off+15-($i>>1),"esp"));
198 &shl (&LB($rem),4);
199 }
200
201 &xor ($Zll,&DWP(8,$Htbl,$rem));
202 &xor ($Zlh,&DWP(12,$Htbl,$rem));
203 &xor ($Zhl,&DWP(0,$Htbl,$rem));
204 &xor ($Zhh,&DWP(4,$Htbl,$rem));
205 }
206 }
207 &bswap ($Zll);
208 &bswap ($Zlh);
209 &bswap ($Zhl);
210 if (!$x86only) {
211 &bswap ($Zhh);
212 } else {
213 &mov ("eax",$Zhh);
214 &bswap ("eax");
215 &mov ($Zhh,"eax");
216 }
217}
218
219if ($unroll) {
220 &function_begin_B("_x86_gmult_4bit_inner");
221 &x86_loop(4);
222 &ret ();
223 &function_end_B("_x86_gmult_4bit_inner");
224}
225
226sub deposit_rem_4bit {
227 my $bias = shift;
228
229 &mov (&DWP($bias+0, "esp"),0x0000<<16);
230 &mov (&DWP($bias+4, "esp"),0x1C20<<16);
231 &mov (&DWP($bias+8, "esp"),0x3840<<16);
232 &mov (&DWP($bias+12,"esp"),0x2460<<16);
233 &mov (&DWP($bias+16,"esp"),0x7080<<16);
234 &mov (&DWP($bias+20,"esp"),0x6CA0<<16);
235 &mov (&DWP($bias+24,"esp"),0x48C0<<16);
236 &mov (&DWP($bias+28,"esp"),0x54E0<<16);
237 &mov (&DWP($bias+32,"esp"),0xE100<<16);
238 &mov (&DWP($bias+36,"esp"),0xFD20<<16);
239 &mov (&DWP($bias+40,"esp"),0xD940<<16);
240 &mov (&DWP($bias+44,"esp"),0xC560<<16);
241 &mov (&DWP($bias+48,"esp"),0x9180<<16);
242 &mov (&DWP($bias+52,"esp"),0x8DA0<<16);
243 &mov (&DWP($bias+56,"esp"),0xA9C0<<16);
244 &mov (&DWP($bias+60,"esp"),0xB5E0<<16);
245}
246
247$suffix = $x86only ? "" : "_x86";
248
249&function_begin("gcm_gmult_4bit".$suffix);
250 &stack_push(16+4+1); # +1 for stack alignment
251 &mov ($inp,&wparam(0)); # load Xi
252 &mov ($Htbl,&wparam(1)); # load Htable
253
254 &mov ($Zhh,&DWP(0,$inp)); # load Xi[16]
255 &mov ($Zhl,&DWP(4,$inp));
256 &mov ($Zlh,&DWP(8,$inp));
257 &mov ($Zll,&DWP(12,$inp));
258
259 &deposit_rem_4bit(16);
260
261 &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack
262 &mov (&DWP(4,"esp"),$Zhl);
263 &mov (&DWP(8,"esp"),$Zlh);
264 &mov (&DWP(12,"esp"),$Zll);
265 &shr ($Zll,20);
266 &and ($Zll,0xf0);
267
268 if ($unroll) {
269 &call ("_x86_gmult_4bit_inner");
270 } else {
271 &x86_loop(0);
272 &mov ($inp,&wparam(0));
273 }
274
275 &mov (&DWP(12,$inp),$Zll);
276 &mov (&DWP(8,$inp),$Zlh);
277 &mov (&DWP(4,$inp),$Zhl);
278 &mov (&DWP(0,$inp),$Zhh);
279 &stack_pop(16+4+1);
280&function_end("gcm_gmult_4bit".$suffix);
281
282&function_begin("gcm_ghash_4bit".$suffix);
283 &stack_push(16+4+1); # +1 for 64-bit alignment
284 &mov ($Zll,&wparam(0)); # load Xi
285 &mov ($Htbl,&wparam(1)); # load Htable
286 &mov ($inp,&wparam(2)); # load in
287 &mov ("ecx",&wparam(3)); # load len
288 &add ("ecx",$inp);
289 &mov (&wparam(3),"ecx");
290
291 &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
292 &mov ($Zhl,&DWP(4,$Zll));
293 &mov ($Zlh,&DWP(8,$Zll));
294 &mov ($Zll,&DWP(12,$Zll));
295
296 &deposit_rem_4bit(16);
297
298 &set_label("x86_outer_loop",16);
299 &xor ($Zll,&DWP(12,$inp)); # xor with input
300 &xor ($Zlh,&DWP(8,$inp));
301 &xor ($Zhl,&DWP(4,$inp));
302 &xor ($Zhh,&DWP(0,$inp));
303 &mov (&DWP(12,"esp"),$Zll); # dump it on stack
304 &mov (&DWP(8,"esp"),$Zlh);
305 &mov (&DWP(4,"esp"),$Zhl);
306 &mov (&DWP(0,"esp"),$Zhh);
307
308 &shr ($Zll,20);
309 &and ($Zll,0xf0);
310
311 if ($unroll) {
312 &call ("_x86_gmult_4bit_inner");
313 } else {
314 &x86_loop(0);
315 &mov ($inp,&wparam(2));
316 }
317 &lea ($inp,&DWP(16,$inp));
318 &cmp ($inp,&wparam(3));
319 &mov (&wparam(2),$inp) if (!$unroll);
320 &jb (&label("x86_outer_loop"));
321
322 &mov ($inp,&wparam(0)); # load Xi
323 &mov (&DWP(12,$inp),$Zll);
324 &mov (&DWP(8,$inp),$Zlh);
325 &mov (&DWP(4,$inp),$Zhl);
326 &mov (&DWP(0,$inp),$Zhh);
327 &stack_pop(16+4+1);
328&function_end("gcm_ghash_4bit".$suffix);
329
330if (!$x86only) {{{
331
332&static_label("rem_4bit");
333
334if (!$sse2) {{ # pure-MMX "May" version...
335
336$S=12; # shift factor for rem_4bit
337
338&function_begin_B("_mmx_gmult_4bit_inner");
339# MMX version performs 3.5 times better on P4 (see comment in non-MMX
340# routine for further details), 100% better on Opteron, ~70% better
341# on Core2 and PIII... In other words effort is considered to be well
342# spent... Since initial release the loop was unrolled in order to
343# "liberate" register previously used as loop counter. Instead it's
344# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
345# The path involves move of Z.lo from MMX to integer register,
346# effective address calculation and finally merge of value to Z.hi.
347# Reference to rem_4bit is scheduled so late that I had to >>4
 348# rem_4bit elements. This resulted in 20-45% improvement
349# on contemporary µ-archs.
350{
351 my $cnt;
352 my $rem_4bit = "eax";
353 my @rem = ($Zhh,$Zll);
354 my $nhi = $Zhl;
355 my $nlo = $Zlh;
356
357 my ($Zlo,$Zhi) = ("mm0","mm1");
358 my $tmp = "mm2";
359
360 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
361 &mov ($nhi,$Zll);
362 &mov (&LB($nlo),&LB($nhi));
363 &shl (&LB($nlo),4);
364 &and ($nhi,0xf0);
365 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
366 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
367 &movd ($rem[0],$Zlo);
368
369 for ($cnt=28;$cnt>=-2;$cnt--) {
370 my $odd = $cnt&1;
371 my $nix = $odd ? $nlo : $nhi;
372
373 &shl (&LB($nlo),4) if ($odd);
374 &psrlq ($Zlo,4);
375 &movq ($tmp,$Zhi);
376 &psrlq ($Zhi,4);
377 &pxor ($Zlo,&QWP(8,$Htbl,$nix));
378 &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0);
379 &psllq ($tmp,60);
380 &and ($nhi,0xf0) if ($odd);
381 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
382 &and ($rem[0],0xf);
383 &pxor ($Zhi,&QWP(0,$Htbl,$nix));
384 &mov ($nhi,$nlo) if (!$odd && $cnt>=0);
385 &movd ($rem[1],$Zlo);
386 &pxor ($Zlo,$tmp);
387
388 push (@rem,shift(@rem)); # "rotate" registers
389 }
390
391 &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem]
392
393 &psrlq ($Zlo,32); # lower part of Zlo is already there
394 &movd ($Zhl,$Zhi);
395 &psrlq ($Zhi,32);
396 &movd ($Zlh,$Zlo);
397 &movd ($Zhh,$Zhi);
398 &shl ($inp,4); # compensate for rem_4bit[i] being >>4
399
400 &bswap ($Zll);
401 &bswap ($Zhl);
402 &bswap ($Zlh);
403 &xor ($Zhh,$inp);
404 &bswap ($Zhh);
405
406 &ret ();
407}
408&function_end_B("_mmx_gmult_4bit_inner");
409
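# As the comment above notes, with $S=12 the rem_4bit constants are deposited
# pre-shifted right by 4 so the table reference can be issued later on the
# critical path; the late "shl ($inp,4)" then restores the <<16 position used
# by the other ports. A quick standalone check that the round trip loses no
# bits for these 16-bit constants (not part of the module):

use strict;

for my $c (0x0000, 0x1C20, 0x3840, 0xB5E0) {    # sample rem_4bit constants
        die "bits lost" if ((($c << 12) << 4) != ($c << 16));
}
print "storing rem_4bit[] >>4 and compensating with shl 4 is lossless\n";
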
410&function_begin("gcm_gmult_4bit_mmx");
411 &mov ($inp,&wparam(0)); # load Xi
412 &mov ($Htbl,&wparam(1)); # load Htable
413
414 &call (&label("pic_point"));
415 &set_label("pic_point");
416 &blindpop("eax");
417 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
418
419 &movz ($Zll,&BP(15,$inp));
420
421 &call ("_mmx_gmult_4bit_inner");
422
423 &mov ($inp,&wparam(0)); # load Xi
424 &emms ();
425 &mov (&DWP(12,$inp),$Zll);
426 &mov (&DWP(4,$inp),$Zhl);
427 &mov (&DWP(8,$inp),$Zlh);
428 &mov (&DWP(0,$inp),$Zhh);
429&function_end("gcm_gmult_4bit_mmx");
430
431# Streamed version performs 20% better on P4, 7% on Opteron,
432# 10% on Core2 and PIII...
433&function_begin("gcm_ghash_4bit_mmx");
434 &mov ($Zhh,&wparam(0)); # load Xi
435 &mov ($Htbl,&wparam(1)); # load Htable
436 &mov ($inp,&wparam(2)); # load in
437 &mov ($Zlh,&wparam(3)); # load len
438
439 &call (&label("pic_point"));
440 &set_label("pic_point");
441 &blindpop("eax");
442 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
443
444 &add ($Zlh,$inp);
445 &mov (&wparam(3),$Zlh); # len to point at the end of input
446 &stack_push(4+1); # +1 for stack alignment
447
448 &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
449 &mov ($Zhl,&DWP(4,$Zhh));
450 &mov ($Zlh,&DWP(8,$Zhh));
451 &mov ($Zhh,&DWP(0,$Zhh));
452 &jmp (&label("mmx_outer_loop"));
453
454 &set_label("mmx_outer_loop",16);
455 &xor ($Zll,&DWP(12,$inp));
456 &xor ($Zhl,&DWP(4,$inp));
457 &xor ($Zlh,&DWP(8,$inp));
458 &xor ($Zhh,&DWP(0,$inp));
459 &mov (&wparam(2),$inp);
460 &mov (&DWP(12,"esp"),$Zll);
461 &mov (&DWP(4,"esp"),$Zhl);
462 &mov (&DWP(8,"esp"),$Zlh);
463 &mov (&DWP(0,"esp"),$Zhh);
464
465 &mov ($inp,"esp");
466 &shr ($Zll,24);
467
468 &call ("_mmx_gmult_4bit_inner");
469
470 &mov ($inp,&wparam(2));
471 &lea ($inp,&DWP(16,$inp));
472 &cmp ($inp,&wparam(3));
473 &jb (&label("mmx_outer_loop"));
474
475 &mov ($inp,&wparam(0)); # load Xi
476 &emms ();
477 &mov (&DWP(12,$inp),$Zll);
478 &mov (&DWP(4,$inp),$Zhl);
479 &mov (&DWP(8,$inp),$Zlh);
480 &mov (&DWP(0,$inp),$Zhh);
481
482 &stack_pop(4+1);
483&function_end("gcm_ghash_4bit_mmx");
484
485}} else {{ # "June" MMX version...
486 # ... has slower "April" gcm_gmult_4bit_mmx with folded
487 # loop. This is done to conserve code size...
488$S=16; # shift factor for rem_4bit
489
490sub mmx_loop() {
491# MMX version performs 2.8 times better on P4 (see comment in non-MMX
492# routine for further details), 40% better on Opteron and Core2, 50%
493# better on PIII... In other words effort is considered to be well
494# spent...
495 my $inp = shift;
496 my $rem_4bit = shift;
497 my $cnt = $Zhh;
498 my $nhi = $Zhl;
499 my $nlo = $Zlh;
500 my $rem = $Zll;
501
502 my ($Zlo,$Zhi) = ("mm0","mm1");
503 my $tmp = "mm2";
504
505 &xor ($nlo,$nlo); # avoid partial register stalls on PIII
506 &mov ($nhi,$Zll);
507 &mov (&LB($nlo),&LB($nhi));
508 &mov ($cnt,14);
509 &shl (&LB($nlo),4);
510 &and ($nhi,0xf0);
511 &movq ($Zlo,&QWP(8,$Htbl,$nlo));
512 &movq ($Zhi,&QWP(0,$Htbl,$nlo));
513 &movd ($rem,$Zlo);
514 &jmp (&label("mmx_loop"));
515
516 &set_label("mmx_loop",16);
517 &psrlq ($Zlo,4);
518 &and ($rem,0xf);
519 &movq ($tmp,$Zhi);
520 &psrlq ($Zhi,4);
521 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
522 &mov (&LB($nlo),&BP(0,$inp,$cnt));
523 &psllq ($tmp,60);
524 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
525 &dec ($cnt);
526 &movd ($rem,$Zlo);
527 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
528 &mov ($nhi,$nlo);
529 &pxor ($Zlo,$tmp);
530 &js (&label("mmx_break"));
531
532 &shl (&LB($nlo),4);
533 &and ($rem,0xf);
534 &psrlq ($Zlo,4);
535 &and ($nhi,0xf0);
536 &movq ($tmp,$Zhi);
537 &psrlq ($Zhi,4);
538 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
539 &psllq ($tmp,60);
540 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
541 &movd ($rem,$Zlo);
542 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
543 &pxor ($Zlo,$tmp);
544 &jmp (&label("mmx_loop"));
545
546 &set_label("mmx_break",16);
547 &shl (&LB($nlo),4);
548 &and ($rem,0xf);
549 &psrlq ($Zlo,4);
550 &and ($nhi,0xf0);
551 &movq ($tmp,$Zhi);
552 &psrlq ($Zhi,4);
553 &pxor ($Zlo,&QWP(8,$Htbl,$nlo));
554 &psllq ($tmp,60);
555 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
556 &movd ($rem,$Zlo);
557 &pxor ($Zhi,&QWP(0,$Htbl,$nlo));
558 &pxor ($Zlo,$tmp);
559
560 &psrlq ($Zlo,4);
561 &and ($rem,0xf);
562 &movq ($tmp,$Zhi);
563 &psrlq ($Zhi,4);
564 &pxor ($Zlo,&QWP(8,$Htbl,$nhi));
565 &psllq ($tmp,60);
566 &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
567 &movd ($rem,$Zlo);
568 &pxor ($Zhi,&QWP(0,$Htbl,$nhi));
569 &pxor ($Zlo,$tmp);
570
571 &psrlq ($Zlo,32); # lower part of Zlo is already there
572 &movd ($Zhl,$Zhi);
573 &psrlq ($Zhi,32);
574 &movd ($Zlh,$Zlo);
575 &movd ($Zhh,$Zhi);
576
577 &bswap ($Zll);
578 &bswap ($Zhl);
579 &bswap ($Zlh);
580 &bswap ($Zhh);
581}
582
583&function_begin("gcm_gmult_4bit_mmx");
584 &mov ($inp,&wparam(0)); # load Xi
585 &mov ($Htbl,&wparam(1)); # load Htable
586
587 &call (&label("pic_point"));
588 &set_label("pic_point");
589 &blindpop("eax");
590 &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
591
592 &movz ($Zll,&BP(15,$inp));
593
594 &mmx_loop($inp,"eax");
595
596 &emms ();
597 &mov (&DWP(12,$inp),$Zll);
598 &mov (&DWP(4,$inp),$Zhl);
599 &mov (&DWP(8,$inp),$Zlh);
600 &mov (&DWP(0,$inp),$Zhh);
601&function_end("gcm_gmult_4bit_mmx");
602
603######################################################################
604# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
605# (see gcm128.c for details). It provides further 20-40% performance
606# improvement over above mentioned "May" version.
607
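# The rem_8bit table referenced below is the byte-wide analogue of rem_4bit:
# entry i is the carry-less product of i and 0x1C2 (compare the data_short
# values at the end of this file), and the "528B" name presumably counts the
# 256-byte base Htable plus the additional 256+16 bytes mentioned in the
# header. A standalone sketch, with an illustrative clmul helper that is not
# part of the module:

use strict;

sub clmul {
        my ($a, $b) = @_;
        my $r = 0;
        for (my $i = 0; $b >> $i; $i++) {
                $r ^= $a << $i if (($b >> $i) & 1);
        }
        return $r;
}

my @rem_8bit = map { clmul($_, 0x1C2) } (0 .. 255);
printf "rem_8bit[0x%02X] = 0x%04X\n", $_, $rem_8bit[$_] for (1, 2, 255);
                                        # 0x01C2, 0x0384, 0xBEBE
printf "shared table: %d bytes, per-key storage: %d bytes\n",
        2 * @rem_8bit, 256 + 256 + 16;  # 512 and 528
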
608&static_label("rem_8bit");
609
610&function_begin("gcm_ghash_4bit_mmx");
611{ my ($Zlo,$Zhi) = ("mm7","mm6");
612 my $rem_8bit = "esi";
613 my $Htbl = "ebx";
614
615 # parameter block
616 &mov ("eax",&wparam(0)); # Xi
617 &mov ("ebx",&wparam(1)); # Htable
618 &mov ("ecx",&wparam(2)); # inp
619 &mov ("edx",&wparam(3)); # len
620 &mov ("ebp","esp"); # original %esp
621 &call (&label("pic_point"));
622 &set_label ("pic_point");
623 &blindpop ($rem_8bit);
624 &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
625
626 &sub ("esp",512+16+16); # allocate stack frame...
627 &and ("esp",-64); # ...and align it
628 &sub ("esp",16); # place for (u8)(H[]<<4)
629
630 &add ("edx","ecx"); # pointer to the end of input
631 &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi
632 &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len
633 &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp
634
635 { my @lo = ("mm0","mm1","mm2");
636 my @hi = ("mm3","mm4","mm5");
637 my @tmp = ("mm6","mm7");
638 my ($off1,$off2,$i) = (0,0,);
639
640 &add ($Htbl,128); # optimize for size
641 &lea ("edi",&DWP(16+128,"esp"));
642 &lea ("ebp",&DWP(16+256+128,"esp"));
643
644 # decompose Htable (low and high parts are kept separately),
645 # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
646 for ($i=0;$i<18;$i++) {
647
648 &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
649 &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16);
650 &psllq ($tmp[1],60) if ($i>1);
651 &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16);
652 &por ($lo[2],$tmp[1]) if ($i>1);
653 &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17);
654 &psrlq ($lo[1],4) if ($i>0 && $i<17);
655 &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17);
656 &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17);
657 &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1);
658 &psrlq ($hi[1],4) if ($i>0 && $i<17);
659 &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1);
660 &shl ("edx",4) if ($i<16);
661 &mov (&BP($i,"esp"),&LB("edx")) if ($i<16);
662
663 unshift (@lo,pop(@lo)); # "rotate" registers
664 unshift (@hi,pop(@hi));
665 unshift (@tmp,pop(@tmp));
666 $off1 += 8 if ($i>0);
667 $off2 += 8 if ($i>1);
668 }
669 }
670
671 &movq ($Zhi,&QWP(0,"eax"));
672 &mov ("ebx",&DWP(8,"eax"));
673 &mov ("edx",&DWP(12,"eax")); # load Xi
674
675&set_label("outer",16);
676 { my $nlo = "eax";
677 my $dat = "edx";
678 my @nhi = ("edi","ebp");
679 my @rem = ("ebx","ecx");
680 my @red = ("mm0","mm1","mm2");
681 my $tmp = "mm3";
682
683 &xor ($dat,&DWP(12,"ecx")); # merge input data
684 &xor ("ebx",&DWP(8,"ecx"));
685 &pxor ($Zhi,&QWP(0,"ecx"));
686 &lea ("ecx",&DWP(16,"ecx")); # inp+=16
687 #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi
688 &mov (&DWP(528+8,"esp"),"ebx");
689 &movq (&QWP(528+0,"esp"),$Zhi);
690 &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp
691
692 &xor ($nlo,$nlo);
693 &rol ($dat,8);
694 &mov (&LB($nlo),&LB($dat));
695 &mov ($nhi[1],$nlo);
696 &and (&LB($nlo),0x0f);
697 &shr ($nhi[1],4);
698 &pxor ($red[0],$red[0]);
699 &rol ($dat,8); # next byte
700 &pxor ($red[1],$red[1]);
701 &pxor ($red[2],$red[2]);
702
 703	# Just like in "May" version, modulo-schedule for critical path in
704 # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
705 # is scheduled so late that rem_8bit[] has to be shifted *right*
706 # by 16, which is why last argument to pinsrw is 2, which
707 # corresponds to <<32=<<48>>16...
708 for ($j=11,$i=0;$i<15;$i++) {
709
710 if ($i>0) {
711 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
712 &rol ($dat,8); # next byte
713 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
714
715 &pxor ($Zlo,$tmp);
716 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
717 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
718 } else {
719 &movq ($Zlo,&QWP(16,"esp",$nlo,8));
720 &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
721 }
722
723 &mov (&LB($nlo),&LB($dat));
724 &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
725
726 &movd ($rem[0],$Zlo);
727 &movz ($rem[1],&LB($rem[1])) if ($i>0);
728 &psrlq ($Zlo,8); # Z>>=8
729
730 &movq ($tmp,$Zhi);
731 &mov ($nhi[0],$nlo);
732 &psrlq ($Zhi,8);
733
734 &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4
735 &and (&LB($nlo),0x0f);
736 &psllq ($tmp,56);
737
738 &pxor ($Zhi,$red[1]) if ($i>1);
739 &shr ($nhi[0],4);
740 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0);
741
742 unshift (@red,pop(@red)); # "rotate" registers
743 unshift (@rem,pop(@rem));
744 unshift (@nhi,pop(@nhi));
745 }
746
747 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
748 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
749 &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
750
751 &pxor ($Zlo,$tmp);
752 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
753 &movz ($rem[1],&LB($rem[1]));
754
755 &pxor ($red[2],$red[2]); # clear 2nd word
756 &psllq ($red[1],4);
757
758 &movd ($rem[0],$Zlo);
759 &psrlq ($Zlo,4); # Z>>=4
760
761 &movq ($tmp,$Zhi);
762 &psrlq ($Zhi,4);
763 &shl ($rem[0],4); # rem<<4
764
765 &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
766 &psllq ($tmp,60);
767 &movz ($rem[0],&LB($rem[0]));
768
769 &pxor ($Zlo,$tmp);
770 &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8));
771
772 &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
773 &pxor ($Zhi,$red[1]);
774
775 &movd ($dat,$Zlo);
776 &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
777
778 &psllq ($red[0],12); # correct by <<16>>4
779 &pxor ($Zhi,$red[0]);
780 &psrlq ($Zlo,32);
781 &pxor ($Zhi,$red[2]);
782
783 &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp
784 &movd ("ebx",$Zlo);
785 &movq ($tmp,$Zhi); # 01234567
786 &psllw ($Zhi,8); # 1.3.5.7.
787 &psrlw ($tmp,8); # .0.2.4.6
788 &por ($Zhi,$tmp); # 10325476
789 &bswap ($dat);
790 &pshufw ($Zhi,$Zhi,0b00011011); # 76543210
791 &bswap ("ebx");
792
793 &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done?
794 &jne (&label("outer"));
795 }
796
797 &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi
798 &mov (&DWP(12,"eax"),"edx");
799 &mov (&DWP(8,"eax"),"ebx");
800 &movq (&QWP(0,"eax"),$Zhi);
801
802 &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp
803 &emms ();
804}
805&function_end("gcm_ghash_4bit_mmx");
806}}
807
808if ($sse2) {{
809######################################################################
810# PCLMULQDQ version.
811
812$Xip="eax";
813$Htbl="edx";
814$const="ecx";
815$inp="esi";
816$len="ebx";
817
818($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
819($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
820($Xn,$Xhn)=("xmm6","xmm7");
821
822&static_label("bswap");
823
824sub clmul64x64_T2 { # minimal "register" pressure
825my ($Xhi,$Xi,$Hkey)=@_;
826
827 &movdqa ($Xhi,$Xi); #
828 &pshufd ($T1,$Xi,0b01001110);
829 &pshufd ($T2,$Hkey,0b01001110);
830 &pxor ($T1,$Xi); #
831 &pxor ($T2,$Hkey);
832
833 &pclmulqdq ($Xi,$Hkey,0x00); #######
834 &pclmulqdq ($Xhi,$Hkey,0x11); #######
835 &pclmulqdq ($T1,$T2,0x00); #######
836 &xorps ($T1,$Xi); #
837 &xorps ($T1,$Xhi); #
838
839 &movdqa ($T2,$T1); #
840 &psrldq ($T1,8);
841 &pslldq ($T2,8); #
842 &pxor ($Xhi,$T1);
843 &pxor ($Xi,$T2); #
844}
845
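# clmul64x64_T2 above is a Karatsuba-style carry-less multiplication: the
# pshufd/pxor pairs form (Xi.lo^Xi.hi) and (Hkey.lo^Hkey.hi), so the full
# 128x128-bit product needs three pclmulqdq instead of four, because in
# GF(2)[x] the middle term ah*bl ^ al*bh equals (ah^al)*(bh^bl) ^ ah*bh ^
# al*bl. A width-reduced standalone sketch checking that identity for all
# 8-bit operands (pure Perl, not part of the module):

use strict;

sub clmul {                             # carry-less (polynomial) multiply
        my ($a, $b) = @_;
        my $r = 0;
        for (my $i = 0; $b >> $i; $i++) {
                $r ^= $a << $i if (($b >> $i) & 1);
        }
        return $r;
}

for my $a (0 .. 255) {
        for my $b (0 .. 255) {
                my ($ah, $al) = ($a >> 4, $a & 0xf);
                my ($bh, $bl) = ($b >> 4, $b & 0xf);
                my $hi  = clmul($ah, $bh);
                my $lo  = clmul($al, $bl);
                my $mid = clmul($ah ^ $al, $bh ^ $bl) ^ $hi ^ $lo;
                die "Karatsuba mismatch"
                    if ((($hi << 8) ^ ($mid << 4) ^ $lo) != clmul($a, $b));
        }
}
print "Karatsuba identity holds for all 8-bit operands\n";
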
846sub clmul64x64_T3 {
847# Even though this subroutine offers visually better ILP, it
848# was empirically found to be a tad slower than above version.
849# At least in gcm_ghash_clmul context. But it's just as well,
850# because loop modulo-scheduling is possible only thanks to
851# minimized "register" pressure...
852my ($Xhi,$Xi,$Hkey)=@_;
853
854 &movdqa ($T1,$Xi); #
855 &movdqa ($Xhi,$Xi);
856 &pclmulqdq ($Xi,$Hkey,0x00); #######
857 &pclmulqdq ($Xhi,$Hkey,0x11); #######
858 &pshufd ($T2,$T1,0b01001110); #
859 &pshufd ($T3,$Hkey,0b01001110);
860 &pxor ($T2,$T1); #
861 &pxor ($T3,$Hkey);
862 &pclmulqdq ($T2,$T3,0x00); #######
863 &pxor ($T2,$Xi); #
864 &pxor ($T2,$Xhi); #
865
866 &movdqa ($T3,$T2); #
867 &psrldq ($T2,8);
868 &pslldq ($T3,8); #
869 &pxor ($Xhi,$T2);
870 &pxor ($Xi,$T3); #
871}
872
873if (1) { # Algorithm 9 with <<1 twist.
874 # Reduction is shorter and uses only two
 875		# temporary registers, which makes it a better
876 # candidate for interleaving with 64x64
877 # multiplication. Pre-modulo-scheduled loop
878 # was found to be ~20% faster than Algorithm 5
879 # below. Algorithm 9 was therefore chosen for
880 # further optimization...
881
882sub reduction_alg9 { # 17/13 times faster than Intel version
883my ($Xhi,$Xi) = @_;
884
885 # 1st phase
886 &movdqa ($T1,$Xi); #
887 &psllq ($Xi,1);
888 &pxor ($Xi,$T1); #
889 &psllq ($Xi,5); #
890 &pxor ($Xi,$T1); #
891 &psllq ($Xi,57); #
892 &movdqa ($T2,$Xi); #
893 &pslldq ($Xi,8);
894 &psrldq ($T2,8); #
895 &pxor ($Xi,$T1);
896 &pxor ($Xhi,$T2); #
897
898 # 2nd phase
899 &movdqa ($T2,$Xi);
900 &psrlq ($Xi,5);
901 &pxor ($Xi,$T2); #
902 &psrlq ($Xi,1); #
903 &pxor ($Xi,$T2); #
904 &pxor ($T2,$Xhi);
905 &psrlq ($Xi,1); #
906 &pxor ($Xi,$T2); #
907}
908
909&function_begin_B("gcm_init_clmul");
910 &mov ($Htbl,&wparam(0));
911 &mov ($Xip,&wparam(1));
912
913 &call (&label("pic"));
914&set_label("pic");
915 &blindpop ($const);
916 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
917
918 &movdqu ($Hkey,&QWP(0,$Xip));
919 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
920
921 # <<1 twist
922 &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
923 &movdqa ($T1,$Hkey);
924 &psllq ($Hkey,1);
925 &pxor ($T3,$T3); #
926 &psrlq ($T1,63);
927 &pcmpgtd ($T3,$T2); # broadcast carry bit
928 &pslldq ($T1,8);
929 &por ($Hkey,$T1); # H<<=1
930
931 # magic reduction
932 &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
933 &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
934
935 # calculate H^2
936 &movdqa ($Xi,$Hkey);
937 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
938 &reduction_alg9 ($Xhi,$Xi);
939
940 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
941 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
942
943 &ret ();
944&function_end_B("gcm_init_clmul");
945
946&function_begin_B("gcm_gmult_clmul");
947 &mov ($Xip,&wparam(0));
948 &mov ($Htbl,&wparam(1));
949
950 &call (&label("pic"));
951&set_label("pic");
952 &blindpop ($const);
953 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
954
955 &movdqu ($Xi,&QWP(0,$Xip));
956 &movdqa ($T3,&QWP(0,$const));
957 &movups ($Hkey,&QWP(0,$Htbl));
958 &pshufb ($Xi,$T3);
959
960 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
961 &reduction_alg9 ($Xhi,$Xi);
962
963 &pshufb ($Xi,$T3);
964 &movdqu (&QWP(0,$Xip),$Xi);
965
966 &ret ();
967&function_end_B("gcm_gmult_clmul");
968
969&function_begin("gcm_ghash_clmul");
970 &mov ($Xip,&wparam(0));
971 &mov ($Htbl,&wparam(1));
972 &mov ($inp,&wparam(2));
973 &mov ($len,&wparam(3));
974
975 &call (&label("pic"));
976&set_label("pic");
977 &blindpop ($const);
978 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
979
980 &movdqu ($Xi,&QWP(0,$Xip));
981 &movdqa ($T3,&QWP(0,$const));
982 &movdqu ($Hkey,&QWP(0,$Htbl));
983 &pshufb ($Xi,$T3);
984
985 &sub ($len,0x10);
986 &jz (&label("odd_tail"));
987
988 #######
989 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
990 # [(H*Ii+1) + (H*Xi+1)] mod P =
991 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
992 #
993 &movdqu ($T1,&QWP(0,$inp)); # Ii
994 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
995 &pshufb ($T1,$T3);
996 &pshufb ($Xn,$T3);
997 &pxor ($Xi,$T1); # Ii+Xi
998
999 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
1000 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1001
1002 &lea ($inp,&DWP(32,$inp)); # i+=2
1003 &sub ($len,0x20);
1004 &jbe (&label("even_tail"));
1005
1006&set_label("mod_loop");
1007 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1008 &movdqu ($T1,&QWP(0,$inp)); # Ii
1009 &movups ($Hkey,&QWP(0,$Htbl)); # load H
1010
1011 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1012 &pxor ($Xhi,$Xhn);
1013
1014 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1015 &pshufb ($T1,$T3);
1016 &pshufb ($Xn,$T3);
1017
1018 &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
1019 &movdqa ($Xhn,$Xn);
1020 &pxor ($Xhi,$T1); # "Ii+Xi", consume early
1021
1022 &movdqa ($T1,$Xi); #&reduction_alg9($Xhi,$Xi); 1st phase
1023 &psllq ($Xi,1);
1024 &pxor ($Xi,$T1); #
1025 &psllq ($Xi,5); #
1026 &pxor ($Xi,$T1); #
1027 &pclmulqdq ($Xn,$Hkey,0x00); #######
1028 &psllq ($Xi,57); #
1029 &movdqa ($T2,$Xi); #
1030 &pslldq ($Xi,8);
1031 &psrldq ($T2,8); #
1032 &pxor ($Xi,$T1);
1033 &pshufd ($T1,$T3,0b01001110);
1034 &pxor ($Xhi,$T2); #
1035 &pxor ($T1,$T3);
1036 &pshufd ($T3,$Hkey,0b01001110);
1037 &pxor ($T3,$Hkey); #
1038
1039 &pclmulqdq ($Xhn,$Hkey,0x11); #######
1040 &movdqa ($T2,$Xi); # 2nd phase
1041 &psrlq ($Xi,5);
1042 &pxor ($Xi,$T2); #
1043 &psrlq ($Xi,1); #
1044 &pxor ($Xi,$T2); #
1045 &pxor ($T2,$Xhi);
1046 &psrlq ($Xi,1); #
1047 &pxor ($Xi,$T2); #
1048
1049 &pclmulqdq ($T1,$T3,0x00); #######
1050 &movups ($Hkey,&QWP(16,$Htbl)); # load H^2
1051 &xorps ($T1,$Xn); #
1052 &xorps ($T1,$Xhn); #
1053
1054 &movdqa ($T3,$T1); #
1055 &psrldq ($T1,8);
1056 &pslldq ($T3,8); #
1057 &pxor ($Xhn,$T1);
1058 &pxor ($Xn,$T3); #
1059 &movdqa ($T3,&QWP(0,$const));
1060
1061 &lea ($inp,&DWP(32,$inp));
1062 &sub ($len,0x20);
1063 &ja (&label("mod_loop"));
1064
1065&set_label("even_tail");
1066 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1067
1068 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1069 &pxor ($Xhi,$Xhn);
1070
1071 &reduction_alg9 ($Xhi,$Xi);
1072
1073 &test ($len,$len);
1074 &jnz (&label("done"));
1075
1076 &movups ($Hkey,&QWP(0,$Htbl)); # load H
1077&set_label("odd_tail");
1078 &movdqu ($T1,&QWP(0,$inp)); # Ii
1079 &pshufb ($T1,$T3);
1080 &pxor ($Xi,$T1); # Ii+Xi
1081
1082 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1083 &reduction_alg9 ($Xhi,$Xi);
1084
1085&set_label("done");
1086 &pshufb ($Xi,$T3);
1087 &movdqu (&QWP(0,$Xip),$Xi);
1088&function_end("gcm_ghash_clmul");
1089
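# The comment before mod_loop above relies on the aggregation identity
# Xi+2 = [H*Ii+1 + H^2*(Ii+Xi)] mod P, which is what lets the loop defer one
# reduction across two blocks. The identity is generic for any binary field;
# a standalone sketch checking it in GF(2^8) with the AES polynomial 0x11B,
# chosen purely for illustration (gfmul and the test values are not part of
# the module):

use strict;

sub gfmul {                             # multiply in GF(2^8) mod x^8+x^4+x^3+x+1
        my ($a, $b) = @_;
        my $r = 0;
        while ($b) {
                $r ^= $a if ($b & 1);
                $a <<= 1;
                $a ^= 0x11B if ($a & 0x100);
                $b >>= 1;
        }
        return $r;
}

my ($H, $X, $I0, $I1) = (0x57, 0xC3, 0x3A, 0x8E);       # arbitrary test values
my $serial     = gfmul(gfmul($X ^ $I0, $H) ^ $I1, $H);  # ((Xi^Ii)*H ^ Ii+1)*H
my $aggregated = gfmul($X ^ $I0, gfmul($H, $H)) ^ gfmul($I1, $H);
die "identity broken" if ($serial != $aggregated);
print "serial and aggregated forms agree\n";
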
 1090} else {		# Algorithm 5. Kept for reference purposes.
1091
1092sub reduction_alg5 { # 19/16 times faster than Intel version
1093my ($Xhi,$Xi)=@_;
1094
1095 # <<1
1096 &movdqa ($T1,$Xi); #
1097 &movdqa ($T2,$Xhi);
1098 &pslld ($Xi,1);
1099 &pslld ($Xhi,1); #
1100 &psrld ($T1,31);
1101 &psrld ($T2,31); #
1102 &movdqa ($T3,$T1);
1103 &pslldq ($T1,4);
1104 &psrldq ($T3,12); #
1105 &pslldq ($T2,4);
1106 &por ($Xhi,$T3); #
1107 &por ($Xi,$T1);
1108 &por ($Xhi,$T2); #
1109
1110 # 1st phase
1111 &movdqa ($T1,$Xi);
1112 &movdqa ($T2,$Xi);
1113 &movdqa ($T3,$Xi); #
1114 &pslld ($T1,31);
1115 &pslld ($T2,30);
1116 &pslld ($Xi,25); #
1117 &pxor ($T1,$T2);
1118 &pxor ($T1,$Xi); #
1119 &movdqa ($T2,$T1); #
1120 &pslldq ($T1,12);
1121 &psrldq ($T2,4); #
1122 &pxor ($T3,$T1);
1123
1124 # 2nd phase
1125 &pxor ($Xhi,$T3); #
1126 &movdqa ($Xi,$T3);
1127 &movdqa ($T1,$T3);
1128 &psrld ($Xi,1); #
1129 &psrld ($T1,2);
1130 &psrld ($T3,7); #
1131 &pxor ($Xi,$T1);
1132 &pxor ($Xhi,$T2);
1133 &pxor ($Xi,$T3); #
1134 &pxor ($Xi,$Xhi); #
1135}
1136
1137&function_begin_B("gcm_init_clmul");
1138 &mov ($Htbl,&wparam(0));
1139 &mov ($Xip,&wparam(1));
1140
1141 &call (&label("pic"));
1142&set_label("pic");
1143 &blindpop ($const);
1144 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1145
1146 &movdqu ($Hkey,&QWP(0,$Xip));
1147 &pshufd ($Hkey,$Hkey,0b01001110);# dword swap
1148
1149 # calculate H^2
1150 &movdqa ($Xi,$Hkey);
1151 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1152 &reduction_alg5 ($Xhi,$Xi);
1153
1154 &movdqu (&QWP(0,$Htbl),$Hkey); # save H
1155 &movdqu (&QWP(16,$Htbl),$Xi); # save H^2
1156
1157 &ret ();
1158&function_end_B("gcm_init_clmul");
1159
1160&function_begin_B("gcm_gmult_clmul");
1161 &mov ($Xip,&wparam(0));
1162 &mov ($Htbl,&wparam(1));
1163
1164 &call (&label("pic"));
1165&set_label("pic");
1166 &blindpop ($const);
1167 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1168
1169 &movdqu ($Xi,&QWP(0,$Xip));
1170 &movdqa ($Xn,&QWP(0,$const));
1171 &movdqu ($Hkey,&QWP(0,$Htbl));
1172 &pshufb ($Xi,$Xn);
1173
1174 &clmul64x64_T3 ($Xhi,$Xi,$Hkey);
1175 &reduction_alg5 ($Xhi,$Xi);
1176
1177 &pshufb ($Xi,$Xn);
1178 &movdqu (&QWP(0,$Xip),$Xi);
1179
1180 &ret ();
1181&function_end_B("gcm_gmult_clmul");
1182
1183&function_begin("gcm_ghash_clmul");
1184 &mov ($Xip,&wparam(0));
1185 &mov ($Htbl,&wparam(1));
1186 &mov ($inp,&wparam(2));
1187 &mov ($len,&wparam(3));
1188
1189 &call (&label("pic"));
1190&set_label("pic");
1191 &blindpop ($const);
1192 &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
1193
1194 &movdqu ($Xi,&QWP(0,$Xip));
1195 &movdqa ($T3,&QWP(0,$const));
1196 &movdqu ($Hkey,&QWP(0,$Htbl));
1197 &pshufb ($Xi,$T3);
1198
1199 &sub ($len,0x10);
1200 &jz (&label("odd_tail"));
1201
1202 #######
1203 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
1204 # [(H*Ii+1) + (H*Xi+1)] mod P =
1205 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
1206 #
1207 &movdqu ($T1,&QWP(0,$inp)); # Ii
1208 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1209 &pshufb ($T1,$T3);
1210 &pshufb ($Xn,$T3);
1211 &pxor ($Xi,$T1); # Ii+Xi
1212
1213 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1214 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1215
1216 &sub ($len,0x20);
1217 &lea ($inp,&DWP(32,$inp)); # i+=2
1218 &jbe (&label("even_tail"));
1219
1220&set_label("mod_loop");
1221 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1222 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1223
1224 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1225 &pxor ($Xhi,$Xhn);
1226
1227 &reduction_alg5 ($Xhi,$Xi);
1228
1229 #######
1230 &movdqa ($T3,&QWP(0,$const));
1231 &movdqu ($T1,&QWP(0,$inp)); # Ii
1232 &movdqu ($Xn,&QWP(16,$inp)); # Ii+1
1233 &pshufb ($T1,$T3);
1234 &pshufb ($Xn,$T3);
1235 &pxor ($Xi,$T1); # Ii+Xi
1236
1237 &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
1238 &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
1239
1240 &sub ($len,0x20);
1241 &lea ($inp,&DWP(32,$inp));
1242 &ja (&label("mod_loop"));
1243
1244&set_label("even_tail");
1245 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
1246
1247 &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
1248 &pxor ($Xhi,$Xhn);
1249
1250 &reduction_alg5 ($Xhi,$Xi);
1251
1252 &movdqa ($T3,&QWP(0,$const));
1253 &test ($len,$len);
1254 &jnz (&label("done"));
1255
1256 &movdqu ($Hkey,&QWP(0,$Htbl)); # load H
1257&set_label("odd_tail");
1258 &movdqu ($T1,&QWP(0,$inp)); # Ii
1259 &pshufb ($T1,$T3);
1260 &pxor ($Xi,$T1); # Ii+Xi
1261
1262 &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
1263 &reduction_alg5 ($Xhi,$Xi);
1264
1265 &movdqa ($T3,&QWP(0,$const));
1266&set_label("done");
1267 &pshufb ($Xi,$T3);
1268 &movdqu (&QWP(0,$Xip),$Xi);
1269&function_end("gcm_ghash_clmul");
1270
1271}
1272
1273&set_label("bswap",64);
1274 &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
1275 &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
1276}} # $sse2
1277
1278&set_label("rem_4bit",64);
1279 &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
1280 &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
1281 &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
1282 &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
1283&set_label("rem_8bit",64);
1284 &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
1285 &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
1286 &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
1287 &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
1288 &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
1289 &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
1290 &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
1291 &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
1292 &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
1293 &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
1294 &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
1295 &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
1296 &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
1297 &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
1298 &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
1299 &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
1300 &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
1301 &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
1302 &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
1303 &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
1304 &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
1305 &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
1306 &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
1307 &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
1308 &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
1309 &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
1310 &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
1311 &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
1312 &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
1313 &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
1314 &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
1315 &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
1316}}} # !$x86only
1317
1318&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
1319&asm_finish();
1320
1321# A question was raised about the choice of vanilla MMX, or rather why wasn't
1322# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
1323# CPUs such as PIII, "4-bit" MMX version was observed to provide better
1324# performance than *corresponding* SSE2 one even on contemporary CPUs.
1325# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
1326# implementation featuring full range of lookup-table sizes, but with
1327# per-invocation lookup table setup. The latter means that the table size is
1328# chosen depending on how much data is to be hashed in each call:
1329# more data, larger table. Best reported result for Core2 is ~4 cycles
1330# per processed byte out of 64KB block. This number accounts even for
1331# 64KB table setup overhead. As discussed in gcm128.c we choose to be
1332# more conservative in respect to lookup table sizes, but how do the
1333# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
1334# on same platform. As also discussed in gcm128.c, next in line "8-bit
1335# Shoup's" or "4KB" method should deliver twice the performance of
1336# "256B" one, in other words not worse than ~6 cycles per byte. It
1337# should also be noted that in the SSE2 case the improvement can be "super-
1338# linear," i.e. more than twice, mostly because >>8 maps to single
1339# instruction on SSE2 register. This is unlike "4-bit" case when >>4
1340# maps to same amount of instructions in both MMX and SSE2 cases.
1341# Bottom line is that switch to SSE2 is considered to be justifiable
1342# only in case we choose to implement "8-bit" method...
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
deleted file mode 100644
index 38d779edbc..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
+++ /dev/null
@@ -1,806 +0,0 @@
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# March, June 2010
11#
12# The module implements "4-bit" GCM GHASH function and underlying
13# single multiplication operation in GF(2^128). "4-bit" means that
14# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
15# function features so called "528B" variant utilizing additional
16# 256+16 bytes of per-key storage [+512 bytes shared table].
17# Performance results are for this streamed GHASH subroutine and are
18# expressed in cycles per processed byte, less is better:
19#
20# gcc 3.4.x(*) assembler
21#
22# P4 28.6 14.0 +100%
23# Opteron 19.3 7.7 +150%
24# Core2 17.8 8.1(**) +120%
25#
26# (*) comparison is not completely fair, because C results are
27# for vanilla "256B" implementation, while assembler results
28# are for "528B";-)
29# (**) it's a mystery [to me] why the Core2 result is not the same as for
30# Opteron;
31
32# May 2010
33#
34# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
35# See ghash-x86.pl for background information and details about coding
36# techniques.
37#
38# Special thanks to David Woodhouse <dwmw2@infradead.org> for
39# providing access to a Westmere-based system on behalf of Intel
40# Open Source Technology Centre.
41
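
The per-key and shared tables mentioned above only accelerate a single multiplication in GF(2^128). For reference, the underlying operation in the bit-reflected GHASH convention can be sketched in portable C as below; gf128_mul is a hypothetical helper written purely for illustration, not code from this module, and it is far slower than the table-driven assembly:

    #include <stdint.h>

    /* Z = X * Y in GF(2^128), GHASH bit ordering (NIST SP 800-38D). */
    static void
    gf128_mul(uint8_t Z[16], const uint8_t X[16], const uint8_t Y[16])
    {
            uint64_t Vh = 0, Vl = 0, Zh = 0, Zl = 0;
            int i;

            for (i = 0; i < 8; i++) {       /* load X big-endian as V */
                    Vh = Vh << 8 | X[i];
                    Vl = Vl << 8 | X[i + 8];
            }
            for (i = 0; i < 128; i++) {
                    if (Y[i / 8] & (0x80 >> (i % 8))) { /* bit i of Y, MSB first */
                            Zh ^= Vh;
                            Zl ^= Vl;
                    }
                    /* V *= x: shift right one bit, reduce by 0xE1||0..0 on carry */
                    if (Vl & 1) {
                            Vl = Vl >> 1 | Vh << 63;
                            Vh = Vh >> 1 ^ 0xE100000000000000ULL;
                    } else {
                            Vl = Vl >> 1 | Vh << 63;
                            Vh = Vh >> 1;
                    }
            }
            for (i = 0; i < 8; i++) {       /* store Z big-endian */
                    Z[i] = (uint8_t)(Zh >> (56 - 8 * i));
                    Z[i + 8] = (uint8_t)(Zl >> (56 - 8 * i));
            }
    }

The "4-bit" method replaces these 128 bit-iterations per block with 32 nibble-indexed lookups of precomputed multiples of H, using the rem_4bit (or rem_8bit) table to fold back the bits shifted out of the low end at each step.
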
42$flavour = shift;
43$output = shift;
44if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
45
46$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
47
48$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
50( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
51die "can't locate x86_64-xlate.pl";
52
53open OUT,"| \"$^X\" $xlate $flavour $output";
54*STDOUT=*OUT;
55
56# common register layout
57$nlo="%rax";
58$nhi="%rbx";
59$Zlo="%r8";
60$Zhi="%r9";
61$tmp="%r10";
62$rem_4bit = "%r11";
63
64$Xi="%rdi";
65$Htbl="%rsi";
66
67# per-function register layout
68$cnt="%rcx";
69$rem="%rdx";
70
71sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
72 $r =~ s/%[er]([sd]i)/%\1l/ or
73 $r =~ s/%[er](bp)/%\1l/ or
74 $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
75
76sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
77{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
78 my $arg = pop;
79 $arg = "\$$arg" if ($arg*1 eq $arg);
80 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
81}
82
83{ my $N;
84 sub loop() {
85 my $inp = shift;
86
87 $N++;
88$code.=<<___;
89 xor $nlo,$nlo
90 xor $nhi,$nhi
91 mov `&LB("$Zlo")`,`&LB("$nlo")`
92 mov `&LB("$Zlo")`,`&LB("$nhi")`
93 shl \$4,`&LB("$nlo")`
94 mov \$14,$cnt
95 mov 8($Htbl,$nlo),$Zlo
96 mov ($Htbl,$nlo),$Zhi
97 and \$0xf0,`&LB("$nhi")`
98 mov $Zlo,$rem
99 jmp .Loop$N
100
101.align 16
102.Loop$N:
103 shr \$4,$Zlo
104 and \$0xf,$rem
105 mov $Zhi,$tmp
106 mov ($inp,$cnt),`&LB("$nlo")`
107 shr \$4,$Zhi
108 xor 8($Htbl,$nhi),$Zlo
109 shl \$60,$tmp
110 xor ($Htbl,$nhi),$Zhi
111 mov `&LB("$nlo")`,`&LB("$nhi")`
112 xor ($rem_4bit,$rem,8),$Zhi
113 mov $Zlo,$rem
114 shl \$4,`&LB("$nlo")`
115 xor $tmp,$Zlo
116 dec $cnt
117 js .Lbreak$N
118
119 shr \$4,$Zlo
120 and \$0xf,$rem
121 mov $Zhi,$tmp
122 shr \$4,$Zhi
123 xor 8($Htbl,$nlo),$Zlo
124 shl \$60,$tmp
125 xor ($Htbl,$nlo),$Zhi
126 and \$0xf0,`&LB("$nhi")`
127 xor ($rem_4bit,$rem,8),$Zhi
128 mov $Zlo,$rem
129 xor $tmp,$Zlo
130 jmp .Loop$N
131
132.align 16
133.Lbreak$N:
134 shr \$4,$Zlo
135 and \$0xf,$rem
136 mov $Zhi,$tmp
137 shr \$4,$Zhi
138 xor 8($Htbl,$nlo),$Zlo
139 shl \$60,$tmp
140 xor ($Htbl,$nlo),$Zhi
141 and \$0xf0,`&LB("$nhi")`
142 xor ($rem_4bit,$rem,8),$Zhi
143 mov $Zlo,$rem
144 xor $tmp,$Zlo
145
146 shr \$4,$Zlo
147 and \$0xf,$rem
148 mov $Zhi,$tmp
149 shr \$4,$Zhi
150 xor 8($Htbl,$nhi),$Zlo
151 shl \$60,$tmp
152 xor ($Htbl,$nhi),$Zhi
153 xor $tmp,$Zlo
154 xor ($rem_4bit,$rem,8),$Zhi
155
156 bswap $Zlo
157 bswap $Zhi
158___
159}}
160
161$code=<<___;
162.text
163
164.globl gcm_gmult_4bit
165.type gcm_gmult_4bit,\@function,2
166.align 16
167gcm_gmult_4bit:
168 push %rbx
169 push %rbp # %rbp and %r12 are pushed exclusively in
170 push %r12 # order to reuse Win64 exception handler...
171.Lgmult_prologue:
172
173 movzb 15($Xi),$Zlo
174 lea .Lrem_4bit(%rip),$rem_4bit
175___
176 &loop ($Xi);
177$code.=<<___;
178 mov $Zlo,8($Xi)
179 mov $Zhi,($Xi)
180
181 mov 16(%rsp),%rbx
182 lea 24(%rsp),%rsp
183.Lgmult_epilogue:
184 ret
185.size gcm_gmult_4bit,.-gcm_gmult_4bit
186___
187
188# per-function register layout
189$inp="%rdx";
190$len="%rcx";
191$rem_8bit=$rem_4bit;
192
193$code.=<<___;
194.globl gcm_ghash_4bit
195.type gcm_ghash_4bit,\@function,4
196.align 16
197gcm_ghash_4bit:
198 push %rbx
199 push %rbp
200 push %r12
201 push %r13
202 push %r14
203 push %r15
204 sub \$280,%rsp
205.Lghash_prologue:
206 mov $inp,%r14 # reassign couple of args
207 mov $len,%r15
208___
209{ my $inp="%r14";
210 my $dat="%edx";
211 my $len="%r15";
212 my @nhi=("%ebx","%ecx");
213 my @rem=("%r12","%r13");
214 my $Hshr4="%rbp";
215
216 &sub ($Htbl,-128); # size optimization
217 &lea ($Hshr4,"16+128(%rsp)");
218 { my @lo =($nlo,$nhi);
219 my @hi =($Zlo,$Zhi);
220
221 &xor ($dat,$dat);
222 for ($i=0,$j=-2;$i<18;$i++,$j++) {
223 &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
224 &or ($lo[0],$tmp) if ($i>1);
225 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
226 &shr ($lo[1],4) if ($i>0 && $i<17);
227 &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
228 &shr ($hi[1],4) if ($i>0 && $i<17);
229 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
230 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
231 &shl (&LB($dat),4) if ($i>0 && $i<17);
232 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
233 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
234 &shl ($tmp,60) if ($i>0 && $i<17);
235
236 push (@lo,shift(@lo));
237 push (@hi,shift(@hi));
238 }
239 }
240 &add ($Htbl,-128);
241 &mov ($Zlo,"8($Xi)");
242 &mov ($Zhi,"0($Xi)");
243 &add ($len,$inp); # pointer to the end of data
244 &lea ($rem_8bit,".Lrem_8bit(%rip)");
245 &jmp (".Louter_loop");
246
247$code.=".align 16\n.Louter_loop:\n";
248 &xor ($Zhi,"($inp)");
249 &mov ("%rdx","8($inp)");
250 &lea ($inp,"16($inp)");
251 &xor ("%rdx",$Zlo);
252 &mov ("($Xi)",$Zhi);
253 &mov ("8($Xi)","%rdx");
254 &shr ("%rdx",32);
255
256 &xor ($nlo,$nlo);
257 &rol ($dat,8);
258 &mov (&LB($nlo),&LB($dat));
259 &movz ($nhi[0],&LB($dat));
260 &shl (&LB($nlo),4);
261 &shr ($nhi[0],4);
262
263 for ($j=11,$i=0;$i<15;$i++) {
264 &rol ($dat,8);
265 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
266 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
267 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
268 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
269
270 &mov (&LB($nlo),&LB($dat));
271 &xor ($Zlo,$tmp) if ($i>0);
272 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
273
274 &movz ($nhi[1],&LB($dat));
275 &shl (&LB($nlo),4);
276 &movzb ($rem[0],"(%rsp,$nhi[0])");
277
278 &shr ($nhi[1],4) if ($i<14);
279 &and ($nhi[1],0xf0) if ($i==14);
280 &shl ($rem[1],48) if ($i>0);
281 &xor ($rem[0],$Zlo);
282
283 &mov ($tmp,$Zhi);
284 &xor ($Zhi,$rem[1]) if ($i>0);
285 &shr ($Zlo,8);
286
287 &movz ($rem[0],&LB($rem[0]));
288 &mov ($dat,"$j($Xi)") if (--$j%4==0);
289 &shr ($Zhi,8);
290
291 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
292 &shl ($tmp,56);
293 &xor ($Zhi,"($Hshr4,$nhi[0],8)");
294
295 unshift (@nhi,pop(@nhi)); # "rotate" registers
296 unshift (@rem,pop(@rem));
297 }
298 &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
299 &xor ($Zlo,"8($Htbl,$nlo)");
300 &xor ($Zhi,"($Htbl,$nlo)");
301
302 &shl ($rem[1],48);
303 &xor ($Zlo,$tmp);
304
305 &xor ($Zhi,$rem[1]);
306 &movz ($rem[0],&LB($Zlo));
307 &shr ($Zlo,4);
308
309 &mov ($tmp,$Zhi);
310 &shl (&LB($rem[0]),4);
311 &shr ($Zhi,4);
312
313 &xor ($Zlo,"8($Htbl,$nhi[0])");
314 &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
315 &shl ($tmp,60);
316
317 &xor ($Zhi,"($Htbl,$nhi[0])");
318 &xor ($Zlo,$tmp);
319 &shl ($rem[0],48);
320
321 &bswap ($Zlo);
322 &xor ($Zhi,$rem[0]);
323
324 &bswap ($Zhi);
325 &cmp ($inp,$len);
326 &jb (".Louter_loop");
327}
328$code.=<<___;
329 mov $Zlo,8($Xi)
330 mov $Zhi,($Xi)
331
332 lea 280(%rsp),%rsi
333 mov 0(%rsi),%r15
334 mov 8(%rsi),%r14
335 mov 16(%rsi),%r13
336 mov 24(%rsi),%r12
337 mov 32(%rsi),%rbp
338 mov 40(%rsi),%rbx
339 lea 48(%rsi),%rsp
340.Lghash_epilogue:
341 ret
342.size gcm_ghash_4bit,.-gcm_ghash_4bit
343___
344
345######################################################################
346# PCLMULQDQ version.
347
348@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
349 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
350
351($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
352($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
353
354sub clmul64x64_T2 { # minimal register pressure
355my ($Xhi,$Xi,$Hkey,$modulo)=@_;
356
357$code.=<<___ if (!defined($modulo));
358 movdqa $Xi,$Xhi #
359 pshufd \$0b01001110,$Xi,$T1
360 pshufd \$0b01001110,$Hkey,$T2
361 pxor $Xi,$T1 #
362 pxor $Hkey,$T2
363___
364$code.=<<___;
365 pclmulqdq \$0x00,$Hkey,$Xi #######
366 pclmulqdq \$0x11,$Hkey,$Xhi #######
367 pclmulqdq \$0x00,$T2,$T1 #######
368 pxor $Xi,$T1 #
369 pxor $Xhi,$T1 #
370
371 movdqa $T1,$T2 #
372 psrldq \$8,$T1
373 pslldq \$8,$T2 #
374 pxor $T1,$Xhi
375 pxor $T2,$Xi #
376___
377}
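
clmul64x64_T2 above is a Karatsuba-style split: writing H = H1*x^64 + H0 and X = X1*x^64 + X0, only three PCLMULQDQ operations are needed, because in GF(2) the middle term satisfies (H1+H0)*(X1+X0) + H1*X1 + H0*X0 = H1*X0 + H0*X1. A hedged C-intrinsics rendering of the same idea (a hypothetical helper, not output of this script) looks roughly like:

    #include <emmintrin.h>
    #include <wmmintrin.h>  /* _mm_clmulepi64_si128 (PCLMULQDQ) */

    /* 128x128 -> 256-bit carry-less multiply with three PCLMULQDQ. */
    static void
    clmul128(__m128i X, __m128i H, __m128i *lo, __m128i *hi)
    {
            __m128i tx = _mm_xor_si128(_mm_shuffle_epi32(X, 0x4e), X); /* X1^X0 */
            __m128i th = _mm_xor_si128(_mm_shuffle_epi32(H, 0x4e), H); /* H1^H0 */
            __m128i l = _mm_clmulepi64_si128(X, H, 0x00);   /* X0*H0 */
            __m128i h = _mm_clmulepi64_si128(X, H, 0x11);   /* X1*H1 */
            __m128i m = _mm_clmulepi64_si128(tx, th, 0x00); /* (X1^X0)*(H1^H0) */

            m = _mm_xor_si128(m, _mm_xor_si128(l, h));      /* X1*H0 ^ X0*H1 */
            *lo = _mm_xor_si128(l, _mm_slli_si128(m, 8));   /* bits 0..127 */
            *hi = _mm_xor_si128(h, _mm_srli_si128(m, 8));   /* bits 128..255 */
    }
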
378
379sub reduction_alg9 { # 17/13 times faster than Intel version
380my ($Xhi,$Xi) = @_;
381
382$code.=<<___;
383 # 1st phase
384 movdqa $Xi,$T1 #
385 psllq \$1,$Xi
386 pxor $T1,$Xi #
387 psllq \$5,$Xi #
388 pxor $T1,$Xi #
389 psllq \$57,$Xi #
390 movdqa $Xi,$T2 #
391 pslldq \$8,$Xi
392 psrldq \$8,$T2 #
393 pxor $T1,$Xi
394 pxor $T2,$Xhi #
395
396 # 2nd phase
397 movdqa $Xi,$T2
398 psrlq \$5,$Xi
399 pxor $T2,$Xi #
400 psrlq \$1,$Xi #
401 pxor $T2,$Xi #
402 pxor $Xhi,$T2
403 psrlq \$1,$Xi #
404 pxor $T2,$Xi #
405___
406}
407
408{ my ($Htbl,$Xip)=@_4args;
409
410$code.=<<___;
411.globl gcm_init_clmul
412.type gcm_init_clmul,\@abi-omnipotent
413.align 16
414gcm_init_clmul:
415 movdqu ($Xip),$Hkey
416 pshufd \$0b01001110,$Hkey,$Hkey # dword swap
417
418 # <<1 twist
419 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
420 movdqa $Hkey,$T1
421 psllq \$1,$Hkey
422 pxor $T3,$T3 #
423 psrlq \$63,$T1
424 pcmpgtd $T2,$T3 # broadcast carry bit
425 pslldq \$8,$T1
426 por $T1,$Hkey # H<<=1
427
428 # magic reduction
429 pand .L0x1c2_polynomial(%rip),$T3
430 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
431
432 # calculate H^2
433 movdqa $Hkey,$Xi
434___
435 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
436 &reduction_alg9 ($Xhi,$Xi);
437$code.=<<___;
438 movdqu $Hkey,($Htbl) # save H
439 movdqu $Xi,16($Htbl) # save H^2
440 ret
441.size gcm_init_clmul,.-gcm_init_clmul
442___
443}
444
445{ my ($Xip,$Htbl)=@_4args;
446
447$code.=<<___;
448.globl gcm_gmult_clmul
449.type gcm_gmult_clmul,\@abi-omnipotent
450.align 16
451gcm_gmult_clmul:
452 movdqu ($Xip),$Xi
453 movdqa .Lbswap_mask(%rip),$T3
454 movdqu ($Htbl),$Hkey
455 pshufb $T3,$Xi
456___
457 &clmul64x64_T2 ($Xhi,$Xi,$Hkey);
458 &reduction_alg9 ($Xhi,$Xi);
459$code.=<<___;
460 pshufb $T3,$Xi
461 movdqu $Xi,($Xip)
462 ret
463.size gcm_gmult_clmul,.-gcm_gmult_clmul
464___
465}
466
467{ my ($Xip,$Htbl,$inp,$len)=@_4args;
468 my $Xn="%xmm6";
469 my $Xhn="%xmm7";
470 my $Hkey2="%xmm8";
471 my $T1n="%xmm9";
472 my $T2n="%xmm10";
473
474$code.=<<___;
475.globl gcm_ghash_clmul
476.type gcm_ghash_clmul,\@abi-omnipotent
477.align 16
478gcm_ghash_clmul:
479___
480$code.=<<___ if ($win64);
481.LSEH_begin_gcm_ghash_clmul:
482 # I can't trust assembler to use specific encoding:-(
483 .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
484 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
485 .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
486 .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
487 .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
488 .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
489___
490$code.=<<___;
491 movdqa .Lbswap_mask(%rip),$T3
492
493 movdqu ($Xip),$Xi
494 movdqu ($Htbl),$Hkey
495 pshufb $T3,$Xi
496
497 sub \$0x10,$len
498 jz .Lodd_tail
499
500 movdqu 16($Htbl),$Hkey2
501 #######
502 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
503 # [(H*Ii+1) + (H*Xi+1)] mod P =
504 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
505 #
506 movdqu ($inp),$T1 # Ii
507 movdqu 16($inp),$Xn # Ii+1
508 pshufb $T3,$T1
509 pshufb $T3,$Xn
510 pxor $T1,$Xi # Ii+Xi
511___
512 &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
513$code.=<<___;
514 movdqa $Xi,$Xhi #
515 pshufd \$0b01001110,$Xi,$T1
516 pshufd \$0b01001110,$Hkey2,$T2
517 pxor $Xi,$T1 #
518 pxor $Hkey2,$T2
519
520 lea 32($inp),$inp # i+=2
521 sub \$0x20,$len
522 jbe .Leven_tail
523
524.Lmod_loop:
525___
526 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
527$code.=<<___;
528 movdqu ($inp),$T1 # Ii
529 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
530 pxor $Xhn,$Xhi
531
532 movdqu 16($inp),$Xn # Ii+1
533 pshufb $T3,$T1
534 pshufb $T3,$Xn
535
536 movdqa $Xn,$Xhn #
537 pshufd \$0b01001110,$Xn,$T1n
538 pshufd \$0b01001110,$Hkey,$T2n
539 pxor $Xn,$T1n #
540 pxor $Hkey,$T2n
541 pxor $T1,$Xhi # "Ii+Xi", consume early
542
543 movdqa $Xi,$T1 # 1st phase
544 psllq \$1,$Xi
545 pxor $T1,$Xi #
546 psllq \$5,$Xi #
547 pxor $T1,$Xi #
548 pclmulqdq \$0x00,$Hkey,$Xn #######
549 psllq \$57,$Xi #
550 movdqa $Xi,$T2 #
551 pslldq \$8,$Xi
552 psrldq \$8,$T2 #
553 pxor $T1,$Xi
554 pxor $T2,$Xhi #
555
556 pclmulqdq \$0x11,$Hkey,$Xhn #######
557 movdqa $Xi,$T2 # 2nd phase
558 psrlq \$5,$Xi
559 pxor $T2,$Xi #
560 psrlq \$1,$Xi #
561 pxor $T2,$Xi #
562 pxor $Xhi,$T2
563 psrlq \$1,$Xi #
564 pxor $T2,$Xi #
565
566 pclmulqdq \$0x00,$T2n,$T1n #######
567 movdqa $Xi,$Xhi #
568 pshufd \$0b01001110,$Xi,$T1
569 pshufd \$0b01001110,$Hkey2,$T2
570 pxor $Xi,$T1 #
571 pxor $Hkey2,$T2
572
573 pxor $Xn,$T1n #
574 pxor $Xhn,$T1n #
575 movdqa $T1n,$T2n #
576 psrldq \$8,$T1n
577 pslldq \$8,$T2n #
578 pxor $T1n,$Xhn
579 pxor $T2n,$Xn #
580
581 lea 32($inp),$inp
582 sub \$0x20,$len
583 ja .Lmod_loop
584
585.Leven_tail:
586___
587 &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
588$code.=<<___;
589 pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
590 pxor $Xhn,$Xhi
591___
592 &reduction_alg9 ($Xhi,$Xi);
593$code.=<<___;
594 test $len,$len
595 jnz .Ldone
596
597.Lodd_tail:
598 movdqu ($inp),$T1 # Ii
599 pshufb $T3,$T1
600 pxor $T1,$Xi # Ii+Xi
601___
602 &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
603 &reduction_alg9 ($Xhi,$Xi);
604$code.=<<___;
605.Ldone:
606 pshufb $T3,$Xi
607 movdqu $Xi,($Xip)
608___
609$code.=<<___ if ($win64);
610 movaps (%rsp),%xmm6
611 movaps 0x10(%rsp),%xmm7
612 movaps 0x20(%rsp),%xmm8
613 movaps 0x30(%rsp),%xmm9
614 movaps 0x40(%rsp),%xmm10
615 add \$0x58,%rsp
616___
617$code.=<<___;
618 ret
619.LSEH_end_gcm_ghash_clmul:
620.size gcm_ghash_clmul,.-gcm_ghash_clmul
621___
622}
623
624$code.=<<___;
625.align 64
626.Lbswap_mask:
627 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
628.L0x1c2_polynomial:
629 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
630.align 64
631.type .Lrem_4bit,\@object
632.Lrem_4bit:
633 .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
634 .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
635 .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
636 .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
637.type .Lrem_8bit,\@object
638.Lrem_8bit:
639 .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
640 .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
641 .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
642 .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
643 .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
644 .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
645 .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
646 .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
647 .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
648 .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
649 .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
650 .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
651 .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
652 .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
653 .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
654 .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
655 .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
656 .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
657 .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
658 .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
659 .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
660 .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
661 .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
662 .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
663 .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
664 .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
665 .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
666 .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
667 .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
668 .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
669 .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
670 .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
671
672.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
673.align 64
674___
675
676# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
677# CONTEXT *context,DISPATCHER_CONTEXT *disp)
678if ($win64) {
679$rec="%rcx";
680$frame="%rdx";
681$context="%r8";
682$disp="%r9";
683
684$code.=<<___;
685.extern __imp_RtlVirtualUnwind
686.type se_handler,\@abi-omnipotent
687.align 16
688se_handler:
689 push %rsi
690 push %rdi
691 push %rbx
692 push %rbp
693 push %r12
694 push %r13
695 push %r14
696 push %r15
697 pushfq
698 sub \$64,%rsp
699
700 mov 120($context),%rax # pull context->Rax
701 mov 248($context),%rbx # pull context->Rip
702
703 mov 8($disp),%rsi # disp->ImageBase
704 mov 56($disp),%r11 # disp->HandlerData
705
706 mov 0(%r11),%r10d # HandlerData[0]
707 lea (%rsi,%r10),%r10 # prologue label
708 cmp %r10,%rbx # context->Rip<prologue label
709 jb .Lin_prologue
710
711 mov 152($context),%rax # pull context->Rsp
712
713 mov 4(%r11),%r10d # HandlerData[1]
714 lea (%rsi,%r10),%r10 # epilogue label
715 cmp %r10,%rbx # context->Rip>=epilogue label
716 jae .Lin_prologue
717
718 lea 24(%rax),%rax # adjust "rsp"
719
720 mov -8(%rax),%rbx
721 mov -16(%rax),%rbp
722 mov -24(%rax),%r12
723 mov %rbx,144($context) # restore context->Rbx
724 mov %rbp,160($context) # restore context->Rbp
725 mov %r12,216($context) # restore context->R12
726
727.Lin_prologue:
728 mov 8(%rax),%rdi
729 mov 16(%rax),%rsi
730 mov %rax,152($context) # restore context->Rsp
731 mov %rsi,168($context) # restore context->Rsi
732 mov %rdi,176($context) # restore context->Rdi
733
734 mov 40($disp),%rdi # disp->ContextRecord
735 mov $context,%rsi # context
736 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
737 .long 0xa548f3fc # cld; rep movsq
738
739 mov $disp,%rsi
740 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
741 mov 8(%rsi),%rdx # arg2, disp->ImageBase
742 mov 0(%rsi),%r8 # arg3, disp->ControlPc
743 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
744 mov 40(%rsi),%r10 # disp->ContextRecord
745 lea 56(%rsi),%r11 # &disp->HandlerData
746 lea 24(%rsi),%r12 # &disp->EstablisherFrame
747 mov %r10,32(%rsp) # arg5
748 mov %r11,40(%rsp) # arg6
749 mov %r12,48(%rsp) # arg7
750 mov %rcx,56(%rsp) # arg8, (NULL)
751 call *__imp_RtlVirtualUnwind(%rip)
752
753 mov \$1,%eax # ExceptionContinueSearch
754 add \$64,%rsp
755 popfq
756 pop %r15
757 pop %r14
758 pop %r13
759 pop %r12
760 pop %rbp
761 pop %rbx
762 pop %rdi
763 pop %rsi
764 ret
765.size se_handler,.-se_handler
766
767.section .pdata
768.align 4
769 .rva .LSEH_begin_gcm_gmult_4bit
770 .rva .LSEH_end_gcm_gmult_4bit
771 .rva .LSEH_info_gcm_gmult_4bit
772
773 .rva .LSEH_begin_gcm_ghash_4bit
774 .rva .LSEH_end_gcm_ghash_4bit
775 .rva .LSEH_info_gcm_ghash_4bit
776
777 .rva .LSEH_begin_gcm_ghash_clmul
778 .rva .LSEH_end_gcm_ghash_clmul
779 .rva .LSEH_info_gcm_ghash_clmul
780
781.section .xdata
782.align 8
783.LSEH_info_gcm_gmult_4bit:
784 .byte 9,0,0,0
785 .rva se_handler
786 .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
787.LSEH_info_gcm_ghash_4bit:
788 .byte 9,0,0,0
789 .rva se_handler
790 .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
791.LSEH_info_gcm_ghash_clmul:
792 .byte 0x01,0x1f,0x0b,0x00
793 .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
794 .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
795 .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
796 .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
797 .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
798 .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
799___
800}
801
802$code =~ s/\`([^\`]*)\`/eval($1)/gem;
803
804print $code;
805
806close STDOUT;
diff --git a/src/lib/libcrypto/modes/cbc128.c b/src/lib/libcrypto/modes/cbc128.c
deleted file mode 100644
index fe45103b0c..0000000000
--- a/src/lib/libcrypto/modes/cbc128.c
+++ /dev/null
@@ -1,202 +0,0 @@
1/* $OpenBSD: cbc128.c,v 1.4 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61
62#undef STRICT_ALIGNMENT
63#ifdef __STRICT_ALIGNMENT
64#define STRICT_ALIGNMENT 1
65#else
66#define STRICT_ALIGNMENT 0
67#endif
68
69void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
70 size_t len, const void *key,
71 unsigned char ivec[16], block128_f block)
72{
73 size_t n;
74 const unsigned char *iv = ivec;
75
76#if !defined(OPENSSL_SMALL_FOOTPRINT)
77 if (STRICT_ALIGNMENT &&
78 ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
79 while (len>=16) {
80 for(n=0; n<16; ++n)
81 out[n] = in[n] ^ iv[n];
82 (*block)(out, out, key);
83 iv = out;
84 len -= 16;
85 in += 16;
86 out += 16;
87 }
88 } else {
89 while (len>=16) {
90 for(n=0; n<16; n+=sizeof(size_t))
91 *(size_t*)(out+n) =
92 *(size_t*)(in+n) ^ *(size_t*)(iv+n);
93 (*block)(out, out, key);
94 iv = out;
95 len -= 16;
96 in += 16;
97 out += 16;
98 }
99 }
100#endif
101 while (len) {
102 for(n=0; n<16 && n<len; ++n)
103 out[n] = in[n] ^ iv[n];
104 for(; n<16; ++n)
105 out[n] = iv[n];
106 (*block)(out, out, key);
107 iv = out;
108 if (len<=16) break;
109 len -= 16;
110 in += 16;
111 out += 16;
112 }
113 memcpy(ivec,iv,16);
114}
115
116void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
117 size_t len, const void *key,
118 unsigned char ivec[16], block128_f block)
119{
120 size_t n;
121 union { size_t t[16/sizeof(size_t)]; unsigned char c[16]; } tmp;
122
123#if !defined(OPENSSL_SMALL_FOOTPRINT)
124 if (in != out) {
125 const unsigned char *iv = ivec;
126
127 if (STRICT_ALIGNMENT &&
128 ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
129 while (len>=16) {
130 (*block)(in, out, key);
131 for(n=0; n<16; ++n)
132 out[n] ^= iv[n];
133 iv = in;
134 len -= 16;
135 in += 16;
136 out += 16;
137 }
138 } else if (16%sizeof(size_t) == 0) { /* always true */
139 while (len>=16) {
140 size_t *out_t=(size_t *)out, *iv_t=(size_t *)iv;
141
142 (*block)(in, out, key);
143 for(n=0; n<16/sizeof(size_t); n++)
144 out_t[n] ^= iv_t[n];
145 iv = in;
146 len -= 16;
147 in += 16;
148 out += 16;
149 }
150 }
151 memcpy(ivec,iv,16);
152 } else {
153 if (STRICT_ALIGNMENT &&
154 ((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0) {
155 unsigned char c;
156 while (len>=16) {
157 (*block)(in, tmp.c, key);
158 for(n=0; n<16; ++n) {
159 c = in[n];
160 out[n] = tmp.c[n] ^ ivec[n];
161 ivec[n] = c;
162 }
163 len -= 16;
164 in += 16;
165 out += 16;
166 }
167 } else if (16%sizeof(size_t) == 0) { /* always true */
168 while (len>=16) {
169 size_t c, *out_t=(size_t *)out, *ivec_t=(size_t *)ivec;
170 const size_t *in_t=(const size_t *)in;
171
172 (*block)(in, tmp.c, key);
173 for(n=0; n<16/sizeof(size_t); n++) {
174 c = in_t[n];
175 out_t[n] = tmp.t[n] ^ ivec_t[n];
176 ivec_t[n] = c;
177 }
178 len -= 16;
179 in += 16;
180 out += 16;
181 }
182 }
183 }
184#endif
185 while (len) {
186 unsigned char c;
187 (*block)(in, tmp.c, key);
188 for(n=0; n<16 && n<len; ++n) {
189 c = in[n];
190 out[n] = tmp.c[n] ^ ivec[n];
191 ivec[n] = c;
192 }
193 if (len<=16) {
194 for (; n<16; ++n)
195 ivec[n] = in[n];
196 break;
197 }
198 len -= 16;
199 in += 16;
200 out += 16;
201 }
202}
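
CRYPTO_cbc128_encrypt and CRYPTO_cbc128_decrypt implement only the chaining (C[i] = E_K(P[i] ^ C[i-1]) with C[-1] = ivec); the cipher is supplied as a block128_f pointer and ivec is updated in place, so consecutive calls chain naturally. A hedged usage sketch with AES as the block function (illustrative only, not code from this tree):

    #include <openssl/aes.h>
    #include <openssl/modes.h>

    /* CBC-encrypt len bytes (a multiple of 16 in the usual case); iv is
     * overwritten with the last ciphertext block so the call can be repeated. */
    static int
    cbc_encrypt_example(const unsigned char *pt, unsigned char *ct, size_t len,
        const unsigned char key[16], unsigned char iv[16])
    {
            AES_KEY ks;

            if (AES_set_encrypt_key(key, 128, &ks) != 0)
                    return -1;
            CRYPTO_cbc128_encrypt(pt, ct, len, &ks, iv, (block128_f)AES_encrypt);
            return 0;
    }
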
diff --git a/src/lib/libcrypto/modes/ccm128.c b/src/lib/libcrypto/modes/ccm128.c
deleted file mode 100644
index 58cc4f44c6..0000000000
--- a/src/lib/libcrypto/modes/ccm128.c
+++ /dev/null
@@ -1,441 +0,0 @@
1/* $OpenBSD: ccm128.c,v 1.4 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 */
50
51#include <openssl/crypto.h>
52#include "modes_lcl.h"
53#include <string.h>
54
55#ifndef MODES_DEBUG
56# ifndef NDEBUG
57# define NDEBUG
58# endif
59#endif
60
61/* First you set up the M and L parameters and pass the key schedule.
62 * This is called once per session setup... */
63void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
64 unsigned int M,unsigned int L,void *key,block128_f block)
65{
66 memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
67 ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
68 ctx->blocks = 0;
69 ctx->block = block;
70 ctx->key = key;
71}
72
73/* !!! Following interfaces are to be called *once* per packet !!! */
74
75/* Then you set up the per-message nonce and pass the length of the message */
76int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
77 const unsigned char *nonce,size_t nlen,size_t mlen)
78{
79 unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */
80
81 if (nlen<(14-L)) return -1; /* nonce is too short */
82
83 if (sizeof(mlen)==8 && L>=3) {
84 ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8)));
85 ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8)));
86 ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
87 ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
88 }
89 else
90 ctx->nonce.u[1] = 0;
91
92 ctx->nonce.c[12] = (u8)(mlen>>24);
93 ctx->nonce.c[13] = (u8)(mlen>>16);
94 ctx->nonce.c[14] = (u8)(mlen>>8);
95 ctx->nonce.c[15] = (u8)mlen;
96
97 ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */
98 memcpy(&ctx->nonce.c[1],nonce,14-L);
99
100 return 0;
101}
102
103/* Then you pass the additional authentication data; this is optional */
104void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
105 const unsigned char *aad,size_t alen)
106{ unsigned int i;
107 block128_f block = ctx->block;
108
109 if (alen==0) return;
110
111 ctx->nonce.c[0] |= 0x40; /* set Adata flag */
112 (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
113 ctx->blocks++;
114
115 if (alen<(0x10000-0x100)) {
116 ctx->cmac.c[0] ^= (u8)(alen>>8);
117 ctx->cmac.c[1] ^= (u8)alen;
118 i=2;
119 }
120 else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
121 ctx->cmac.c[0] ^= 0xFF;
122 ctx->cmac.c[1] ^= 0xFF;
123 ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
124 ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
125 ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
126 ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
127 ctx->cmac.c[6] ^= (u8)(alen>>24);
128 ctx->cmac.c[7] ^= (u8)(alen>>16);
129 ctx->cmac.c[8] ^= (u8)(alen>>8);
130 ctx->cmac.c[9] ^= (u8)alen;
131 i=10;
132 }
133 else {
134 ctx->cmac.c[0] ^= 0xFF;
135 ctx->cmac.c[1] ^= 0xFE;
136 ctx->cmac.c[2] ^= (u8)(alen>>24);
137 ctx->cmac.c[3] ^= (u8)(alen>>16);
138 ctx->cmac.c[4] ^= (u8)(alen>>8);
139 ctx->cmac.c[5] ^= (u8)alen;
140 i=6;
141 }
142
143 do {
144 for(;i<16 && alen;++i,++aad,--alen)
145 ctx->cmac.c[i] ^= *aad;
146 (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
147 ctx->blocks++;
148 i=0;
149 } while (alen);
150}
151
152/* Finally you encrypt or decrypt the message */
153
154/* The counter part of the nonce may not be larger than L*8 bits;
155 * L is not larger than 8, therefore a 64-bit counter suffices... */
156static void ctr64_inc(unsigned char *counter) {
157 unsigned int n=8;
158 u8 c;
159
160 counter += 8;
161 do {
162 --n;
163 c = counter[n];
164 ++c;
165 counter[n] = c;
166 if (c) return;
167 } while (n);
168}
169
170int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
171 const unsigned char *inp, unsigned char *out,
172 size_t len)
173{
174 size_t n;
175 unsigned int i,L;
176 unsigned char flags0 = ctx->nonce.c[0];
177 block128_f block = ctx->block;
178 void * key = ctx->key;
179 union { u64 u[2]; u8 c[16]; } scratch;
180
181 if (!(flags0&0x40))
182 (*block)(ctx->nonce.c,ctx->cmac.c,key),
183 ctx->blocks++;
184
185 ctx->nonce.c[0] = L = flags0&7;
186 for (n=0,i=15-L;i<15;++i) {
187 n |= ctx->nonce.c[i];
188 ctx->nonce.c[i]=0;
189 n <<= 8;
190 }
191 n |= ctx->nonce.c[15]; /* reconstructed length */
192 ctx->nonce.c[15]=1;
193
194 if (n!=len) return -1; /* length mismatch */
195
196 ctx->blocks += ((len+15)>>3)|1;
197 if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
198
199 while (len>=16) {
200#ifdef __STRICT_ALIGNMENT
201 union { u64 u[2]; u8 c[16]; } temp;
202
203 memcpy (temp.c,inp,16);
204 ctx->cmac.u[0] ^= temp.u[0];
205 ctx->cmac.u[1] ^= temp.u[1];
206#else
207 ctx->cmac.u[0] ^= ((u64*)inp)[0];
208 ctx->cmac.u[1] ^= ((u64*)inp)[1];
209#endif
210 (*block)(ctx->cmac.c,ctx->cmac.c,key);
211 (*block)(ctx->nonce.c,scratch.c,key);
212 ctr64_inc(ctx->nonce.c);
213#ifdef __STRICT_ALIGNMENT
214 temp.u[0] ^= scratch.u[0];
215 temp.u[1] ^= scratch.u[1];
216 memcpy(out,temp.c,16);
217#else
218 ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
219 ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
220#endif
221 inp += 16;
222 out += 16;
223 len -= 16;
224 }
225
226 if (len) {
227 for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
228 (*block)(ctx->cmac.c,ctx->cmac.c,key);
229 (*block)(ctx->nonce.c,scratch.c,key);
230 for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
231 }
232
233 for (i=15-L;i<16;++i)
234 ctx->nonce.c[i]=0;
235
236 (*block)(ctx->nonce.c,scratch.c,key);
237 ctx->cmac.u[0] ^= scratch.u[0];
238 ctx->cmac.u[1] ^= scratch.u[1];
239
240 ctx->nonce.c[0] = flags0;
241
242 return 0;
243}
244
245int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
246 const unsigned char *inp, unsigned char *out,
247 size_t len)
248{
249 size_t n;
250 unsigned int i,L;
251 unsigned char flags0 = ctx->nonce.c[0];
252 block128_f block = ctx->block;
253 void * key = ctx->key;
254 union { u64 u[2]; u8 c[16]; } scratch;
255
256 if (!(flags0&0x40))
257 (*block)(ctx->nonce.c,ctx->cmac.c,key);
258
259 ctx->nonce.c[0] = L = flags0&7;
260 for (n=0,i=15-L;i<15;++i) {
261 n |= ctx->nonce.c[i];
262 ctx->nonce.c[i]=0;
263 n <<= 8;
264 }
265 n |= ctx->nonce.c[15]; /* reconstructed length */
266 ctx->nonce.c[15]=1;
267
268 if (n!=len) return -1;
269
270 while (len>=16) {
271#ifdef __STRICT_ALIGNMENT
272 union { u64 u[2]; u8 c[16]; } temp;
273#endif
274 (*block)(ctx->nonce.c,scratch.c,key);
275 ctr64_inc(ctx->nonce.c);
276#ifdef __STRICT_ALIGNMENT
277 memcpy (temp.c,inp,16);
278 ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
279 ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
280 memcpy (out,scratch.c,16);
281#else
282 ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
283 ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
284#endif
285 (*block)(ctx->cmac.c,ctx->cmac.c,key);
286
287 inp += 16;
288 out += 16;
289 len -= 16;
290 }
291
292 if (len) {
293 (*block)(ctx->nonce.c,scratch.c,key);
294 for (i=0; i<len; ++i)
295 ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
296 (*block)(ctx->cmac.c,ctx->cmac.c,key);
297 }
298
299 for (i=15-L;i<16;++i)
300 ctx->nonce.c[i]=0;
301
302 (*block)(ctx->nonce.c,scratch.c,key);
303 ctx->cmac.u[0] ^= scratch.u[0];
304 ctx->cmac.u[1] ^= scratch.u[1];
305
306 ctx->nonce.c[0] = flags0;
307
308 return 0;
309}
310
311static void ctr64_add (unsigned char *counter,size_t inc)
312{ size_t n=8, val=0;
313
314 counter += 8;
315 do {
316 --n;
317 val += counter[n] + (inc&0xff);
318 counter[n] = (unsigned char)val;
319 val >>= 8; /* carry bit */
320 inc >>= 8;
321 } while(n && (inc || val));
322}
323
324int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
325 const unsigned char *inp, unsigned char *out,
326 size_t len,ccm128_f stream)
327{
328 size_t n;
329 unsigned int i,L;
330 unsigned char flags0 = ctx->nonce.c[0];
331 block128_f block = ctx->block;
332 void * key = ctx->key;
333 union { u64 u[2]; u8 c[16]; } scratch;
334
335 if (!(flags0&0x40))
336 (*block)(ctx->nonce.c,ctx->cmac.c,key),
337 ctx->blocks++;
338
339 ctx->nonce.c[0] = L = flags0&7;
340 for (n=0,i=15-L;i<15;++i) {
341 n |= ctx->nonce.c[i];
342 ctx->nonce.c[i]=0;
343 n <<= 8;
344 }
345 n |= ctx->nonce.c[15]; /* reconstructed length */
346 ctx->nonce.c[15]=1;
347
348 if (n!=len) return -1; /* length mismatch */
349
350 ctx->blocks += ((len+15)>>3)|1;
351 if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */
352
353 if ((n=len/16)) {
354 (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
355 n *= 16;
356 inp += n;
357 out += n;
358 len -= n;
359 if (len) ctr64_add(ctx->nonce.c,n/16);
360 }
361
362 if (len) {
363 for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
364 (*block)(ctx->cmac.c,ctx->cmac.c,key);
365 (*block)(ctx->nonce.c,scratch.c,key);
366 for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
367 }
368
369 for (i=15-L;i<16;++i)
370 ctx->nonce.c[i]=0;
371
372 (*block)(ctx->nonce.c,scratch.c,key);
373 ctx->cmac.u[0] ^= scratch.u[0];
374 ctx->cmac.u[1] ^= scratch.u[1];
375
376 ctx->nonce.c[0] = flags0;
377
378 return 0;
379}
380
381int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
382 const unsigned char *inp, unsigned char *out,
383 size_t len,ccm128_f stream)
384{
385 size_t n;
386 unsigned int i,L;
387 unsigned char flags0 = ctx->nonce.c[0];
388 block128_f block = ctx->block;
389 void * key = ctx->key;
390 union { u64 u[2]; u8 c[16]; } scratch;
391
392 if (!(flags0&0x40))
393 (*block)(ctx->nonce.c,ctx->cmac.c,key);
394
395 ctx->nonce.c[0] = L = flags0&7;
396 for (n=0,i=15-L;i<15;++i) {
397 n |= ctx->nonce.c[i];
398 ctx->nonce.c[i]=0;
399 n <<= 8;
400 }
401 n |= ctx->nonce.c[15]; /* reconstructed length */
402 ctx->nonce.c[15]=1;
403
404 if (n!=len) return -1;
405
406 if ((n=len/16)) {
407 (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
408 n *= 16;
409 inp += n;
410 out += n;
411 len -= n;
412 if (len) ctr64_add(ctx->nonce.c,n/16);
413 }
414
415 if (len) {
416 (*block)(ctx->nonce.c,scratch.c,key);
417 for (i=0; i<len; ++i)
418 ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
419 (*block)(ctx->cmac.c,ctx->cmac.c,key);
420 }
421
422 for (i=15-L;i<16;++i)
423 ctx->nonce.c[i]=0;
424
425 (*block)(ctx->nonce.c,scratch.c,key);
426 ctx->cmac.u[0] ^= scratch.u[0];
427 ctx->cmac.u[1] ^= scratch.u[1];
428
429 ctx->nonce.c[0] = flags0;
430
431 return 0;
432}
433
434size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
435{ unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */
436
437 M *= 2; M += 2;
438 if (len<M) return 0;
439 memcpy(tag,ctx->cmac.c,M);
440 return M;
441}
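
Tying the per-session and per-packet interfaces above together: init is called once, then setiv, aad, encrypt (or decrypt) and tag once per packet. A hedged sealing sketch using AES as the block128_f, written as if it lived next to these files (it includes the private modes_lcl.h so that CCM128_CONTEXT is a complete type), assuming L = 2, i.e. a 13-byte nonce, and a 16-byte tag:

    #include <openssl/aes.h>
    #include "modes_lcl.h"  /* completes CCM128_CONTEXT; public modes.h leaves it opaque */

    static int
    ccm_seal_example(unsigned char *out, unsigned char tag[16],
        const unsigned char *msg, size_t mlen,
        const unsigned char *aad, size_t alen,
        const unsigned char nonce[13], const unsigned char key[16])
    {
            AES_KEY ks;
            CCM128_CONTEXT ccm;

            if (AES_set_encrypt_key(key, 128, &ks) != 0)
                    return -1;
            /* M = 16-byte tag, L = 2-byte length field, so mlen < 2^16 */
            CRYPTO_ccm128_init(&ccm, 16, 2, &ks, (block128_f)AES_encrypt);
            if (CRYPTO_ccm128_setiv(&ccm, nonce, 13, mlen) != 0)
                    return -1;      /* nonce too short */
            if (alen > 0)
                    CRYPTO_ccm128_aad(&ccm, aad, alen);
            if (CRYPTO_ccm128_encrypt(&ccm, msg, out, mlen) != 0)
                    return -1;      /* length mismatch or too much data */
            return CRYPTO_ccm128_tag(&ccm, tag, 16) == 16 ? 0 : -1;
    }
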
diff --git a/src/lib/libcrypto/modes/cfb128.c b/src/lib/libcrypto/modes/cfb128.c
deleted file mode 100644
index 8399f0c5be..0000000000
--- a/src/lib/libcrypto/modes/cfb128.c
+++ /dev/null
@@ -1,234 +0,0 @@
1/* $OpenBSD: cfb128.c,v 1.4 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61
62/* The input and output are encrypted as though 128-bit cfb mode is being
63 * used. The extra state information to record how much of the
64 * 128bit block we have used is contained in *num;
65 */
66void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
67 size_t len, const void *key,
68 unsigned char ivec[16], int *num,
69 int enc, block128_f block)
70{
71 unsigned int n;
72 size_t l = 0;
73
74 n = *num;
75
76 if (enc) {
77#if !defined(OPENSSL_SMALL_FOOTPRINT)
78 if (16%sizeof(size_t) == 0) do { /* always true actually */
79 while (n && len) {
80 *(out++) = ivec[n] ^= *(in++);
81 --len;
82 n = (n+1) % 16;
83 }
84#ifdef __STRICT_ALIGNMENT
85 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
86 break;
87#endif
88 while (len>=16) {
89 (*block)(ivec, ivec, key);
90 for (; n<16; n+=sizeof(size_t)) {
91 *(size_t*)(out+n) =
92 *(size_t*)(ivec+n) ^= *(size_t*)(in+n);
93 }
94 len -= 16;
95 out += 16;
96 in += 16;
97 n = 0;
98 }
99 if (len) {
100 (*block)(ivec, ivec, key);
101 while (len--) {
102 out[n] = ivec[n] ^= in[n];
103 ++n;
104 }
105 }
106 *num = n;
107 return;
108 } while (0);
109 /* the rest would be commonly eliminated by x86* compiler */
110#endif
111 while (l<len) {
112 if (n == 0) {
113 (*block)(ivec, ivec, key);
114 }
115 out[l] = ivec[n] ^= in[l];
116 ++l;
117 n = (n+1) % 16;
118 }
119 *num = n;
120 } else {
121#if !defined(OPENSSL_SMALL_FOOTPRINT)
122 if (16%sizeof(size_t) == 0) do { /* always true actually */
123 while (n && len) {
124 unsigned char c;
125 *(out++) = ivec[n] ^ (c = *(in++)); ivec[n] = c;
126 --len;
127 n = (n+1) % 16;
128 }
129#ifdef __STRICT_ALIGNMENT
130 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
131 break;
132#endif
133 while (len>=16) {
134 (*block)(ivec, ivec, key);
135 for (; n<16; n+=sizeof(size_t)) {
136 size_t t = *(size_t*)(in+n);
137 *(size_t*)(out+n) = *(size_t*)(ivec+n) ^ t;
138 *(size_t*)(ivec+n) = t;
139 }
140 len -= 16;
141 out += 16;
142 in += 16;
143 n = 0;
144 }
145 if (len) {
146 (*block)(ivec, ivec, key);
147 while (len--) {
148 unsigned char c;
149 out[n] = ivec[n] ^ (c = in[n]); ivec[n] = c;
150 ++n;
151 }
152 }
153 *num = n;
154 return;
155 } while (0);
156 /* the rest would be commonly eliminated by x86* compiler */
157#endif
158 while (l<len) {
159 unsigned char c;
160 if (n == 0) {
161 (*block)(ivec, ivec, key);
162 }
163 out[l] = ivec[n] ^ (c = in[l]); ivec[n] = c;
164 ++l;
165 n = (n+1) % 16;
166 }
167 *num=n;
168 }
169}
170
171/* This expects a single block of size nbits for both in and out. Note that
172 it corrupts any extra bits in the last byte of out */
173static void cfbr_encrypt_block(const unsigned char *in,unsigned char *out,
174 int nbits,const void *key,
175 unsigned char ivec[16],int enc,
176 block128_f block)
177{
178 int n,rem,num;
179 unsigned char ovec[16*2 + 1]; /* +1 because we dereference (but don't use) one byte off the end */
180
181 if (nbits<=0 || nbits>128) return;
182
183 /* fill in the first half of the new IV with the current IV */
184 memcpy(ovec,ivec,16);
185 /* construct the new IV */
186 (*block)(ivec,ivec,key);
187 num = (nbits+7)/8;
188 if (enc) /* encrypt the input */
189 for(n=0 ; n < num ; ++n)
190 out[n] = (ovec[16+n] = in[n] ^ ivec[n]);
191 else /* decrypt the input */
192 for(n=0 ; n < num ; ++n)
193 out[n] = (ovec[16+n] = in[n]) ^ ivec[n];
194 /* shift ovec left... */
195 rem = nbits%8;
196 num = nbits/8;
197 if(rem==0)
198 memcpy(ivec,ovec+num,16);
199 else
200 for(n=0 ; n < 16 ; ++n)
201 ivec[n] = ovec[n+num]<<rem | ovec[n+num+1]>>(8-rem);
202
203 /* it is not necessary to cleanse ovec, since the IV is not secret */
204}
205
206/* N.B. This expects the input to be packed, MS bit first */
207void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
208 size_t bits, const void *key,
209 unsigned char ivec[16], int *num,
210 int enc, block128_f block)
211{
212 size_t n;
213 unsigned char c[1],d[1];
214
215 for(n=0 ; n<bits ; ++n)
216 {
217 c[0]=(in[n/8]&(1 << (7-n%8))) ? 0x80 : 0;
218 cfbr_encrypt_block(c,d,1,key,ivec,enc,block);
219 out[n/8]=(out[n/8]&~(1 << (unsigned int)(7-n%8))) |
220 ((d[0]&0x80) >> (unsigned int)(n%8));
221 }
222}
223
224void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
225 size_t length, const void *key,
226 unsigned char ivec[16], int *num,
227 int enc, block128_f block)
228{
229 size_t n;
230
231 for(n=0 ; n<length ; ++n)
232 cfbr_encrypt_block(&in[n],&out[n],8,key,ivec,enc,block);
233}
234
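
As with CTR below, the CFB state (the shift register in ivec and the byte position in *num) is carried across calls, and encryption and decryption share one entry point, differing only in the enc flag. A hedged one-call sketch (illustrative only):

    #include <openssl/aes.h>
    #include <openssl/modes.h>

    /* CFB128-decrypt len bytes; pass enc = 1 to encrypt instead.
     * *num must be 0 before the first call for a given IV. */
    static void
    cfb_decrypt_example(const unsigned char *in, unsigned char *out, size_t len,
        const AES_KEY *ks, unsigned char iv[16], int *num)
    {
            CRYPTO_cfb128_encrypt(in, out, len, ks, iv, num, 0,
                (block128_f)AES_encrypt);
    }
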
diff --git a/src/lib/libcrypto/modes/ctr128.c b/src/lib/libcrypto/modes/ctr128.c
deleted file mode 100644
index 7fd0223701..0000000000
--- a/src/lib/libcrypto/modes/ctr128.c
+++ /dev/null
@@ -1,252 +0,0 @@
1/* $OpenBSD: ctr128.c,v 1.6 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61#include <assert.h>
62
63/* NOTE: the IV/counter in CTR mode is big-endian. The code itself
64 * is endian-neutral. */
65
66/* increment counter (128-bit int) by 1 */
67static void ctr128_inc(unsigned char *counter) {
68 u32 n=16;
69 u8 c;
70
71 do {
72 --n;
73 c = counter[n];
74 ++c;
75 counter[n] = c;
76 if (c) return;
77 } while (n);
78}
79
80#if !defined(OPENSSL_SMALL_FOOTPRINT)
81static void
82ctr128_inc_aligned(unsigned char *counter)
83{
84 size_t *data,c,n;
85
86 if (BYTE_ORDER == LITTLE_ENDIAN) {
87 ctr128_inc(counter);
88 return;
89 }
90
91 data = (size_t *)counter;
92 n = 16/sizeof(size_t);
93 do {
94 --n;
95 c = data[n];
96 ++c;
97 data[n] = c;
98 if (c) return;
99 } while (n);
100}
101#endif
102
103/* The input is encrypted as though 128-bit counter mode is being
104 * used. The extra state information to record how much of the
105 * 128bit block we have used is contained in *num, and the
106 * encrypted counter is kept in ecount_buf. Both *num and
107 * ecount_buf must be initialised with zeros before the first
108 * call to CRYPTO_ctr128_encrypt().
109 *
110 * This algorithm assumes that the counter is in the x lower bits
111 * of the IV (ivec), and that the application has full control over
112 * overflow and the rest of the IV. This implementation takes NO
113 * responsability for checking that the counter doesn't overflow
114 * into the rest of the IV when incremented.
115 */
116void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
117 size_t len, const void *key,
118 unsigned char ivec[16], unsigned char ecount_buf[16],
119 unsigned int *num, block128_f block)
120{
121 unsigned int n;
122 size_t l=0;
123
124 assert(*num < 16);
125
126 n = *num;
127
128#if !defined(OPENSSL_SMALL_FOOTPRINT)
129 if (16%sizeof(size_t) == 0) do { /* always true actually */
130 while (n && len) {
131 *(out++) = *(in++) ^ ecount_buf[n];
132 --len;
133 n = (n+1) % 16;
134 }
135
136#ifdef __STRICT_ALIGNMENT
137 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
138 break;
139#endif
140 while (len>=16) {
141 (*block)(ivec, ecount_buf, key);
142 ctr128_inc_aligned(ivec);
143 for (; n<16; n+=sizeof(size_t))
144 *(size_t *)(out+n) =
145 *(size_t *)(in+n) ^ *(size_t *)(ecount_buf+n);
146 len -= 16;
147 out += 16;
148 in += 16;
149 n = 0;
150 }
151 if (len) {
152 (*block)(ivec, ecount_buf, key);
153 ctr128_inc_aligned(ivec);
154 while (len--) {
155 out[n] = in[n] ^ ecount_buf[n];
156 ++n;
157 }
158 }
159 *num = n;
160 return;
161 } while(0);
162	/* the rest would commonly be eliminated by an x86* compiler */
163#endif
164 while (l<len) {
165 if (n==0) {
166 (*block)(ivec, ecount_buf, key);
167 ctr128_inc(ivec);
168 }
169 out[l] = in[l] ^ ecount_buf[n];
170 ++l;
171 n = (n+1) % 16;
172 }
173
174 *num=n;
175}
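
A minimal usage sketch (illustrative only, with placeholder key/IV arguments) of driving CRYPTO_ctr128_encrypt with AES as the underlying block cipher, assuming <openssl/aes.h> is available; note that both *num and ecount_buf start out zeroed, as required above:

	#include <string.h>

	#include <openssl/aes.h>
	#include <openssl/modes.h>

	static void
	aes_ctr_example(const unsigned char key[16], unsigned char ivec[16],
	    const unsigned char *in, unsigned char *out, size_t len)
	{
		AES_KEY aes;
		unsigned char ecount_buf[16];
		unsigned int num = 0;

		AES_set_encrypt_key(key, 128, &aes);
		/* ecount_buf and num must be zero before the first call */
		memset(ecount_buf, 0, sizeof(ecount_buf));
		CRYPTO_ctr128_encrypt(in, out, len, &aes, ivec, ecount_buf,
		    &num, (block128_f)AES_encrypt);
	}
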
176
177/* increment upper 96 bits of 128-bit counter by 1 */
178static void ctr96_inc(unsigned char *counter) {
179 u32 n=12;
180 u8 c;
181
182 do {
183 --n;
184 c = counter[n];
185 ++c;
186 counter[n] = c;
187 if (c) return;
188 } while (n);
189}
190
191void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
192 size_t len, const void *key,
193 unsigned char ivec[16], unsigned char ecount_buf[16],
194 unsigned int *num, ctr128_f func)
195{
196 unsigned int n,ctr32;
197
198 assert(*num < 16);
199
200 n = *num;
201
202 while (n && len) {
203 *(out++) = *(in++) ^ ecount_buf[n];
204 --len;
205 n = (n+1) % 16;
206 }
207
208 ctr32 = GETU32(ivec+12);
209 while (len>=16) {
210 size_t blocks = len/16;
211 /*
212 * 1<<28 is just a not-so-small yet not-so-large number...
213		 * The condition below is practically never met, but it has
214		 * to be checked for code correctness.
215 */
216 if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28))
217 blocks = (1U<<28);
218 /*
219		 * As (*func) operates on a 32-bit counter, the caller
220		 * has to handle overflow. The 'if' below detects the
221		 * overflow, which is then handled by limiting the
222		 * number of blocks to the exact overflow point...
223 */
224 ctr32 += (u32)blocks;
225 if (ctr32 < blocks) {
226 blocks -= ctr32;
227 ctr32 = 0;
228 }
229 (*func)(in,out,blocks,key,ivec);
230 /* (*ctr) does not update ivec, caller does: */
231 PUTU32(ivec+12,ctr32);
232		/* ... overflow was detected, propagate carry. */
233 if (ctr32 == 0) ctr96_inc(ivec);
234 blocks *= 16;
235 len -= blocks;
236 out += blocks;
237 in += blocks;
238 }
239 if (len) {
240 memset(ecount_buf,0,16);
241 (*func)(ecount_buf,ecount_buf,1,key,ivec);
242 ++ctr32;
243 PUTU32(ivec+12,ctr32);
244 if (ctr32 == 0) ctr96_inc(ivec);
245 while (len--) {
246 out[n] = in[n] ^ ecount_buf[n];
247 ++n;
248 }
249 }
250
251 *num=n;
252}
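
To illustrate the overflow clamp above with hypothetical numbers: starting the 32-bit counter at 0xfffffffe and requesting 5 blocks wraps ctr32 to 3, so only 2 blocks are processed before the carry is pushed into the upper 96 bits by ctr96_inc(); the remaining blocks are handled on the next pass through the loop. A compilable sketch of just that arithmetic:

	#include <assert.h>
	#include <stddef.h>

	static void
	ctr32_clamp_example(void)
	{
		unsigned int ctr32 = 0xfffffffeU;	/* counter taken from ivec+12 */
		size_t blocks = 5;			/* blocks requested by the caller */

		ctr32 += (unsigned int)blocks;		/* wraps modulo 2^32 to 3 */
		if (ctr32 < blocks) {
			blocks -= ctr32;		/* only 2 blocks fit before the wrap */
			ctr32 = 0;			/* ctr96_inc() then propagates the carry */
		}
		assert(blocks == 2 && ctr32 == 0);
	}
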
diff --git a/src/lib/libcrypto/modes/cts128.c b/src/lib/libcrypto/modes/cts128.c
deleted file mode 100644
index 802aa77cd5..0000000000
--- a/src/lib/libcrypto/modes/cts128.c
+++ /dev/null
@@ -1,267 +0,0 @@
1/* $OpenBSD: cts128.c,v 1.5 2015/07/19 18:27:26 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Rights for redistribution and usage in source and binary
6 * forms are granted according to the OpenSSL license.
7 */
8
9#include <openssl/crypto.h>
10#include "modes_lcl.h"
11#include <string.h>
12
13#ifndef MODES_DEBUG
14# ifndef NDEBUG
15# define NDEBUG
16# endif
17#endif
18
19/*
20 * The trouble with Ciphertext Stealing (CTS) mode is that there is no
21 * common official specification, only a couple of cipher/application
22 * specific ones: RFC 2040 and RFC 3962. Then there is the 'Proposal to
23 * Extend CBC Mode By "Ciphertext Stealing"' at the NIST site, which
24 * deviates from the mentioned RFCs. Most notably it allows the input
25 * to be of block length and it doesn't flip the order of the last two
26 * blocks. CTS has been discussed even in an ECB context, but it's not
27 * adopted for any known application. This implementation provides
28 * two interfaces: one compliant with the above mentioned RFCs and one
29 * compliant with the NIST proposal, both extending CBC mode.
30 */
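
A minimal usage sketch of the RFC-style interface below, assuming AES via <openssl/aes.h> as the block cipher and placeholder buffers; the input must be longer than one block and the ciphertext has the same length as the plaintext:

	#include <openssl/aes.h>
	#include <openssl/modes.h>

	static size_t
	aes_cts_example(const unsigned char key[16], unsigned char ivec[16],
	    const unsigned char *in, unsigned char *out, size_t len)
	{
		AES_KEY aes;

		AES_set_encrypt_key(key, 128, &aes);
		/* returns 0 if len <= 16, otherwise the number of bytes written (== len) */
		return CRYPTO_cts128_encrypt_block(in, out, len, &aes, ivec,
		    (block128_f)AES_encrypt);
	}
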
31
32size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
33 size_t len, const void *key,
34 unsigned char ivec[16], block128_f block)
35{ size_t residue, n;
36
37 if (len <= 16) return 0;
38
39 if ((residue=len%16) == 0) residue = 16;
40
41 len -= residue;
42
43 CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
44
45 in += len;
46 out += len;
47
48 for (n=0; n<residue; ++n)
49 ivec[n] ^= in[n];
50 (*block)(ivec,ivec,key);
51 memcpy(out,out-16,residue);
52 memcpy(out-16,ivec,16);
53
54 return len+residue;
55}
56
57size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
58 size_t len, const void *key,
59 unsigned char ivec[16], block128_f block)
60{ size_t residue, n;
61
62 if (len < 16) return 0;
63
64 residue=len%16;
65
66 len -= residue;
67
68 CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
69
70 if (residue==0) return len;
71
72 in += len;
73 out += len;
74
75 for (n=0; n<residue; ++n)
76 ivec[n] ^= in[n];
77 (*block)(ivec,ivec,key);
78 memcpy(out-16+residue,ivec,16);
79
80 return len+residue;
81}
82
83size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
84 size_t len, const void *key,
85 unsigned char ivec[16], cbc128_f cbc)
86{ size_t residue;
87 union { size_t align; unsigned char c[16]; } tmp;
88
89 if (len <= 16) return 0;
90
91 if ((residue=len%16) == 0) residue = 16;
92
93 len -= residue;
94
95 (*cbc)(in,out,len,key,ivec,1);
96
97 in += len;
98 out += len;
99
100 memset(tmp.c,0,sizeof(tmp));
101 memcpy(tmp.c,in,residue);
102 memcpy(out,out-16,residue);
103 (*cbc)(tmp.c,out-16,16,key,ivec,1);
104 return len+residue;
105}
106
107size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
108 size_t len, const void *key,
109 unsigned char ivec[16], cbc128_f cbc)
110{ size_t residue;
111 union { size_t align; unsigned char c[16]; } tmp;
112
113 if (len < 16) return 0;
114
115 residue=len%16;
116
117 len -= residue;
118
119 (*cbc)(in,out,len,key,ivec,1);
120
121 if (residue==0) return len;
122
123 in += len;
124 out += len;
125
126 memset(tmp.c,0,sizeof(tmp));
127 memcpy(tmp.c,in,residue);
128 (*cbc)(tmp.c,out-16+residue,16,key,ivec,1);
129 return len+residue;
130}
131
132size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
133 size_t len, const void *key,
134 unsigned char ivec[16], block128_f block)
135{ size_t residue, n;
136 union { size_t align; unsigned char c[32]; } tmp;
137
138 if (len<=16) return 0;
139
140 if ((residue=len%16) == 0) residue = 16;
141
142 len -= 16+residue;
143
144 if (len) {
145 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
146 in += len;
147 out += len;
148 }
149
150 (*block)(in,tmp.c+16,key);
151
152 memcpy(tmp.c,tmp.c+16,16);
153 memcpy(tmp.c,in+16,residue);
154 (*block)(tmp.c,tmp.c,key);
155
156 for(n=0; n<16; ++n) {
157 unsigned char c = in[n];
158 out[n] = tmp.c[n] ^ ivec[n];
159 ivec[n] = c;
160 }
161 for(residue+=16; n<residue; ++n)
162 out[n] = tmp.c[n] ^ in[n];
163
164 return 16+len+residue;
165}
166
167size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
168 size_t len, const void *key,
169 unsigned char ivec[16], block128_f block)
170{ size_t residue, n;
171 union { size_t align; unsigned char c[32]; } tmp;
172
173 if (len<16) return 0;
174
175 residue=len%16;
176
177 if (residue==0) {
178 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
179 return len;
180 }
181
182 len -= 16+residue;
183
184 if (len) {
185 CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
186 in += len;
187 out += len;
188 }
189
190 (*block)(in+residue,tmp.c+16,key);
191
192 memcpy(tmp.c,tmp.c+16,16);
193 memcpy(tmp.c,in,residue);
194 (*block)(tmp.c,tmp.c,key);
195
196 for(n=0; n<16; ++n) {
197 unsigned char c = in[n];
198 out[n] = tmp.c[n] ^ ivec[n];
199 ivec[n] = in[n+residue];
200 tmp.c[n] = c;
201 }
202 for(residue+=16; n<residue; ++n)
203 out[n] = tmp.c[n] ^ tmp.c[n-16];
204
205 return 16+len+residue;
206}
207
208size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
209 size_t len, const void *key,
210 unsigned char ivec[16], cbc128_f cbc)
211{ size_t residue;
212 union { size_t align; unsigned char c[32]; } tmp;
213
214 if (len<=16) return 0;
215
216 if ((residue=len%16) == 0) residue = 16;
217
218 len -= 16+residue;
219
220 if (len) {
221 (*cbc)(in,out,len,key,ivec,0);
222 in += len;
223 out += len;
224 }
225
226 memset(tmp.c,0,sizeof(tmp));
227 /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
228 (*cbc)(in,tmp.c,16,key,tmp.c+16,0);
229
230 memcpy(tmp.c,in+16,residue);
231 (*cbc)(tmp.c,tmp.c,32,key,ivec,0);
232 memcpy(out,tmp.c,16+residue);
233 return 16+len+residue;
234}
235
236size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
237 size_t len, const void *key,
238 unsigned char ivec[16], cbc128_f cbc)
239{ size_t residue;
240 union { size_t align; unsigned char c[32]; } tmp;
241
242 if (len<16) return 0;
243
244 residue=len%16;
245
246 if (residue==0) {
247 (*cbc)(in,out,len,key,ivec,0);
248 return len;
249 }
250
251 len -= 16+residue;
252
253 if (len) {
254 (*cbc)(in,out,len,key,ivec,0);
255 in += len;
256 out += len;
257 }
258
259 memset(tmp.c,0,sizeof(tmp));
260 /* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
261 (*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0);
262
263 memcpy(tmp.c,in,residue);
264 (*cbc)(tmp.c,tmp.c,32,key,ivec,0);
265 memcpy(out,tmp.c,16+residue);
266 return 16+len+residue;
267}
diff --git a/src/lib/libcrypto/modes/gcm128.c b/src/lib/libcrypto/modes/gcm128.c
deleted file mode 100644
index dd6d91e880..0000000000
--- a/src/lib/libcrypto/modes/gcm128.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1/* $OpenBSD: gcm128.c,v 1.13 2015/09/10 15:56:25 jsing Exp $ */
2/* ====================================================================
3 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 */
50
51#define OPENSSL_FIPSAPI
52
53#include <openssl/crypto.h>
54#include "modes_lcl.h"
55#include <string.h>
56
57#ifndef MODES_DEBUG
58# ifndef NDEBUG
59# define NDEBUG
60# endif
61#endif
62
63#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
64/* redefine, because alignment is ensured */
65#undef GETU32
66#define GETU32(p) BSWAP4(*(const u32 *)(p))
67#undef PUTU32
68#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
69#endif
70
71#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
72#define REDUCE1BIT(V) \
73 do { \
74 if (sizeof(size_t)==8) { \
75 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
76 V.lo = (V.hi<<63)|(V.lo>>1); \
77 V.hi = (V.hi>>1 )^T; \
78 } else { \
79 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 V.lo = (V.hi<<63)|(V.lo>>1); \
81 V.hi = (V.hi>>1 )^((u64)T<<32); \
82 } \
83 } while(0)
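
For reference, REDUCE1BIT performs one multiplication by x in the bit-reflected representation of GF(2^128) used by GCM: V is shifted right by one bit and, if the bit shifted out was set, reduced by the field polynomial, which shows up as the 0xe1 constant in the top byte. A sketch of the relation it implements:

	/*
	 * p(x) = x^128 + x^7 + x^2 + x + 1
	 *
	 * V*x mod p(x) = (V >> 1)                     if lsb(V) == 0
	 * V*x mod p(x) = (V >> 1) ^ (0xe1 || 0^120)   if lsb(V) == 1
	 */
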
84
85/*
86 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
87 * should never be set to 8; 8 is effectively reserved for testing.
88 * TABLE_BITS>1 selects lookup-table-driven implementations referred
89 * to as "Shoup's" in the GCM specification. In other words OpenSSL
90 * does not cover the whole spectrum of possible table-driven
91 * implementations. Why? In the non-"Shoup's" case the memory access
92 * pattern is segmented in such a manner that it is trivial to see
93 * that cache-timing information can reveal a fair portion of the
94 * intermediate hash value. Given that the ciphertext is always
95 * available to an attacker, it is possible to attempt to deduce the
96 * secret parameter H and, if successful, tamper with messages
97 * [which is trivial in CTR mode]. In the "Shoup's" case it is not
98 * as easy, but there is no reason to believe that it is resistant
99 * to cache-timing attacks. The "8-bit" implementation also consumes
100 * 16 (sixteen) times more memory, 4KB per individual key + 1KB
101 * shared. On the pro side, it should be twice as fast as the
102 * "4-bit" version; for gcc-generated x86[_64] code the "8-bit"
103 * version was observed to run ~75% faster, closer to 100% for
104 * commercial compilers... Yet the "4-bit" procedure is preferred,
105 * because it is believed to provide a better security/performance
106 * balance and adequate all-round performance. "All-round" refers to:
107 *
108 * - shorter setup time effectively improves overall timing for
109 * handling short messages;
110 * - a larger table allocation can become unbearable because of VM
111 * subsystem penalties (for example on Windows a large enough free
112 * results in VM working set trimming, meaning that a subsequent
113 * malloc would immediately incur working set expansion);
114 * - a larger table has a larger cache footprint, which can affect
115 * the performance of other code paths (not necessarily even from
116 * the same thread in a Hyper-Threading world);
117 * A value of 1 is not appropriate for performance reasons.
118 */
119#if TABLE_BITS==8
120
121static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122{
123 int i, j;
124 u128 V;
125
126 Htable[0].hi = 0;
127 Htable[0].lo = 0;
128 V.hi = H[0];
129 V.lo = H[1];
130
131 for (Htable[128]=V, i=64; i>0; i>>=1) {
132 REDUCE1BIT(V);
133 Htable[i] = V;
134 }
135
136 for (i=2; i<256; i<<=1) {
137 u128 *Hi = Htable+i, H0 = *Hi;
138 for (j=1; j<i; ++j) {
139 Hi[j].hi = H0.hi^Htable[j].hi;
140 Hi[j].lo = H0.lo^Htable[j].lo;
141 }
142 }
143}
144
145static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146{
147 u128 Z = { 0, 0};
148 const u8 *xi = (const u8 *)Xi+15;
149 size_t rem, n = *xi;
150 static const size_t rem_8bit[256] = {
151 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
152 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
153 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
154 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
155 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
156 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
157 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
158 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
159 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
160 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
161 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
162 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
163 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
164 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
165 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
166 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
167 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
168 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
169 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
170 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
171 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
172 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
173 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
174 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
175 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
176 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
177 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
178 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
179 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
180 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
181 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
182 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
183 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
184 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
185 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
186 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
187 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
188 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
189 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
190 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
191 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
192 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
193 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
194 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
195 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
196 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
197 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
198 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
199 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
200 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
201 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
202 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
203 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
204 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
205 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
206 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
207 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
208 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
209 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
210 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
211 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
212 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
213 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
214 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
215
216 while (1) {
217 Z.hi ^= Htable[n].hi;
218 Z.lo ^= Htable[n].lo;
219
220 if ((u8 *)Xi==xi) break;
221
222 n = *(--xi);
223
224 rem = (size_t)Z.lo&0xff;
225 Z.lo = (Z.hi<<56)|(Z.lo>>8);
226 Z.hi = (Z.hi>>8);
227 if (sizeof(size_t)==8)
228 Z.hi ^= rem_8bit[rem];
229 else
230 Z.hi ^= (u64)rem_8bit[rem]<<32;
231 }
232
233 if (BYTE_ORDER == LITTLE_ENDIAN) {
234#ifdef BSWAP8
235 Xi[0] = BSWAP8(Z.hi);
236 Xi[1] = BSWAP8(Z.lo);
237#else
238 u8 *p = (u8 *)Xi;
239 u32 v;
240 v = (u32)(Z.hi>>32); PUTU32(p,v);
241 v = (u32)(Z.hi); PUTU32(p+4,v);
242 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
243 v = (u32)(Z.lo); PUTU32(p+12,v);
244#endif
245 }
246 else {
247 Xi[0] = Z.hi;
248 Xi[1] = Z.lo;
249 }
250}
251#define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
252
253#elif TABLE_BITS==4
254
255static void gcm_init_4bit(u128 Htable[16], u64 H[2])
256{
257 u128 V;
258#if defined(OPENSSL_SMALL_FOOTPRINT)
259 int i;
260#endif
261
262 Htable[0].hi = 0;
263 Htable[0].lo = 0;
264 V.hi = H[0];
265 V.lo = H[1];
266
267#if defined(OPENSSL_SMALL_FOOTPRINT)
268 for (Htable[8]=V, i=4; i>0; i>>=1) {
269 REDUCE1BIT(V);
270 Htable[i] = V;
271 }
272
273 for (i=2; i<16; i<<=1) {
274 u128 *Hi = Htable+i;
275 int j;
276 for (V=*Hi, j=1; j<i; ++j) {
277 Hi[j].hi = V.hi^Htable[j].hi;
278 Hi[j].lo = V.lo^Htable[j].lo;
279 }
280 }
281#else
282 Htable[8] = V;
283 REDUCE1BIT(V);
284 Htable[4] = V;
285 REDUCE1BIT(V);
286 Htable[2] = V;
287 REDUCE1BIT(V);
288 Htable[1] = V;
289 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
290 V=Htable[4];
291 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
292 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
293 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
294 V=Htable[8];
295 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
296 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
297 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
298 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
299 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
300 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
301 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
302#endif
303#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
304 /*
305 * ARM assembler expects specific dword order in Htable.
306 */
307 {
308 int j;
309
310 if (BYTE_ORDER == LITTLE_ENDIAN)
311 for (j=0;j<16;++j) {
312 V = Htable[j];
313 Htable[j].hi = V.lo;
314 Htable[j].lo = V.hi;
315 }
316 else
317 for (j=0;j<16;++j) {
318 V = Htable[j];
319 Htable[j].hi = V.lo<<32|V.lo>>32;
320 Htable[j].lo = V.hi<<32|V.hi>>32;
321 }
322 }
323#endif
324}
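
A sketch of the invariant established above (in the same bit-reflected representation): each table entry is H multiplied by its index read as a 4-term polynomial, which is what gcm_gmult_4bit and gcm_ghash_4bit below rely on when consuming Xi one nibble at a time:

	/*
	 * For n = 8*n3 + 4*n2 + 2*n1 + n0 (each ni in {0,1}):
	 *
	 *	Htable[n] = (n3 + n2*x + n1*x^2 + n0*x^3) * H
	 *
	 * e.g. Htable[8] = H, Htable[4] = H*x, Htable[2] = H*x^2,
	 * Htable[1] = H*x^3.
	 */
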
325
326#ifndef GHASH_ASM
327static const size_t rem_4bit[16] = {
328 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
329 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
330 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
331 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
332
333static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
334{
335 u128 Z;
336 int cnt = 15;
337 size_t rem, nlo, nhi;
338
339 nlo = ((const u8 *)Xi)[15];
340 nhi = nlo>>4;
341 nlo &= 0xf;
342
343 Z.hi = Htable[nlo].hi;
344 Z.lo = Htable[nlo].lo;
345
346 while (1) {
347 rem = (size_t)Z.lo&0xf;
348 Z.lo = (Z.hi<<60)|(Z.lo>>4);
349 Z.hi = (Z.hi>>4);
350 if (sizeof(size_t)==8)
351 Z.hi ^= rem_4bit[rem];
352 else
353 Z.hi ^= (u64)rem_4bit[rem]<<32;
354
355 Z.hi ^= Htable[nhi].hi;
356 Z.lo ^= Htable[nhi].lo;
357
358 if (--cnt<0) break;
359
360 nlo = ((const u8 *)Xi)[cnt];
361 nhi = nlo>>4;
362 nlo &= 0xf;
363
364 rem = (size_t)Z.lo&0xf;
365 Z.lo = (Z.hi<<60)|(Z.lo>>4);
366 Z.hi = (Z.hi>>4);
367 if (sizeof(size_t)==8)
368 Z.hi ^= rem_4bit[rem];
369 else
370 Z.hi ^= (u64)rem_4bit[rem]<<32;
371
372 Z.hi ^= Htable[nlo].hi;
373 Z.lo ^= Htable[nlo].lo;
374 }
375
376 if (BYTE_ORDER == LITTLE_ENDIAN) {
377#ifdef BSWAP8
378 Xi[0] = BSWAP8(Z.hi);
379 Xi[1] = BSWAP8(Z.lo);
380#else
381 u8 *p = (u8 *)Xi;
382 u32 v;
383 v = (u32)(Z.hi>>32); PUTU32(p,v);
384 v = (u32)(Z.hi); PUTU32(p+4,v);
385 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
386 v = (u32)(Z.lo); PUTU32(p+12,v);
387#endif
388 }
389 else {
390 Xi[0] = Z.hi;
391 Xi[1] = Z.lo;
392 }
393}
394
395#if !defined(OPENSSL_SMALL_FOOTPRINT)
396/*
397 * Streamed version of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
398 * for details... Compiler-generated code doesn't seem to give any
399 * performance improvement, at least not on x86[_64]. It's here
400 * mostly as a reference and a placeholder for possible future
401 * non-trivial optimization[s]...
402 */
403static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
404 const u8 *inp,size_t len)
405{
406 u128 Z;
407 int cnt;
408 size_t rem, nlo, nhi;
409
410#if 1
411 do {
412 cnt = 15;
413 nlo = ((const u8 *)Xi)[15];
414 nlo ^= inp[15];
415 nhi = nlo>>4;
416 nlo &= 0xf;
417
418 Z.hi = Htable[nlo].hi;
419 Z.lo = Htable[nlo].lo;
420
421 while (1) {
422 rem = (size_t)Z.lo&0xf;
423 Z.lo = (Z.hi<<60)|(Z.lo>>4);
424 Z.hi = (Z.hi>>4);
425 if (sizeof(size_t)==8)
426 Z.hi ^= rem_4bit[rem];
427 else
428 Z.hi ^= (u64)rem_4bit[rem]<<32;
429
430 Z.hi ^= Htable[nhi].hi;
431 Z.lo ^= Htable[nhi].lo;
432
433 if (--cnt<0) break;
434
435 nlo = ((const u8 *)Xi)[cnt];
436 nlo ^= inp[cnt];
437 nhi = nlo>>4;
438 nlo &= 0xf;
439
440 rem = (size_t)Z.lo&0xf;
441 Z.lo = (Z.hi<<60)|(Z.lo>>4);
442 Z.hi = (Z.hi>>4);
443 if (sizeof(size_t)==8)
444 Z.hi ^= rem_4bit[rem];
445 else
446 Z.hi ^= (u64)rem_4bit[rem]<<32;
447
448 Z.hi ^= Htable[nlo].hi;
449 Z.lo ^= Htable[nlo].lo;
450 }
451#else
452 /*
453 * Extra 256+16 bytes per-key plus 512 bytes shared tables
454 * [should] give ~50% improvement... One could have PACK()-ed
455 * the rem_8bit even here, but the priority is to minimize
456 * cache footprint...
457 */
458 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
459 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
460 static const unsigned short rem_8bit[256] = {
461 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
462 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
463 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
464 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
465 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
466 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
467 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
468 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
469 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
470 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
471 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
472 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
473 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
474 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
475 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
476 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
477 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
478 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
479 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
480 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
481 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
482 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
483 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
484 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
485 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
486 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
487 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
488 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
489 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
490 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
491 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
492 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
493	/*
494	 * This pre-processing phase slows the procedure down by about as
495	 * much time as it makes each loop iteration faster. In other words,
496	 * single-block performance is approximately the same as for the
497	 * straightforward "4-bit" implementation; beyond that it only gets faster...
498	 */
499 for (cnt=0; cnt<16; ++cnt) {
500 Z.hi = Htable[cnt].hi;
501 Z.lo = Htable[cnt].lo;
502 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
503 Hshr4[cnt].hi = (Z.hi>>4);
504 Hshl4[cnt] = (u8)(Z.lo<<4);
505 }
506
507 do {
508 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
509 nlo = ((const u8 *)Xi)[cnt];
510 nlo ^= inp[cnt];
511 nhi = nlo>>4;
512 nlo &= 0xf;
513
514 Z.hi ^= Htable[nlo].hi;
515 Z.lo ^= Htable[nlo].lo;
516
517 rem = (size_t)Z.lo&0xff;
518
519 Z.lo = (Z.hi<<56)|(Z.lo>>8);
520 Z.hi = (Z.hi>>8);
521
522 Z.hi ^= Hshr4[nhi].hi;
523 Z.lo ^= Hshr4[nhi].lo;
524 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
525 }
526
527 nlo = ((const u8 *)Xi)[0];
528 nlo ^= inp[0];
529 nhi = nlo>>4;
530 nlo &= 0xf;
531
532 Z.hi ^= Htable[nlo].hi;
533 Z.lo ^= Htable[nlo].lo;
534
535 rem = (size_t)Z.lo&0xf;
536
537 Z.lo = (Z.hi<<60)|(Z.lo>>4);
538 Z.hi = (Z.hi>>4);
539
540 Z.hi ^= Htable[nhi].hi;
541 Z.lo ^= Htable[nhi].lo;
542 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
543#endif
544
545 if (BYTE_ORDER == LITTLE_ENDIAN) {
546#ifdef BSWAP8
547 Xi[0] = BSWAP8(Z.hi);
548 Xi[1] = BSWAP8(Z.lo);
549#else
550 u8 *p = (u8 *)Xi;
551 u32 v;
552 v = (u32)(Z.hi>>32); PUTU32(p,v);
553 v = (u32)(Z.hi); PUTU32(p+4,v);
554 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
555 v = (u32)(Z.lo); PUTU32(p+12,v);
556#endif
557 }
558 else {
559 Xi[0] = Z.hi;
560 Xi[1] = Z.lo;
561 }
562 } while (inp+=16, len-=16);
563}
564#endif
565#else
566void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
567void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
568#endif
569
570#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
571#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
572#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
573/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
574 * trashing effect. In other words idea is to hash data while it's
575 * still in L1 cache after encryption pass... */
576#define GHASH_CHUNK (3*1024)
577#endif
578
579#else /* TABLE_BITS */
580
581static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
582{
583 u128 V,Z = { 0,0 };
584 long X;
585 int i,j;
586 const long *xi = (const long *)Xi;
587
588 V.hi = H[0]; /* H is in host byte order, no byte swapping */
589 V.lo = H[1];
590
591 for (j=0; j<16/sizeof(long); ++j) {
592 if (BYTE_ORDER == LITTLE_ENDIAN) {
593 if (sizeof(long)==8) {
594#ifdef BSWAP8
595 X = (long)(BSWAP8(xi[j]));
596#else
597 const u8 *p = (const u8 *)(xi+j);
598 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
599#endif
600 }
601 else {
602 const u8 *p = (const u8 *)(xi+j);
603 X = (long)GETU32(p);
604 }
605 }
606 else
607 X = xi[j];
608
609 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
610 u64 M = (u64)(X>>(8*sizeof(long)-1));
611 Z.hi ^= V.hi&M;
612 Z.lo ^= V.lo&M;
613
614 REDUCE1BIT(V);
615 }
616 }
617
618 if (BYTE_ORDER == LITTLE_ENDIAN) {
619#ifdef BSWAP8
620 Xi[0] = BSWAP8(Z.hi);
621 Xi[1] = BSWAP8(Z.lo);
622#else
623 u8 *p = (u8 *)Xi;
624 u32 v;
625 v = (u32)(Z.hi>>32); PUTU32(p,v);
626 v = (u32)(Z.hi); PUTU32(p+4,v);
627 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
628 v = (u32)(Z.lo); PUTU32(p+12,v);
629#endif
630 }
631 else {
632 Xi[0] = Z.hi;
633 Xi[1] = Z.lo;
634 }
635}
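
The bit-serial loop above is the schoolbook shift-and-add multiplication in GF(2^128): each set bit of Xi contributes the corresponding x-power multiple of H to the product, roughly:

	/*
	 * Z = XOR over all i with X_i == 1 of (H * x^i),  i = 0..127,
	 *
	 * where V runs through H, H*x, H*x^2, ... via REDUCE1BIT and the
	 * mask M selects V whenever the next bit of X (taken in processing
	 * order) is set.
	 */
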
636#define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
637
638#endif
639
640#if TABLE_BITS==4 && defined(GHASH_ASM)
641# if !defined(I386_ONLY) && \
642 (defined(__i386) || defined(__i386__) || \
643 defined(__x86_64) || defined(__x86_64__) || \
644 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
645# define GHASH_ASM_X86_OR_64
646# define GCM_FUNCREF_4BIT
647extern unsigned int OPENSSL_ia32cap_P[2];
648
649void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
650void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
651void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
652
653# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
654# define GHASH_ASM_X86
655void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
656void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657
658void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
659void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660# endif
661# elif defined(__arm__) || defined(__arm)
662# include "arm_arch.h"
663# if __ARM_ARCH__>=7
664# define GHASH_ASM_ARM
665# define GCM_FUNCREF_4BIT
666void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
667void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
668# endif
669# endif
670#endif
671
672#ifdef GCM_FUNCREF_4BIT
673# undef GCM_MUL
674# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
675# ifdef GHASH
676# undef GHASH
677# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
678# endif
679#endif
680
681void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
682{
683 memset(ctx,0,sizeof(*ctx));
684 ctx->block = block;
685 ctx->key = key;
686
687 (*block)(ctx->H.c,ctx->H.c,key);
688
689 if (BYTE_ORDER == LITTLE_ENDIAN) {
690 /* H is stored in host byte order */
691#ifdef BSWAP8
692 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
693 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
694#else
695 u8 *p = ctx->H.c;
696 u64 hi,lo;
697 hi = (u64)GETU32(p) <<32|GETU32(p+4);
698 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
699 ctx->H.u[0] = hi;
700 ctx->H.u[1] = lo;
701#endif
702 }
703
704#if TABLE_BITS==8
705 gcm_init_8bit(ctx->Htable,ctx->H.u);
706#elif TABLE_BITS==4
707# if defined(GHASH_ASM_X86_OR_64)
708# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
709 if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */
710 OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */
711 gcm_init_clmul(ctx->Htable,ctx->H.u);
712 ctx->gmult = gcm_gmult_clmul;
713 ctx->ghash = gcm_ghash_clmul;
714 return;
715 }
716# endif
717 gcm_init_4bit(ctx->Htable,ctx->H.u);
718# if defined(GHASH_ASM_X86) /* x86 only */
719# if defined(OPENSSL_IA32_SSE2)
720 if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */
721# else
722 if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
723# endif
724 ctx->gmult = gcm_gmult_4bit_mmx;
725 ctx->ghash = gcm_ghash_4bit_mmx;
726 } else {
727 ctx->gmult = gcm_gmult_4bit_x86;
728 ctx->ghash = gcm_ghash_4bit_x86;
729 }
730# else
731 ctx->gmult = gcm_gmult_4bit;
732 ctx->ghash = gcm_ghash_4bit;
733# endif
734# elif defined(GHASH_ASM_ARM)
735 if (OPENSSL_armcap_P & ARMV7_NEON) {
736 ctx->gmult = gcm_gmult_neon;
737 ctx->ghash = gcm_ghash_neon;
738 } else {
739 gcm_init_4bit(ctx->Htable,ctx->H.u);
740 ctx->gmult = gcm_gmult_4bit;
741 ctx->ghash = gcm_ghash_4bit;
742 }
743# else
744 gcm_init_4bit(ctx->Htable,ctx->H.u);
745# endif
746#endif
747}
748
749void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
750{
751 unsigned int ctr;
752#ifdef GCM_FUNCREF_4BIT
753 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
754#endif
755
756 ctx->Yi.u[0] = 0;
757 ctx->Yi.u[1] = 0;
758 ctx->Xi.u[0] = 0;
759 ctx->Xi.u[1] = 0;
760 ctx->len.u[0] = 0; /* AAD length */
761 ctx->len.u[1] = 0; /* message length */
762 ctx->ares = 0;
763 ctx->mres = 0;
764
765 if (len==12) {
766 memcpy(ctx->Yi.c,iv,12);
767 ctx->Yi.c[15]=1;
768 ctr=1;
769 }
770 else {
771 size_t i;
772 u64 len0 = len;
773
774 while (len>=16) {
775 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
776 GCM_MUL(ctx,Yi);
777 iv += 16;
778 len -= 16;
779 }
780 if (len) {
781 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
782 GCM_MUL(ctx,Yi);
783 }
784 len0 <<= 3;
785 if (BYTE_ORDER == LITTLE_ENDIAN) {
786#ifdef BSWAP8
787 ctx->Yi.u[1] ^= BSWAP8(len0);
788#else
789 ctx->Yi.c[8] ^= (u8)(len0>>56);
790 ctx->Yi.c[9] ^= (u8)(len0>>48);
791 ctx->Yi.c[10] ^= (u8)(len0>>40);
792 ctx->Yi.c[11] ^= (u8)(len0>>32);
793 ctx->Yi.c[12] ^= (u8)(len0>>24);
794 ctx->Yi.c[13] ^= (u8)(len0>>16);
795 ctx->Yi.c[14] ^= (u8)(len0>>8);
796 ctx->Yi.c[15] ^= (u8)(len0);
797#endif
798 }
799 else
800 ctx->Yi.u[1] ^= len0;
801
802 GCM_MUL(ctx,Yi);
803
804 if (BYTE_ORDER == LITTLE_ENDIAN)
805#ifdef BSWAP4
806 ctr = BSWAP4(ctx->Yi.d[3]);
807#else
808 ctr = GETU32(ctx->Yi.c+12);
809#endif
810 else
811 ctr = ctx->Yi.d[3];
812 }
813
814 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
815 ++ctr;
816 if (BYTE_ORDER == LITTLE_ENDIAN)
817#ifdef BSWAP4
818 ctx->Yi.d[3] = BSWAP4(ctr);
819#else
820 PUTU32(ctx->Yi.c+12,ctr);
821#endif
822 else
823 ctx->Yi.d[3] = ctr;
824}
825
826int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
827{
828 size_t i;
829 unsigned int n;
830 u64 alen = ctx->len.u[0];
831#ifdef GCM_FUNCREF_4BIT
832 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
833# ifdef GHASH
834 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
835 const u8 *inp,size_t len) = ctx->ghash;
836# endif
837#endif
838
839 if (ctx->len.u[1]) return -2;
840
841 alen += len;
842 if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
843 return -1;
844 ctx->len.u[0] = alen;
845
846 n = ctx->ares;
847 if (n) {
848 while (n && len) {
849 ctx->Xi.c[n] ^= *(aad++);
850 --len;
851 n = (n+1)%16;
852 }
853 if (n==0) GCM_MUL(ctx,Xi);
854 else {
855 ctx->ares = n;
856 return 0;
857 }
858 }
859
860#ifdef GHASH
861 if ((i = (len&(size_t)-16))) {
862 GHASH(ctx,aad,i);
863 aad += i;
864 len -= i;
865 }
866#else
867 while (len>=16) {
868 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
869 GCM_MUL(ctx,Xi);
870 aad += 16;
871 len -= 16;
872 }
873#endif
874 if (len) {
875 n = (unsigned int)len;
876 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
877 }
878
879 ctx->ares = n;
880 return 0;
881}
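
The length checks in this function and in the encrypt/decrypt paths below correspond to the per-invocation limits from the GCM specification, expressed here in bytes:

	/*
	 * AAD:       alen <= 2^61 bytes       (~ 2^64 bits)
	 * plaintext: mlen <= 2^36 - 32 bytes  (= 2^39 - 256 bits)
	 */
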
882
883int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
884 const unsigned char *in, unsigned char *out,
885 size_t len)
886{
887 unsigned int n, ctr;
888 size_t i;
889 u64 mlen = ctx->len.u[1];
890 block128_f block = ctx->block;
891 void *key = ctx->key;
892#ifdef GCM_FUNCREF_4BIT
893 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
894# ifdef GHASH
895 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
896 const u8 *inp,size_t len) = ctx->ghash;
897# endif
898#endif
899
900 mlen += len;
901 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
902 return -1;
903 ctx->len.u[1] = mlen;
904
905 if (ctx->ares) {
906 /* First call to encrypt finalizes GHASH(AAD) */
907 GCM_MUL(ctx,Xi);
908 ctx->ares = 0;
909 }
910
911 if (BYTE_ORDER == LITTLE_ENDIAN)
912#ifdef BSWAP4
913 ctr = BSWAP4(ctx->Yi.d[3]);
914#else
915 ctr = GETU32(ctx->Yi.c+12);
916#endif
917 else
918 ctr = ctx->Yi.d[3];
919
920 n = ctx->mres;
921#if !defined(OPENSSL_SMALL_FOOTPRINT)
922 if (16%sizeof(size_t) == 0) do { /* always true actually */
923 if (n) {
924 while (n && len) {
925 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
926 --len;
927 n = (n+1)%16;
928 }
929 if (n==0) GCM_MUL(ctx,Xi);
930 else {
931 ctx->mres = n;
932 return 0;
933 }
934 }
935#ifdef __STRICT_ALIGNMENT
936 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
937 break;
938#endif
939#if defined(GHASH) && defined(GHASH_CHUNK)
940 while (len>=GHASH_CHUNK) {
941 size_t j=GHASH_CHUNK;
942
943 while (j) {
944 size_t *out_t=(size_t *)out;
945 const size_t *in_t=(const size_t *)in;
946
947 (*block)(ctx->Yi.c,ctx->EKi.c,key);
948 ++ctr;
949 if (BYTE_ORDER == LITTLE_ENDIAN)
950#ifdef BSWAP4
951 ctx->Yi.d[3] = BSWAP4(ctr);
952#else
953 PUTU32(ctx->Yi.c+12,ctr);
954#endif
955 else
956 ctx->Yi.d[3] = ctr;
957 for (i=0; i<16/sizeof(size_t); ++i)
958 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
959 out += 16;
960 in += 16;
961 j -= 16;
962 }
963 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
964 len -= GHASH_CHUNK;
965 }
966 if ((i = (len&(size_t)-16))) {
967 size_t j=i;
968
969 while (len>=16) {
970 size_t *out_t=(size_t *)out;
971 const size_t *in_t=(const size_t *)in;
972
973 (*block)(ctx->Yi.c,ctx->EKi.c,key);
974 ++ctr;
975 if (BYTE_ORDER == LITTLE_ENDIAN)
976#ifdef BSWAP4
977 ctx->Yi.d[3] = BSWAP4(ctr);
978#else
979 PUTU32(ctx->Yi.c+12,ctr);
980#endif
981 else
982 ctx->Yi.d[3] = ctr;
983 for (i=0; i<16/sizeof(size_t); ++i)
984 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
985 out += 16;
986 in += 16;
987 len -= 16;
988 }
989 GHASH(ctx,out-j,j);
990 }
991#else
992 while (len>=16) {
993 size_t *out_t=(size_t *)out;
994 const size_t *in_t=(const size_t *)in;
995
996 (*block)(ctx->Yi.c,ctx->EKi.c,key);
997 ++ctr;
998 if (BYTE_ORDER == LITTLE_ENDIAN)
999#ifdef BSWAP4
1000 ctx->Yi.d[3] = BSWAP4(ctr);
1001#else
1002 PUTU32(ctx->Yi.c+12,ctr);
1003#endif
1004 else
1005 ctx->Yi.d[3] = ctr;
1006 for (i=0; i<16/sizeof(size_t); ++i)
1007 ctx->Xi.t[i] ^=
1008 out_t[i] = in_t[i]^ctx->EKi.t[i];
1009 GCM_MUL(ctx,Xi);
1010 out += 16;
1011 in += 16;
1012 len -= 16;
1013 }
1014#endif
1015 if (len) {
1016 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1017 ++ctr;
1018 if (BYTE_ORDER == LITTLE_ENDIAN)
1019#ifdef BSWAP4
1020 ctx->Yi.d[3] = BSWAP4(ctr);
1021#else
1022 PUTU32(ctx->Yi.c+12,ctr);
1023#endif
1024 else
1025 ctx->Yi.d[3] = ctr;
1026 while (len--) {
1027 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1028 ++n;
1029 }
1030 }
1031
1032 ctx->mres = n;
1033 return 0;
1034 } while(0);
1035#endif
1036 for (i=0;i<len;++i) {
1037 if (n==0) {
1038 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1039 ++ctr;
1040 if (BYTE_ORDER == LITTLE_ENDIAN)
1041#ifdef BSWAP4
1042 ctx->Yi.d[3] = BSWAP4(ctr);
1043#else
1044 PUTU32(ctx->Yi.c+12,ctr);
1045#endif
1046 else
1047 ctx->Yi.d[3] = ctr;
1048 }
1049 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1050 n = (n+1)%16;
1051 if (n==0)
1052 GCM_MUL(ctx,Xi);
1053 }
1054
1055 ctx->mres = n;
1056 return 0;
1057}
1058
1059int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1060 const unsigned char *in, unsigned char *out,
1061 size_t len)
1062{
1063 unsigned int n, ctr;
1064 size_t i;
1065 u64 mlen = ctx->len.u[1];
1066 block128_f block = ctx->block;
1067 void *key = ctx->key;
1068#ifdef GCM_FUNCREF_4BIT
1069 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1070# ifdef GHASH
1071 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1072 const u8 *inp,size_t len) = ctx->ghash;
1073# endif
1074#endif
1075
1076 mlen += len;
1077 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1078 return -1;
1079 ctx->len.u[1] = mlen;
1080
1081 if (ctx->ares) {
1082 /* First call to decrypt finalizes GHASH(AAD) */
1083 GCM_MUL(ctx,Xi);
1084 ctx->ares = 0;
1085 }
1086
1087 if (BYTE_ORDER == LITTLE_ENDIAN)
1088#ifdef BSWAP4
1089 ctr = BSWAP4(ctx->Yi.d[3]);
1090#else
1091 ctr = GETU32(ctx->Yi.c+12);
1092#endif
1093 else
1094 ctr = ctx->Yi.d[3];
1095
1096 n = ctx->mres;
1097#if !defined(OPENSSL_SMALL_FOOTPRINT)
1098 if (16%sizeof(size_t) == 0) do { /* always true actually */
1099 if (n) {
1100 while (n && len) {
1101 u8 c = *(in++);
1102 *(out++) = c^ctx->EKi.c[n];
1103 ctx->Xi.c[n] ^= c;
1104 --len;
1105 n = (n+1)%16;
1106 }
1107 if (n==0) GCM_MUL (ctx,Xi);
1108 else {
1109 ctx->mres = n;
1110 return 0;
1111 }
1112 }
1113#ifdef __STRICT_ALIGNMENT
1114 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1115 break;
1116#endif
1117#if defined(GHASH) && defined(GHASH_CHUNK)
1118 while (len>=GHASH_CHUNK) {
1119 size_t j=GHASH_CHUNK;
1120
1121 GHASH(ctx,in,GHASH_CHUNK);
1122 while (j) {
1123 size_t *out_t=(size_t *)out;
1124 const size_t *in_t=(const size_t *)in;
1125
1126 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1127 ++ctr;
1128 if (BYTE_ORDER == LITTLE_ENDIAN)
1129#ifdef BSWAP4
1130 ctx->Yi.d[3] = BSWAP4(ctr);
1131#else
1132 PUTU32(ctx->Yi.c+12,ctr);
1133#endif
1134 else
1135 ctx->Yi.d[3] = ctr;
1136 for (i=0; i<16/sizeof(size_t); ++i)
1137 out_t[i] = in_t[i]^ctx->EKi.t[i];
1138 out += 16;
1139 in += 16;
1140 j -= 16;
1141 }
1142 len -= GHASH_CHUNK;
1143 }
1144 if ((i = (len&(size_t)-16))) {
1145 GHASH(ctx,in,i);
1146 while (len>=16) {
1147 size_t *out_t=(size_t *)out;
1148 const size_t *in_t=(const size_t *)in;
1149
1150 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1151 ++ctr;
1152 if (BYTE_ORDER == LITTLE_ENDIAN)
1153#ifdef BSWAP4
1154 ctx->Yi.d[3] = BSWAP4(ctr);
1155#else
1156 PUTU32(ctx->Yi.c+12,ctr);
1157#endif
1158 else
1159 ctx->Yi.d[3] = ctr;
1160 for (i=0; i<16/sizeof(size_t); ++i)
1161 out_t[i] = in_t[i]^ctx->EKi.t[i];
1162 out += 16;
1163 in += 16;
1164 len -= 16;
1165 }
1166 }
1167#else
1168 while (len>=16) {
1169 size_t *out_t=(size_t *)out;
1170 const size_t *in_t=(const size_t *)in;
1171
1172 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1173 ++ctr;
1174 if (BYTE_ORDER == LITTLE_ENDIAN)
1175#ifdef BSWAP4
1176 ctx->Yi.d[3] = BSWAP4(ctr);
1177#else
1178 PUTU32(ctx->Yi.c+12,ctr);
1179#endif
1180 else
1181 ctx->Yi.d[3] = ctr;
1182 for (i=0; i<16/sizeof(size_t); ++i) {
1183 size_t c = in[i];
1184 out[i] = c^ctx->EKi.t[i];
1185 ctx->Xi.t[i] ^= c;
1186 }
1187 GCM_MUL(ctx,Xi);
1188 out += 16;
1189 in += 16;
1190 len -= 16;
1191 }
1192#endif
1193 if (len) {
1194 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1195 ++ctr;
1196 if (BYTE_ORDER == LITTLE_ENDIAN)
1197#ifdef BSWAP4
1198 ctx->Yi.d[3] = BSWAP4(ctr);
1199#else
1200 PUTU32(ctx->Yi.c+12,ctr);
1201#endif
1202 else
1203 ctx->Yi.d[3] = ctr;
1204 while (len--) {
1205 u8 c = in[n];
1206 ctx->Xi.c[n] ^= c;
1207 out[n] = c^ctx->EKi.c[n];
1208 ++n;
1209 }
1210 }
1211
1212 ctx->mres = n;
1213 return 0;
1214 } while(0);
1215#endif
1216 for (i=0;i<len;++i) {
1217 u8 c;
1218 if (n==0) {
1219 (*block)(ctx->Yi.c,ctx->EKi.c,key);
1220 ++ctr;
1221 if (BYTE_ORDER == LITTLE_ENDIAN)
1222#ifdef BSWAP4
1223 ctx->Yi.d[3] = BSWAP4(ctr);
1224#else
1225 PUTU32(ctx->Yi.c+12,ctr);
1226#endif
1227 else
1228 ctx->Yi.d[3] = ctr;
1229 }
1230 c = in[i];
1231 out[i] = c^ctx->EKi.c[n];
1232 ctx->Xi.c[n] ^= c;
1233 n = (n+1)%16;
1234 if (n==0)
1235 GCM_MUL(ctx,Xi);
1236 }
1237
1238 ctx->mres = n;
1239 return 0;
1240}
1241
1242int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1243 const unsigned char *in, unsigned char *out,
1244 size_t len, ctr128_f stream)
1245{
1246 unsigned int n, ctr;
1247 size_t i;
1248 u64 mlen = ctx->len.u[1];
1249 void *key = ctx->key;
1250#ifdef GCM_FUNCREF_4BIT
1251 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1252# ifdef GHASH
1253 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1254 const u8 *inp,size_t len) = ctx->ghash;
1255# endif
1256#endif
1257
1258 mlen += len;
1259 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1260 return -1;
1261 ctx->len.u[1] = mlen;
1262
1263 if (ctx->ares) {
1264 /* First call to encrypt finalizes GHASH(AAD) */
1265 GCM_MUL(ctx,Xi);
1266 ctx->ares = 0;
1267 }
1268
1269 if (BYTE_ORDER == LITTLE_ENDIAN)
1270#ifdef BSWAP4
1271 ctr = BSWAP4(ctx->Yi.d[3]);
1272#else
1273 ctr = GETU32(ctx->Yi.c+12);
1274#endif
1275 else
1276 ctr = ctx->Yi.d[3];
1277
1278 n = ctx->mres;
1279 if (n) {
1280 while (n && len) {
1281 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1282 --len;
1283 n = (n+1)%16;
1284 }
1285 if (n==0) GCM_MUL(ctx,Xi);
1286 else {
1287 ctx->mres = n;
1288 return 0;
1289 }
1290 }
1291#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1292 while (len>=GHASH_CHUNK) {
1293 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1294 ctr += GHASH_CHUNK/16;
1295 if (BYTE_ORDER == LITTLE_ENDIAN)
1296#ifdef BSWAP4
1297 ctx->Yi.d[3] = BSWAP4(ctr);
1298#else
1299 PUTU32(ctx->Yi.c+12,ctr);
1300#endif
1301 else
1302 ctx->Yi.d[3] = ctr;
1303 GHASH(ctx,out,GHASH_CHUNK);
1304 out += GHASH_CHUNK;
1305 in += GHASH_CHUNK;
1306 len -= GHASH_CHUNK;
1307 }
1308#endif
1309 if ((i = (len&(size_t)-16))) {
1310 size_t j=i/16;
1311
1312 (*stream)(in,out,j,key,ctx->Yi.c);
1313 ctr += (unsigned int)j;
1314 if (BYTE_ORDER == LITTLE_ENDIAN)
1315#ifdef BSWAP4
1316 ctx->Yi.d[3] = BSWAP4(ctr);
1317#else
1318 PUTU32(ctx->Yi.c+12,ctr);
1319#endif
1320 else
1321 ctx->Yi.d[3] = ctr;
1322 in += i;
1323 len -= i;
1324#if defined(GHASH)
1325 GHASH(ctx,out,i);
1326 out += i;
1327#else
1328 while (j--) {
1329 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1330 GCM_MUL(ctx,Xi);
1331 out += 16;
1332 }
1333#endif
1334 }
1335 if (len) {
1336 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1337 ++ctr;
1338 if (BYTE_ORDER == LITTLE_ENDIAN)
1339#ifdef BSWAP4
1340 ctx->Yi.d[3] = BSWAP4(ctr);
1341#else
1342 PUTU32(ctx->Yi.c+12,ctr);
1343#endif
1344 else
1345 ctx->Yi.d[3] = ctr;
1346 while (len--) {
1347 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1348 ++n;
1349 }
1350 }
1351
1352 ctx->mres = n;
1353 return 0;
1354}
1355
1356int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1357 const unsigned char *in, unsigned char *out,
1358 size_t len,ctr128_f stream)
1359{
1360 unsigned int n, ctr;
1361 size_t i;
1362 u64 mlen = ctx->len.u[1];
1363 void *key = ctx->key;
1364#ifdef GCM_FUNCREF_4BIT
1365 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1366# ifdef GHASH
1367 void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1368 const u8 *inp,size_t len) = ctx->ghash;
1369# endif
1370#endif
1371
1372 mlen += len;
1373 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1374 return -1;
1375 ctx->len.u[1] = mlen;
1376
1377 if (ctx->ares) {
1378 /* First call to decrypt finalizes GHASH(AAD) */
1379 GCM_MUL(ctx,Xi);
1380 ctx->ares = 0;
1381 }
1382
1383 if (BYTE_ORDER == LITTLE_ENDIAN)
1384#ifdef BSWAP4
1385 ctr = BSWAP4(ctx->Yi.d[3]);
1386#else
1387 ctr = GETU32(ctx->Yi.c+12);
1388#endif
1389 else
1390 ctr = ctx->Yi.d[3];
1391
1392 n = ctx->mres;
1393 if (n) {
1394 while (n && len) {
1395 u8 c = *(in++);
1396 *(out++) = c^ctx->EKi.c[n];
1397 ctx->Xi.c[n] ^= c;
1398 --len;
1399 n = (n+1)%16;
1400 }
1401 if (n==0) GCM_MUL (ctx,Xi);
1402 else {
1403 ctx->mres = n;
1404 return 0;
1405 }
1406 }
1407#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1408 while (len>=GHASH_CHUNK) {
1409 GHASH(ctx,in,GHASH_CHUNK);
1410 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1411 ctr += GHASH_CHUNK/16;
1412 if (BYTE_ORDER == LITTLE_ENDIAN)
1413#ifdef BSWAP4
1414 ctx->Yi.d[3] = BSWAP4(ctr);
1415#else
1416 PUTU32(ctx->Yi.c+12,ctr);
1417#endif
1418 else
1419 ctx->Yi.d[3] = ctr;
1420 out += GHASH_CHUNK;
1421 in += GHASH_CHUNK;
1422 len -= GHASH_CHUNK;
1423 }
1424#endif
1425 if ((i = (len&(size_t)-16))) {
1426 size_t j=i/16;
1427
1428#if defined(GHASH)
1429 GHASH(ctx,in,i);
1430#else
1431 while (j--) {
1432 size_t k;
1433 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1434 GCM_MUL(ctx,Xi);
1435 in += 16;
1436 }
1437 j = i/16;
1438 in -= i;
1439#endif
1440 (*stream)(in,out,j,key,ctx->Yi.c);
1441 ctr += (unsigned int)j;
1442 if (BYTE_ORDER == LITTLE_ENDIAN)
1443#ifdef BSWAP4
1444 ctx->Yi.d[3] = BSWAP4(ctr);
1445#else
1446 PUTU32(ctx->Yi.c+12,ctr);
1447#endif
1448 else
1449 ctx->Yi.d[3] = ctr;
1450 out += i;
1451 in += i;
1452 len -= i;
1453 }
1454 if (len) {
1455 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1456 ++ctr;
1457 if (BYTE_ORDER == LITTLE_ENDIAN)
1458#ifdef BSWAP4
1459 ctx->Yi.d[3] = BSWAP4(ctr);
1460#else
1461 PUTU32(ctx->Yi.c+12,ctr);
1462#endif
1463 else
1464 ctx->Yi.d[3] = ctr;
1465 while (len--) {
1466 u8 c = in[n];
1467 ctx->Xi.c[n] ^= c;
1468 out[n] = c^ctx->EKi.c[n];
1469 ++n;
1470 }
1471 }
1472
1473 ctx->mres = n;
1474 return 0;
1475}
1476
1477int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1478 size_t len)
1479{
1480 u64 alen = ctx->len.u[0]<<3;
1481 u64 clen = ctx->len.u[1]<<3;
1482#ifdef GCM_FUNCREF_4BIT
1483 void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1484#endif
1485
1486 if (ctx->mres || ctx->ares)
1487 GCM_MUL(ctx,Xi);
1488
1489 if (BYTE_ORDER == LITTLE_ENDIAN) {
1490#ifdef BSWAP8
1491 alen = BSWAP8(alen);
1492 clen = BSWAP8(clen);
1493#else
1494 u8 *p = ctx->len.c;
1495
1496 ctx->len.u[0] = alen;
1497 ctx->len.u[1] = clen;
1498
1499 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1500 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1501#endif
1502 }
1503
1504 ctx->Xi.u[0] ^= alen;
1505 ctx->Xi.u[1] ^= clen;
1506 GCM_MUL(ctx,Xi);
1507
1508 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1509 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1510
1511 if (tag && len<=sizeof(ctx->Xi))
1512 return memcmp(ctx->Xi.c,tag,len);
1513 else
1514 return -1;
1515}
1516
1517void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1518{
1519 CRYPTO_gcm128_finish(ctx, NULL, 0);
1520 memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1521}
1522
1523GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1524{
1525 GCM128_CONTEXT *ret;
1526
1527 if ((ret = malloc(sizeof(GCM128_CONTEXT))))
1528 CRYPTO_gcm128_init(ret,key,block);
1529
1530 return ret;
1531}
1532
1533void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1534{
1535 if (ctx) {
1536 explicit_bzero(ctx,sizeof(*ctx));
1537 free(ctx);
1538 }
1539}
diff --git a/src/lib/libcrypto/modes/modes.h b/src/lib/libcrypto/modes/modes.h
deleted file mode 100644
index a532cb3f41..0000000000
--- a/src/lib/libcrypto/modes/modes.h
+++ /dev/null
@@ -1,136 +0,0 @@
1/* $OpenBSD: modes.h,v 1.2 2014/06/12 15:49:30 deraadt Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Rights for redistribution and usage in source and binary
6 * forms are granted according to the OpenSSL license.
7 */
8
9#include <stddef.h>
10
11typedef void (*block128_f)(const unsigned char in[16],
12 unsigned char out[16],
13 const void *key);
14
15typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out,
16 size_t len, const void *key,
17 unsigned char ivec[16], int enc);
18
19typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
20 size_t blocks, const void *key,
21 const unsigned char ivec[16]);
22
23typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
24 size_t blocks, const void *key,
25 const unsigned char ivec[16],unsigned char cmac[16]);
26
27void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
28 size_t len, const void *key,
29 unsigned char ivec[16], block128_f block);
30void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
31 size_t len, const void *key,
32 unsigned char ivec[16], block128_f block);
33
34void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
35 size_t len, const void *key,
36 unsigned char ivec[16], unsigned char ecount_buf[16],
37 unsigned int *num, block128_f block);
38
39void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
40 size_t len, const void *key,
41 unsigned char ivec[16], unsigned char ecount_buf[16],
42 unsigned int *num, ctr128_f ctr);
43
44void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
45 size_t len, const void *key,
46 unsigned char ivec[16], int *num,
47 block128_f block);
48
49void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
50 size_t len, const void *key,
51 unsigned char ivec[16], int *num,
52 int enc, block128_f block);
53void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
54 size_t length, const void *key,
55 unsigned char ivec[16], int *num,
56 int enc, block128_f block);
57void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
58 size_t bits, const void *key,
59 unsigned char ivec[16], int *num,
60 int enc, block128_f block);
61
62size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
63 size_t len, const void *key,
64 unsigned char ivec[16], block128_f block);
65size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
66 size_t len, const void *key,
67 unsigned char ivec[16], cbc128_f cbc);
68size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
69 size_t len, const void *key,
70 unsigned char ivec[16], block128_f block);
71size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
72 size_t len, const void *key,
73 unsigned char ivec[16], cbc128_f cbc);
74
75size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
76 size_t len, const void *key,
77 unsigned char ivec[16], block128_f block);
78size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
79 size_t len, const void *key,
80 unsigned char ivec[16], cbc128_f cbc);
81size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
82 size_t len, const void *key,
83 unsigned char ivec[16], block128_f block);
84size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
85 size_t len, const void *key,
86 unsigned char ivec[16], cbc128_f cbc);
87
88typedef struct gcm128_context GCM128_CONTEXT;
89
90GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
91void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block);
92void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
93 size_t len);
94int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
95 size_t len);
96int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
97 const unsigned char *in, unsigned char *out,
98 size_t len);
99int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
100 const unsigned char *in, unsigned char *out,
101 size_t len);
102int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
103 const unsigned char *in, unsigned char *out,
104 size_t len, ctr128_f stream);
105int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
106 const unsigned char *in, unsigned char *out,
107 size_t len, ctr128_f stream);
108int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
109 size_t len);
110void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
111void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
112
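The GCM entry points above are meant to be called in a fixed sequence: new (or init), setiv, optionally aad, then encrypt or decrypt, and finally tag (when sealing) or finish (when verifying). A hedged sketch of one authenticated-encryption pass, reusing the illustrative aes_block128() wrapper and omitting most error handling:

static int gcm_seal_example(AES_KEY *aes,
    const unsigned char *iv, size_t ivlen,
    const unsigned char *aad, size_t aadlen,
    const unsigned char *pt, unsigned char *ct, size_t len,
    unsigned char tag[16])
{
	GCM128_CONTEXT *ctx;

	if ((ctx = CRYPTO_gcm128_new(aes, aes_block128)) == NULL)
		return -1;
	CRYPTO_gcm128_setiv(ctx, iv, ivlen);
	if ((aadlen && CRYPTO_gcm128_aad(ctx, aad, aadlen)) ||
	    CRYPTO_gcm128_encrypt(ctx, pt, ct, len)) {
		CRYPTO_gcm128_release(ctx);
		return -1;
	}
	CRYPTO_gcm128_tag(ctx, tag, 16);	/* caller transmits the 16-byte tag */
	CRYPTO_gcm128_release(ctx);
	return 0;
}

On the decrypting side the same sequence ends with CRYPTO_gcm128_finish(), which returns non-zero when the supplied tag does not match the computed one.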
113typedef struct ccm128_context CCM128_CONTEXT;
114
115void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
116 unsigned int M, unsigned int L, void *key,block128_f block);
117int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
118 const unsigned char *nonce, size_t nlen, size_t mlen);
119void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
120 const unsigned char *aad, size_t alen);
121int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
122 const unsigned char *inp, unsigned char *out, size_t len);
123int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
124 const unsigned char *inp, unsigned char *out, size_t len);
125int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
126 const unsigned char *inp, unsigned char *out, size_t len,
127 ccm128_f stream);
128int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
129 const unsigned char *inp, unsigned char *out, size_t len,
130 ccm128_f stream);
131size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
132
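CCM, unlike GCM, fixes the tag length M and the length-field width L at init time, and the total message length must be declared up front via setiv (the nonce length and L add up to 15). A hedged sketch of the call order; it assumes the struct definition from modes_lcl.h (further down in this diff) is visible so the context can live on the stack, and it reuses the illustrative aes_block128() wrapper:

static int ccm_seal_example(AES_KEY *aes,
    const unsigned char *nonce, size_t nlen,	/* e.g. 13 bytes -> L = 2 */
    const unsigned char *aad, size_t alen,
    const unsigned char *pt, unsigned char *ct, size_t len,
    unsigned char tag[16])
{
	CCM128_CONTEXT ctx;

	/* M = 16-byte tag, L = 15 - nlen bytes of message-length field. */
	CRYPTO_ccm128_init(&ctx, 16, 15 - nlen, aes, aes_block128);
	if (CRYPTO_ccm128_setiv(&ctx, nonce, nlen, len))
		return -1;
	if (alen)
		CRYPTO_ccm128_aad(&ctx, aad, alen);
	if (CRYPTO_ccm128_encrypt(&ctx, pt, ct, len))
		return -1;
	return CRYPTO_ccm128_tag(&ctx, tag, 16) ? 0 : -1;
}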
133typedef struct xts128_context XTS128_CONTEXT;
134
135int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
136 const unsigned char *inp, unsigned char *out, size_t len, int enc);
diff --git a/src/lib/libcrypto/modes/modes_lcl.h b/src/lib/libcrypto/modes/modes_lcl.h
deleted file mode 100644
index 8e43e480fc..0000000000
--- a/src/lib/libcrypto/modes/modes_lcl.h
+++ /dev/null
@@ -1,108 +0,0 @@
1/* $OpenBSD: modes_lcl.h,v 1.8 2014/07/10 22:45:57 jsing Exp $ */
2/* ====================================================================
3 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use is governed by OpenSSL license.
6 * ====================================================================
7 */
8
9#include <machine/endian.h>
10
11#include <openssl/opensslconf.h>
12
13#include <openssl/modes.h>
14
15#if defined(_LP64)
16typedef long i64;
17typedef unsigned long u64;
18#define U64(C) C##UL
19#else
20typedef long long i64;
21typedef unsigned long long u64;
22#define U64(C) C##ULL
23#endif
24
25typedef unsigned int u32;
26typedef unsigned char u8;
27
28#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
29#if defined(__GNUC__) && __GNUC__>=2
30# if defined(__x86_64) || defined(__x86_64__)
31# define BSWAP8(x) ({ u64 ret=(x); \
32 asm ("bswapq %0" \
33 : "+r"(ret)); ret; })
34# define BSWAP4(x) ({ u32 ret=(x); \
35 asm ("bswapl %0" \
36 : "+r"(ret)); ret; })
37# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
38# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
39 asm ("bswapl %0; bswapl %1" \
40 : "+r"(hi),"+r"(lo)); \
41 (u64)hi<<32|lo; })
42# define BSWAP4(x) ({ u32 ret=(x); \
43 asm ("bswapl %0" \
44 : "+r"(ret)); ret; })
45# elif (defined(__arm__) || defined(__arm)) && !defined(__STRICT_ALIGNMENT)
46# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \
47 asm ("rev %0,%0; rev %1,%1" \
48 : "+r"(hi),"+r"(lo)); \
49 (u64)hi<<32|lo; })
50# define BSWAP4(x) ({ u32 ret; \
51 asm ("rev %0,%1" \
52 : "=r"(ret) : "r"((u32)(x))); \
53 ret; })
54# endif
55#endif
56#endif
57
58#if defined(BSWAP4) && !defined(__STRICT_ALIGNMENT)
59#define GETU32(p) BSWAP4(*(const u32 *)(p))
60#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
61#else
62#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
63#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
64#endif
65
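Both branches above implement the same contract: GETU32 loads and PUTU32 stores a 32-bit value in big-endian byte order irrespective of host endianness; the BSWAP4 branch is merely the single-instruction version for machines that tolerate unaligned word accesses. A hedged restatement of the portable fallback as functions, using the u32/u8 typedefs from this header:

/* Big-endian 32-bit load/store, equivalent to the portable
 * GETU32/PUTU32 definitions above. */
static inline u32
getu32_be(const u8 *p)
{
	return (u32)p[0] << 24 | (u32)p[1] << 16 | (u32)p[2] << 8 | p[3];
}

static inline void
putu32_be(u8 *p, u32 v)
{
	p[0] = (u8)(v >> 24);
	p[1] = (u8)(v >> 16);
	p[2] = (u8)(v >> 8);
	p[3] = (u8)v;
}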
66/* GCM definitions */
67
68typedef struct { u64 hi,lo; } u128;
69
70#ifdef TABLE_BITS
71#undef TABLE_BITS
72#endif
73/*
74 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
75 * never be set to 8 [or 1]. For further information see gcm128.c.
76 */
77#define TABLE_BITS 4
78
79struct gcm128_context {
80 /* The following 6 names match the names used in the GCM specification */
81 union { u64 u[2]; u32 d[4]; u8 c[16]; size_t t[16/sizeof(size_t)]; }
82 Yi,EKi,EK0,len,Xi,H;
83 /* Relative position of Xi, H and pre-computed Htable is used
84 * in some assembler modules, i.e. don't change the order! */
85#if TABLE_BITS==8
86 u128 Htable[256];
87#else
88 u128 Htable[16];
89 void (*gmult)(u64 Xi[2],const u128 Htable[16]);
90 void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
91#endif
92 unsigned int mres, ares;
93 block128_f block;
94 void *key;
95};
96
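With TABLE_BITS fixed at 4 the per-key footprint is small: Htable is 16 u128 entries, i.e. 16 * 16 = 256 bytes, preceded by the six 16-byte state blocks named after the GCM specification. A hedged compile-time sketch of the layout assumptions the comment above alludes to, written with the negative-array-size trick so it needs only <stddef.h>:

#include <stddef.h>

/* Fails to compile if Xi and H are not adjacent 16-byte blocks,
 * or if the 4-bit Htable is not exactly 256 bytes. */
typedef char gcm_xi_h_adjacent_check[
    offsetof(struct gcm128_context, H) ==
    offsetof(struct gcm128_context, Xi) + 16 ? 1 : -1];
typedef char gcm_htable_size_check[
    sizeof(((struct gcm128_context *)0)->Htable) == 256 ? 1 : -1];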
97struct xts128_context {
98 void *key1, *key2;
99 block128_f block1,block2;
100};
101
102struct ccm128_context {
103 union { u64 u[2]; u8 c[16]; } nonce, cmac;
104 u64 blocks;
105 block128_f block;
106 void *key;
107};
108
diff --git a/src/lib/libcrypto/modes/ofb128.c b/src/lib/libcrypto/modes/ofb128.c
deleted file mode 100644
index 1b8a6fd500..0000000000
--- a/src/lib/libcrypto/modes/ofb128.c
+++ /dev/null
@@ -1,119 +0,0 @@
1/* $OpenBSD: ofb128.c,v 1.4 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 *
50 */
51
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61
62/* The input and output are encrypted as though 128-bit OFB mode is
63 * being used. The extra state information recording how much of the
64 * 128-bit block we have used is contained in *num.
65 */
66void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
67 size_t len, const void *key,
68 unsigned char ivec[16], int *num,
69 block128_f block)
70{
71 unsigned int n;
72 size_t l=0;
73
74 n = *num;
75
76#if !defined(OPENSSL_SMALL_FOOTPRINT)
77 if (16%sizeof(size_t) == 0) do { /* always true actually */
78 while (n && len) {
79 *(out++) = *(in++) ^ ivec[n];
80 --len;
81 n = (n+1) % 16;
82 }
83#ifdef __STRICT_ALIGNMENT
84 if (((size_t)in|(size_t)out|(size_t)ivec)%sizeof(size_t) != 0)
85 break;
86#endif
87 while (len>=16) {
88 (*block)(ivec, ivec, key);
89 for (; n<16; n+=sizeof(size_t))
90 *(size_t*)(out+n) =
91 *(size_t*)(in+n) ^ *(size_t*)(ivec+n);
92 len -= 16;
93 out += 16;
94 in += 16;
95 n = 0;
96 }
97 if (len) {
98 (*block)(ivec, ivec, key);
99 while (len--) {
100 out[n] = in[n] ^ ivec[n];
101 ++n;
102 }
103 }
104 *num = n;
105 return;
106 } while(0);
107 /* the rest is commonly eliminated by x86* compilers */
108#endif
109 while (l<len) {
110 if (n==0) {
111 (*block)(ivec, ivec, key);
112 }
113 out[l] = in[l] ^ ivec[n];
114 ++l;
115 n = (n+1) % 16;
116 }
117
118 *num=n;
119}
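Like the CTR routine, CRYPTO_ofb128_encrypt() is restartable: after each block-cipher call the keystream block lives in ivec, and *num records how many of its bytes have already been XORed out, so splitting a message across calls yields the same output as a single call. A hedged usage sketch, again with the illustrative aes_block128() wrapper:

static void ofb_stream_example(AES_KEY *aes, unsigned char ivec[16],
    const unsigned char *in, unsigned char *out, size_t len)
{
	int num = 0;	/* bytes of the current keystream block already used */

	/* Two calls, one result: ivec and num carry the state across. */
	CRYPTO_ofb128_encrypt(in, out, len / 2, aes, ivec, &num, aes_block128);
	CRYPTO_ofb128_encrypt(in + len / 2, out + len / 2, len - len / 2,
	    aes, ivec, &num, aes_block128);
}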
diff --git a/src/lib/libcrypto/modes/xts128.c b/src/lib/libcrypto/modes/xts128.c
deleted file mode 100644
index 3e2378379e..0000000000
--- a/src/lib/libcrypto/modes/xts128.c
+++ /dev/null
@@ -1,187 +0,0 @@
1/* $OpenBSD: xts128.c,v 1.6 2015/02/10 09:46:30 miod Exp $ */
2/* ====================================================================
3 * Copyright (c) 2011 The OpenSSL Project. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * 3. All advertising materials mentioning features or use of this
18 * software must display the following acknowledgment:
19 * "This product includes software developed by the OpenSSL Project
20 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 *
22 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
23 * endorse or promote products derived from this software without
24 * prior written permission. For written permission, please contact
25 * openssl-core@openssl.org.
26 *
27 * 5. Products derived from this software may not be called "OpenSSL"
28 * nor may "OpenSSL" appear in their names without prior written
29 * permission of the OpenSSL Project.
30 *
31 * 6. Redistributions of any form whatsoever must retain the following
32 * acknowledgment:
33 * "This product includes software developed by the OpenSSL Project
34 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 *
36 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
37 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
38 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
39 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
42 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
43 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
44 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
45 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
46 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
47 * OF THE POSSIBILITY OF SUCH DAMAGE.
48 * ====================================================================
49 */
50
51#include <machine/endian.h>
52#include <openssl/crypto.h>
53#include "modes_lcl.h"
54#include <string.h>
55
56#ifndef MODES_DEBUG
57# ifndef NDEBUG
58# define NDEBUG
59# endif
60#endif
61
62int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
63 const unsigned char *inp, unsigned char *out,
64 size_t len, int enc)
65{
66 union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
67 unsigned int i;
68
69 if (len<16) return -1;
70
71 memcpy(tweak.c, iv, 16);
72
73 (*ctx->block2)(tweak.c,tweak.c,ctx->key2);
74
75 if (!enc && (len%16)) len-=16;
76
77 while (len>=16) {
78#ifdef __STRICT_ALIGNMENT
79 memcpy(scratch.c,inp,16);
80 scratch.u[0] ^= tweak.u[0];
81 scratch.u[1] ^= tweak.u[1];
82#else
83 scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
84 scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
85#endif
86 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
87#ifdef __STRICT_ALIGNMENT
88 scratch.u[0] ^= tweak.u[0];
89 scratch.u[1] ^= tweak.u[1];
90 memcpy(out,scratch.c,16);
91#else
92 ((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
93 ((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
94#endif
95 inp += 16;
96 out += 16;
97 len -= 16;
98
99 if (len==0) return 0;
100
101 if (BYTE_ORDER == LITTLE_ENDIAN) {
102 unsigned int carry,res;
103
104 res = 0x87&(((int)tweak.d[3])>>31);
105 carry = (unsigned int)(tweak.u[0]>>63);
106 tweak.u[0] = (tweak.u[0]<<1)^res;
107 tweak.u[1] = (tweak.u[1]<<1)|carry;
108 }
109 else {
110 size_t c;
111
112 for (c=0,i=0;i<16;++i) {
113 /*+ substitutes for |, because c is 1 bit */
114 c += ((size_t)tweak.c[i])<<1;
115 tweak.c[i] = (u8)c;
116 c = c>>8;
117 }
118 tweak.c[0] ^= (u8)(0x87&(0-c));
119 }
120 }
121 if (enc) {
122 for (i=0;i<len;++i) {
123 u8 c = inp[i];
124 out[i] = scratch.c[i];
125 scratch.c[i] = c;
126 }
127 scratch.u[0] ^= tweak.u[0];
128 scratch.u[1] ^= tweak.u[1];
129 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
130 scratch.u[0] ^= tweak.u[0];
131 scratch.u[1] ^= tweak.u[1];
132 memcpy(out-16,scratch.c,16);
133 }
134 else {
135 union { u64 u[2]; u8 c[16]; } tweak1;
136
137 if (BYTE_ORDER == LITTLE_ENDIAN) {
138 unsigned int carry,res;
139
140 res = 0x87&(((int)tweak.d[3])>>31);
141 carry = (unsigned int)(tweak.u[0]>>63);
142 tweak1.u[0] = (tweak.u[0]<<1)^res;
143 tweak1.u[1] = (tweak.u[1]<<1)|carry;
144 }
145 else {
146 size_t c;
147
148 for (c=0,i=0;i<16;++i) {
149 /*+ substitutes for |, because c is 1 bit */
150 c += ((size_t)tweak.c[i])<<1;
151 tweak1.c[i] = (u8)c;
152 c = c>>8;
153 }
154 tweak1.c[0] ^= (u8)(0x87&(0-c));
155 }
156#ifdef __STRICT_ALIGNMENT
157 memcpy(scratch.c,inp,16);
158 scratch.u[0] ^= tweak1.u[0];
159 scratch.u[1] ^= tweak1.u[1];
160#else
161 scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
162 scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
163#endif
164 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
165 scratch.u[0] ^= tweak1.u[0];
166 scratch.u[1] ^= tweak1.u[1];
167
168 for (i=0;i<len;++i) {
169 u8 c = inp[16+i];
170 out[16+i] = scratch.c[i];
171 scratch.c[i] = c;
172 }
173 scratch.u[0] ^= tweak.u[0];
174 scratch.u[1] ^= tweak.u[1];
175 (*ctx->block1)(scratch.c,scratch.c,ctx->key1);
176#ifdef __STRICT_ALIGNMENT
177 scratch.u[0] ^= tweak.u[0];
178 scratch.u[1] ^= tweak.u[1];
179 memcpy (out,scratch.c,16);
180#else
181 ((u64*)out)[0] = scratch.u[0]^tweak.u[0];
182 ((u64*)out)[1] = scratch.u[1]^tweak.u[1];
183#endif
184 }
185
186 return 0;
187}
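The only non-obvious arithmetic in this file is the tweak update between blocks: the 128-bit tweak is multiplied by x in GF(2^128) under the XTS polynomial x^128 + x^7 + x^2 + x + 1, which amounts to a 1-bit left shift plus a conditional XOR of 0x87 into the low byte. In the little-endian branch, 0x87&(((int)tweak.d[3])>>31) is an arithmetic-shift way of saying "0x87 if the top bit is set, else 0". A hedged standalone restatement of that step (not part of the original file):

#include <stdint.h>

/* Multiply a 128-bit XTS tweak by x in GF(2^128), with lo holding
 * bits 0..63 and hi holding bits 64..127, mirroring the LITTLE_ENDIAN
 * branch of CRYPTO_xts128_encrypt() above. */
static void
xts_double_tweak(uint64_t *hi, uint64_t *lo)
{
	uint64_t res = 0x87 & (uint64_t)(0 - (*hi >> 63));	/* 0x87 or 0 */
	uint64_t carry = *lo >> 63;	/* bit 63 moves into the high word */

	*hi = (*hi << 1) | carry;
	*lo = (*lo << 1) ^ res;
}

The big-endian branch computes the same product byte by byte, propagating the carry from tweak.c[0] up through tweak.c[15] and folding 0x87 back into tweak.c[0] at the end.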