From eb8dd9dca1228af0cd132f515509051ecfabf6f6 Mon Sep 17 00:00:00 2001
From: cvs2svn <admin@example.com>
Date: Mon, 14 Apr 2025 17:32:06 +0000
Subject: This commit was manufactured by cvs2git to create tag 'tb_20250414'.

---
 src/lib/libcrypto/modes/asm/ghash-alpha.pl   |  444 ---------
 src/lib/libcrypto/modes/asm/ghash-armv4.pl   |  430 --------
 src/lib/libcrypto/modes/asm/ghash-parisc.pl  |  740 --------------
 src/lib/libcrypto/modes/asm/ghash-sparcv9.pl |  351 -------
 src/lib/libcrypto/modes/asm/ghash-x86.pl     | 1326 -------------------------
 src/lib/libcrypto/modes/asm/ghash-x86_64.pl  |  812 ---------------
 src/lib/libcrypto/modes/cbc128.c             |  214 ----
 src/lib/libcrypto/modes/ccm128.c             |  498 ----------
 src/lib/libcrypto/modes/cfb128.c             |  251 -----
 src/lib/libcrypto/modes/ctr128.c             |  267 -----
 src/lib/libcrypto/modes/gcm128.c             | 1358 --------------------------
 src/lib/libcrypto/modes/modes.h              |  118 ---
 src/lib/libcrypto/modes/modes_local.h        |  121 ---
 src/lib/libcrypto/modes/ofb128.c             |  124 ---
 src/lib/libcrypto/modes/xts128.c             |  197 ----
 15 files changed, 7251 deletions(-)
 delete mode 100644 src/lib/libcrypto/modes/asm/ghash-alpha.pl
 delete mode 100644 src/lib/libcrypto/modes/asm/ghash-armv4.pl
 delete mode 100644 src/lib/libcrypto/modes/asm/ghash-parisc.pl
 delete mode 100644 src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
 delete mode 100644 src/lib/libcrypto/modes/asm/ghash-x86.pl
 delete mode 100644 src/lib/libcrypto/modes/asm/ghash-x86_64.pl
 delete mode 100644 src/lib/libcrypto/modes/cbc128.c
 delete mode 100644 src/lib/libcrypto/modes/ccm128.c
 delete mode 100644 src/lib/libcrypto/modes/cfb128.c
 delete mode 100644 src/lib/libcrypto/modes/ctr128.c
 delete mode 100644 src/lib/libcrypto/modes/gcm128.c
 delete mode 100644 src/lib/libcrypto/modes/modes.h
 delete mode 100644 src/lib/libcrypto/modes/modes_local.h
 delete mode 100644 src/lib/libcrypto/modes/ofb128.c
 delete mode 100644 src/lib/libcrypto/modes/xts128.c

(limited to 'src/lib/libcrypto/modes')

diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl
deleted file mode 100644
index 9d847006c4..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-alpha.pl
+++ /dev/null
@@ -1,444 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# March 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+128 bytes shared table]. Even though
-# loops are aggressively modulo-scheduled in respect to references to
-# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
-# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
-# scheduling "glitch," because uprofile(1) indicates uniform sample
-# distribution, as if all instruction bundles execute in 1.5 cycles.
-# Meaning that it could have been even faster, yet 12 cycles is ~60%
-# better than gcc-generated code and ~80% than code generated by vendor
-# compiler.
-
-$cnt="v0";	# $0
-$t0="t0";
-$t1="t1";
-$t2="t2";
-$Thi0="t3";	# $4
-$Tlo0="t4";
-$Thi1="t5";
-$Tlo1="t6";
-$rem="t7";	# $8
-#################
-$Xi="a0";	# $16, input argument block
-$Htbl="a1";
-$inp="a2";
-$len="a3";
-$nlo="a4";	# $20
-$nhi="a5";
-$Zhi="t8";
-$Zlo="t9";
-$Xhi="t10";	# $24
-$Xlo="t11";
-$remp="t12";
-$rem_4bit="AT";	# $28
-
-{ my $N;
-  sub loop() {
-
-	$N++;
-$code.=<<___;
-.align	4
-	extbl	$Xlo,7,$nlo
-	and	$nlo,0xf0,$nhi
-	sll	$nlo,4,$nlo
-	and	$nlo,0xf0,$nlo
-
-	addq	$nlo,$Htbl,$nlo
-	ldq	$Zlo,8($nlo)
-	addq	$nhi,$Htbl,$nhi
-	ldq	$Zhi,0($nlo)
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	lda	$cnt,6(zero)
-	extbl	$Xlo,6,$nlo
-
-	ldq	$Tlo1,8($nhi)
-	s8addq	$remp,$rem_4bit,$remp
-	ldq	$Thi1,0($nhi)
-	srl	$Zlo,4,$Zlo
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$t0,$Zlo,$Zlo
-	and	$nlo,0xf0,$nhi
-
-	xor	$Tlo1,$Zlo,$Zlo
-	sll	$nlo,4,$nlo
-	xor	$Thi1,$Zhi,$Zhi
-	and	$nlo,0xf0,$nlo
-
-	addq	$nlo,$Htbl,$nlo
-	ldq	$Tlo0,8($nlo)
-	addq	$nhi,$Htbl,$nhi
-	ldq	$Thi0,0($nlo)
-
-.Looplo$N:
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	subq	$cnt,1,$cnt
-	srl	$Zlo,4,$Zlo
-
-	ldq	$Tlo1,8($nhi)
-	xor	$rem,$Zhi,$Zhi
-	ldq	$Thi1,0($nhi)
-	s8addq	$remp,$rem_4bit,$remp
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$t0,$Zlo,$Zlo
-	extbl	$Xlo,$cnt,$nlo
-
-	and	$nlo,0xf0,$nhi
-	xor	$Thi0,$Zhi,$Zhi
-	xor	$Tlo0,$Zlo,$Zlo
-	sll	$nlo,4,$nlo
-
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	and	$nlo,0xf0,$nlo
-	srl	$Zlo,4,$Zlo
-
-	s8addq	$remp,$rem_4bit,$remp
-	xor	$rem,$Zhi,$Zhi
-	addq	$nlo,$Htbl,$nlo
-	addq	$nhi,$Htbl,$nhi
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	ldq	$Tlo0,8($nlo)
-	xor	$t0,$Zlo,$Zlo
-
-	xor	$Tlo1,$Zlo,$Zlo
-	xor	$Thi1,$Zhi,$Zhi
-	ldq	$Thi0,0($nlo)
-	bne	$cnt,.Looplo$N
-
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	lda	$cnt,7(zero)
-	srl	$Zlo,4,$Zlo
-
-	ldq	$Tlo1,8($nhi)
-	xor	$rem,$Zhi,$Zhi
-	ldq	$Thi1,0($nhi)
-	s8addq	$remp,$rem_4bit,$remp
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$t0,$Zlo,$Zlo
-	extbl	$Xhi,$cnt,$nlo
-
-	and	$nlo,0xf0,$nhi
-	xor	$Thi0,$Zhi,$Zhi
-	xor	$Tlo0,$Zlo,$Zlo
-	sll	$nlo,4,$nlo
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	and	$nlo,0xf0,$nlo
-	srl	$Zlo,4,$Zlo
-
-	s8addq	$remp,$rem_4bit,$remp
-	xor	$rem,$Zhi,$Zhi
-	addq	$nlo,$Htbl,$nlo
-	addq	$nhi,$Htbl,$nhi
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	ldq	$Tlo0,8($nlo)
-	xor	$t0,$Zlo,$Zlo
-
-	xor	$Tlo1,$Zlo,$Zlo
-	xor	$Thi1,$Zhi,$Zhi
-	ldq	$Thi0,0($nlo)
-	unop
-
-
-.Loophi$N:
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	subq	$cnt,1,$cnt
-	srl	$Zlo,4,$Zlo
-
-	ldq	$Tlo1,8($nhi)
-	xor	$rem,$Zhi,$Zhi
-	ldq	$Thi1,0($nhi)
-	s8addq	$remp,$rem_4bit,$remp
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$t0,$Zlo,$Zlo
-	extbl	$Xhi,$cnt,$nlo
-
-	and	$nlo,0xf0,$nhi
-	xor	$Thi0,$Zhi,$Zhi
-	xor	$Tlo0,$Zlo,$Zlo
-	sll	$nlo,4,$nlo
-
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	and	$nlo,0xf0,$nlo
-	srl	$Zlo,4,$Zlo
-
-	s8addq	$remp,$rem_4bit,$remp
-	xor	$rem,$Zhi,$Zhi
-	addq	$nlo,$Htbl,$nlo
-	addq	$nhi,$Htbl,$nhi
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	ldq	$Tlo0,8($nlo)
-	xor	$t0,$Zlo,$Zlo
-
-	xor	$Tlo1,$Zlo,$Zlo
-	xor	$Thi1,$Zhi,$Zhi
-	ldq	$Thi0,0($nlo)
-	bne	$cnt,.Loophi$N
-
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	srl	$Zlo,4,$Zlo
-
-	ldq	$Tlo1,8($nhi)
-	xor	$rem,$Zhi,$Zhi
-	ldq	$Thi1,0($nhi)
-	s8addq	$remp,$rem_4bit,$remp
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$t0,$Zlo,$Zlo
-
-	xor	$Tlo0,$Zlo,$Zlo
-	xor	$Thi0,$Zhi,$Zhi
-
-	and	$Zlo,0x0f,$remp
-	sll	$Zhi,60,$t0
-	srl	$Zlo,4,$Zlo
-
-	s8addq	$remp,$rem_4bit,$remp
-	xor	$rem,$Zhi,$Zhi
-
-	ldq	$rem,0($remp)
-	srl	$Zhi,4,$Zhi
-	xor	$Tlo1,$Zlo,$Zlo
-	xor	$Thi1,$Zhi,$Zhi
-	xor	$t0,$Zlo,$Zlo
-	xor	$rem,$Zhi,$Zhi
-___
-}}
-
-$code=<<___;
-#include <machine/asm.h>
-
-.text
-
-.set	noat
-.set	noreorder
-.globl	gcm_gmult_4bit
-.align	4
-.ent	gcm_gmult_4bit
-gcm_gmult_4bit:
-	.frame	sp,0,ra
-	.prologue 0
-
-	ldq	$Xlo,8($Xi)
-	ldq	$Xhi,0($Xi)
-
-	lda	$rem_4bit,rem_4bit
-___
-
-	&loop();
-
-$code.=<<___;
-	srl	$Zlo,24,$t0	# byte swap
-	srl	$Zlo,8,$t1
-
-	sll	$Zlo,8,$t2
-	sll	$Zlo,24,$Zlo
-	zapnot	$t0,0x11,$t0
-	zapnot	$t1,0x22,$t1
-
-	zapnot	$Zlo,0x88,$Zlo
-	or	$t0,$t1,$t0
-	zapnot	$t2,0x44,$t2
-
-	or	$Zlo,$t0,$Zlo
-	srl	$Zhi,24,$t0
-	srl	$Zhi,8,$t1
-
-	or	$Zlo,$t2,$Zlo
-	sll	$Zhi,8,$t2
-	sll	$Zhi,24,$Zhi
-
-	srl	$Zlo,32,$Xlo
-	sll	$Zlo,32,$Zlo
-
-	zapnot	$t0,0x11,$t0
-	zapnot	$t1,0x22,$t1
-	or	$Zlo,$Xlo,$Xlo
-
-	zapnot	$Zhi,0x88,$Zhi
-	or	$t0,$t1,$t0
-	zapnot	$t2,0x44,$t2
-
-	or	$Zhi,$t0,$Zhi
-	or	$Zhi,$t2,$Zhi
-
-	srl	$Zhi,32,$Xhi
-	sll	$Zhi,32,$Zhi
-
-	or	$Zhi,$Xhi,$Xhi
-	stq	$Xlo,8($Xi)
-	stq	$Xhi,0($Xi)
-
-	ret	(ra)
-.end	gcm_gmult_4bit
-___
-
-$inhi="s0";
-$inlo="s1";
-
-$code.=<<___;
-.globl	gcm_ghash_4bit
-.align	4
-.ent	gcm_ghash_4bit
-gcm_ghash_4bit:
-	lda	sp,-32(sp)
-	stq	ra,0(sp)
-	stq	s0,8(sp)
-	stq	s1,16(sp)
-	.mask	0x04000600,-32
-	.frame	sp,32,ra
-	.prologue 0
-
-	ldq_u	$inhi,0($inp)
-	ldq_u	$Thi0,7($inp)
-	ldq_u	$inlo,8($inp)
-	ldq_u	$Tlo0,15($inp)
-	ldq	$Xhi,0($Xi)
-	ldq	$Xlo,8($Xi)
-
-	lda	$rem_4bit,rem_4bit
-
-.Louter:
-	extql	$inhi,$inp,$inhi
-	extqh	$Thi0,$inp,$Thi0
-	or	$inhi,$Thi0,$inhi
-	lda	$inp,16($inp)
-
-	extql	$inlo,$inp,$inlo
-	extqh	$Tlo0,$inp,$Tlo0
-	or	$inlo,$Tlo0,$inlo
-	subq	$len,16,$len
-
-	xor	$Xlo,$inlo,$Xlo
-	xor	$Xhi,$inhi,$Xhi
-___
-
-	&loop();
-
-$code.=<<___;
-	srl	$Zlo,24,$t0	# byte swap
-	srl	$Zlo,8,$t1
-
-	sll	$Zlo,8,$t2
-	sll	$Zlo,24,$Zlo
-	zapnot	$t0,0x11,$t0
-	zapnot	$t1,0x22,$t1
-
-	zapnot	$Zlo,0x88,$Zlo
-	or	$t0,$t1,$t0
-	zapnot	$t2,0x44,$t2
-
-	or	$Zlo,$t0,$Zlo
-	srl	$Zhi,24,$t0
-	srl	$Zhi,8,$t1
-
-	or	$Zlo,$t2,$Zlo
-	sll	$Zhi,8,$t2
-	sll	$Zhi,24,$Zhi
-
-	srl	$Zlo,32,$Xlo
-	sll	$Zlo,32,$Zlo
-	beq	$len,.Ldone
-
-	zapnot	$t0,0x11,$t0
-	zapnot	$t1,0x22,$t1
-	or	$Zlo,$Xlo,$Xlo
-	ldq_u	$inhi,0($inp)
-
-	zapnot	$Zhi,0x88,$Zhi
-	or	$t0,$t1,$t0
-	zapnot	$t2,0x44,$t2
-	ldq_u	$Thi0,7($inp)
-
-	or	$Zhi,$t0,$Zhi
-	or	$Zhi,$t2,$Zhi
-	ldq_u	$inlo,8($inp)
-	ldq_u	$Tlo0,15($inp)
-
-	srl	$Zhi,32,$Xhi
-	sll	$Zhi,32,$Zhi
-
-	or	$Zhi,$Xhi,$Xhi
-	br	zero,.Louter
-
-.Ldone:
-	zapnot	$t0,0x11,$t0
-	zapnot	$t1,0x22,$t1
-	or	$Zlo,$Xlo,$Xlo
-
-	zapnot	$Zhi,0x88,$Zhi
-	or	$t0,$t1,$t0
-	zapnot	$t2,0x44,$t2
-
-	or	$Zhi,$t0,$Zhi
-	or	$Zhi,$t2,$Zhi
-
-	srl	$Zhi,32,$Xhi
-	sll	$Zhi,32,$Zhi
-
-	or	$Zhi,$Xhi,$Xhi
-
-	stq	$Xlo,8($Xi)
-	stq	$Xhi,0($Xi)
-
-	.set	noreorder
-	/*ldq	ra,0(sp)*/
-	ldq	s0,8(sp)
-	ldq	s1,16(sp)
-	lda	sp,32(sp)
-	ret	(ra)
-.end	gcm_ghash_4bit
-
-	.section .rodata
-	.align	4
-rem_4bit:
-	.long	0,0x0000<<16, 0,0x1C20<<16, 0,0x3840<<16, 0,0x2460<<16
-	.long	0,0x7080<<16, 0,0x6CA0<<16, 0,0x48C0<<16, 0,0x54E0<<16
-	.long	0,0xE100<<16, 0,0xFD20<<16, 0,0xD940<<16, 0,0xC560<<16
-	.long	0,0x9180<<16, 0,0x8DA0<<16, 0,0xA9C0<<16, 0,0xB5E0<<16
-	.previous
-
-___
-$output=shift and open STDOUT,">$output";
-print $code;
-close STDOUT;
-
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
deleted file mode 100644
index 2d57806b46..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-armv4.pl
+++ /dev/null
@@ -1,430 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# April 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+32 bytes shared table]. There is no
-# experimental performance data available yet. The only approximation
-# that can be made at this point is based on code size. Inner loop is
-# 32 instructions long and on single-issue core should execute in <40
-# cycles. Having verified that gcc 3.4 didn't unroll corresponding
-# loop, this assembler loop body was found to be ~3x smaller than
-# compiler-generated one...
-#
-# July 2010
-#
-# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
-# Cortex A8 core and ~25 cycles per processed byte (which was observed
-# to be ~3 times faster than gcc-generated code:-)
-#
-# February 2011
-#
-# Profiler-assisted and platform-specific optimization resulted in 7%
-# improvement on Cortex A8 core and ~23.5 cycles per byte.
-#
-# March 2011
-#
-# Add NEON implementation featuring polynomial multiplication, i.e. no
-# lookup tables involved. On Cortex A8 it was measured to process one
-# byte in 15 cycles or 55% faster than integer-only code.
-
-# ====================================================================
-# Note about "528B" variant. In ARM case it makes lesser sense to
-# implement it for following reasons:
-#
-# - performance improvement won't be anywhere near 50%, because 128-
-#   bit shift operation is neatly fused with 128-bit xor here, and
-#   "538B" variant would eliminate only 4-5 instructions out of 32
-#   in the inner loop (meaning that estimated improvement is ~15%);
-# - ARM-based systems are often embedded ones and extra memory
-#   consumption might be unappreciated (for so little improvement);
-#
-# Byte order [in]dependence. =========================================
-#
-# Caller is expected to maintain specific *dword* order in Htable,
-# namely with *least* significant dword of 128-bit value at *lower*
-# address. This differs completely from C code and has everything to
-# do with ldm instruction and order in which dwords are "consumed" by
-# algorithm. *Byte* order within these dwords in turn is whatever
-# *native* byte order on current platform. See gcm128.c for working
-# example...
-
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-$Xi="r0";	# argument block
-$Htbl="r1";
-$inp="r2";
-$len="r3";
-
-$Zll="r4";	# variables
-$Zlh="r5";
-$Zhl="r6";
-$Zhh="r7";
-$Tll="r8";
-$Tlh="r9";
-$Thl="r10";
-$Thh="r11";
-$nlo="r12";
-################# r13 is stack pointer
-$nhi="r14";
-################# r15 is program counter
-
-$rem_4bit=$inp;	# used in gcm_gmult_4bit
-$cnt=$len;
-
-sub Zsmash() {
-  my $i=12;
-  my @args=@_;
-  for ($Zll,$Zlh,$Zhl,$Zhh) {
-    $code.=<<___;
-#if __ARM_ARCH__>=7 && defined(__ARMEL__)
-	rev	$_,$_
-	str	$_,[$Xi,#$i]
-#elif defined(__ARMEB__)
-	str	$_,[$Xi,#$i]
-#else
-	mov	$Tlh,$_,lsr#8
-	strb	$_,[$Xi,#$i+3]
-	mov	$Thl,$_,lsr#16
-	strb	$Tlh,[$Xi,#$i+2]
-	mov	$Thh,$_,lsr#24
-	strb	$Thl,[$Xi,#$i+1]
-	strb	$Thh,[$Xi,#$i]
-#endif
-___
-    $code.="\t".shift(@args)."\n";
-    $i-=4;
-  }
-}
-
-$code=<<___;
-#include "arm_arch.h"
-
-.text
-.syntax	unified
-.code	32
-
-.type	rem_4bit,%object
-.align	5
-rem_4bit:
-.short	0x0000,0x1C20,0x3840,0x2460
-.short	0x7080,0x6CA0,0x48C0,0x54E0
-.short	0xE100,0xFD20,0xD940,0xC560
-.short	0x9180,0x8DA0,0xA9C0,0xB5E0
-.size	rem_4bit,.-rem_4bit
-
-.type	rem_4bit_get,%function
-rem_4bit_get:
-	sub	$rem_4bit,pc,#8
-	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
-	b	.Lrem_4bit_got
-	nop
-.size	rem_4bit_get,.-rem_4bit_get
-
-.global	gcm_ghash_4bit
-.type	gcm_ghash_4bit,%function
-gcm_ghash_4bit:
-	sub	r12,pc,#8
-	add	$len,$inp,$len		@ $len to point at the end
-	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
-	sub	r12,r12,#48		@ &rem_4bit
-
-	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
-	stmdb	sp!,{r4-r11}		@ ... to stack
-
-	ldrb	$nlo,[$inp,#15]
-	ldrb	$nhi,[$Xi,#15]
-.Louter:
-	eor	$nlo,$nlo,$nhi
-	and	$nhi,$nlo,#0xf0
-	and	$nlo,$nlo,#0x0f
-	mov	$cnt,#14
-
-	add	$Zhh,$Htbl,$nlo,lsl#4
-	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
-	add	$Thh,$Htbl,$nhi
-	ldrb	$nlo,[$inp,#14]
-
-	and	$nhi,$Zll,#0xf		@ rem
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
-	add	$nhi,$nhi,$nhi
-	eor	$Zll,$Tll,$Zll,lsr#4
-	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
-	eor	$Zll,$Zll,$Zlh,lsl#28
-	ldrb	$nhi,[$Xi,#14]
-	eor	$Zlh,$Tlh,$Zlh,lsr#4
-	eor	$Zlh,$Zlh,$Zhl,lsl#28
-	eor	$Zhl,$Thl,$Zhl,lsr#4
-	eor	$Zhl,$Zhl,$Zhh,lsl#28
-	eor	$Zhh,$Thh,$Zhh,lsr#4
-	eor	$nlo,$nlo,$nhi
-	and	$nhi,$nlo,#0xf0
-	and	$nlo,$nlo,#0x0f
-	eor	$Zhh,$Zhh,$Tll,lsl#16
-
-.Linner:
-	add	$Thh,$Htbl,$nlo,lsl#4
-	and	$nlo,$Zll,#0xf		@ rem
-	subs	$cnt,$cnt,#1
-	add	$nlo,$nlo,$nlo
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
-	eor	$Zll,$Tll,$Zll,lsr#4
-	eor	$Zll,$Zll,$Zlh,lsl#28
-	eor	$Zlh,$Tlh,$Zlh,lsr#4
-	eor	$Zlh,$Zlh,$Zhl,lsl#28
-	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
-	eor	$Zhl,$Thl,$Zhl,lsr#4
-	ldrbpl	$nlo,[$inp,$cnt]
-	eor	$Zhl,$Zhl,$Zhh,lsl#28
-	eor	$Zhh,$Thh,$Zhh,lsr#4
-
-	add	$Thh,$Htbl,$nhi
-	and	$nhi,$Zll,#0xf		@ rem
-	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
-	add	$nhi,$nhi,$nhi
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
-	eor	$Zll,$Tll,$Zll,lsr#4
-	ldrbpl	$Tll,[$Xi,$cnt]
-	eor	$Zll,$Zll,$Zlh,lsl#28
-	eor	$Zlh,$Tlh,$Zlh,lsr#4
-	ldrh	$Tlh,[sp,$nhi]
-	eor	$Zlh,$Zlh,$Zhl,lsl#28
-	eor	$Zhl,$Thl,$Zhl,lsr#4
-	eor	$Zhl,$Zhl,$Zhh,lsl#28
-	eorpl	$nlo,$nlo,$Tll
-	eor	$Zhh,$Thh,$Zhh,lsr#4
-	andpl	$nhi,$nlo,#0xf0
-	andpl	$nlo,$nlo,#0x0f
-	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
-	bpl	.Linner
-
-	ldr	$len,[sp,#32]		@ re-load $len/end
-	add	$inp,$inp,#16
-	mov	$nhi,$Zll
-___
-	&Zsmash("cmp\t$inp,$len","ldrbne\t$nlo,[$inp,#15]");
-$code.=<<___;
-	bne	.Louter
-
-	add	sp,sp,#36
-#if __ARM_ARCH__>=5
-	ldmia	sp!,{r4-r11,pc}
-#else
-	ldmia	sp!,{r4-r11,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	gcm_ghash_4bit,.-gcm_ghash_4bit
-
-.global	gcm_gmult_4bit
-.type	gcm_gmult_4bit,%function
-gcm_gmult_4bit:
-	stmdb	sp!,{r4-r11,lr}
-	ldrb	$nlo,[$Xi,#15]
-	b	rem_4bit_get
-.Lrem_4bit_got:
-	and	$nhi,$nlo,#0xf0
-	and	$nlo,$nlo,#0x0f
-	mov	$cnt,#14
-
-	add	$Zhh,$Htbl,$nlo,lsl#4
-	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
-	ldrb	$nlo,[$Xi,#14]
-
-	add	$Thh,$Htbl,$nhi
-	and	$nhi,$Zll,#0xf		@ rem
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
-	add	$nhi,$nhi,$nhi
-	eor	$Zll,$Tll,$Zll,lsr#4
-	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
-	eor	$Zll,$Zll,$Zlh,lsl#28
-	eor	$Zlh,$Tlh,$Zlh,lsr#4
-	eor	$Zlh,$Zlh,$Zhl,lsl#28
-	eor	$Zhl,$Thl,$Zhl,lsr#4
-	eor	$Zhl,$Zhl,$Zhh,lsl#28
-	eor	$Zhh,$Thh,$Zhh,lsr#4
-	and	$nhi,$nlo,#0xf0
-	eor	$Zhh,$Zhh,$Tll,lsl#16
-	and	$nlo,$nlo,#0x0f
-
-.Loop:
-	add	$Thh,$Htbl,$nlo,lsl#4
-	and	$nlo,$Zll,#0xf		@ rem
-	subs	$cnt,$cnt,#1
-	add	$nlo,$nlo,$nlo
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
-	eor	$Zll,$Tll,$Zll,lsr#4
-	eor	$Zll,$Zll,$Zlh,lsl#28
-	eor	$Zlh,$Tlh,$Zlh,lsr#4
-	eor	$Zlh,$Zlh,$Zhl,lsl#28
-	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
-	eor	$Zhl,$Thl,$Zhl,lsr#4
-	ldrbpl	$nlo,[$Xi,$cnt]
-	eor	$Zhl,$Zhl,$Zhh,lsl#28
-	eor	$Zhh,$Thh,$Zhh,lsr#4
-
-	add	$Thh,$Htbl,$nhi
-	and	$nhi,$Zll,#0xf		@ rem
-	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
-	add	$nhi,$nhi,$nhi
-	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
-	eor	$Zll,$Tll,$Zll,lsr#4
-	eor	$Zll,$Zll,$Zlh,lsl#28
-	eor	$Zlh,$Tlh,$Zlh,lsr#4
-	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
-	eor	$Zlh,$Zlh,$Zhl,lsl#28
-	eor	$Zhl,$Thl,$Zhl,lsr#4
-	eor	$Zhl,$Zhl,$Zhh,lsl#28
-	eor	$Zhh,$Thh,$Zhh,lsr#4
-	andpl	$nhi,$nlo,#0xf0
-	andpl	$nlo,$nlo,#0x0f
-	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
-	bpl	.Loop
-___
-	&Zsmash();
-$code.=<<___;
-#if __ARM_ARCH__>=5
-	ldmia	sp!,{r4-r11,pc}
-#else
-	ldmia	sp!,{r4-r11,lr}
-	tst	lr,#1
-	moveq	pc,lr			@ be binary compatible with V4, yet
-	bx	lr			@ interoperable with Thumb ISA:-)
-#endif
-.size	gcm_gmult_4bit,.-gcm_gmult_4bit
-___
-{
-my $cnt=$Htbl;	# $Htbl is used once in the very beginning
-
-my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
-my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
-
-# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
-# in Zo. Or should I say "top bit", because GHASH is specified in
-# reverse bit order? Otherwise straightforward 128-bt H by one input
-# byte multiplication and modulo-reduction, times 16.
-
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
-
-$code.=<<___;
-#if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
-.fpu	neon
-
-.global	gcm_gmult_neon
-.type	gcm_gmult_neon,%function
-.align	4
-gcm_gmult_neon:
-	sub		$Htbl,#16		@ point at H in GCM128_CTX
-	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi
-	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
-	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
-	vshr.u64	$mod,#32
-	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
-	veor		$zero,$zero
-#ifdef __ARMEL__
-	vrev64.8	$IN,$IN
-#endif
-	veor		$Qpost,$Qpost
-	veor		$R,$R
-	mov		$cnt,#16
-	veor		$Z,$Z
-	mov		$len,#16
-	veor		$Zo,$Zo
-	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
-	b		.Linner_neon
-.size	gcm_gmult_neon,.-gcm_gmult_neon
-
-.global	gcm_ghash_neon
-.type	gcm_ghash_neon,%function
-.align	4
-gcm_ghash_neon:
-	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
-	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
-	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
-	vshr.u64	$mod,#32
-	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
-	veor		$zero,$zero
-	nop
-#ifdef __ARMEL__
-	vrev64.8	$Z,$Z
-#endif
-.Louter_neon:
-	vld1.64		`&Dhi($IN)`,[$inp]!	@ load inp
-	veor		$Qpost,$Qpost
-	vld1.64		`&Dlo($IN)`,[$inp]!
-	veor		$R,$R
-	mov		$cnt,#16
-#ifdef __ARMEL__
-	vrev64.8	$IN,$IN
-#endif
-	veor		$Zo,$Zo
-	veor		$IN,$Z			@ inp^=Xi
-	veor		$Z,$Z
-	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
-.Linner_neon:
-	subs		$cnt,$cnt,#1
-	vmull.p8	$Qlo,$Hlo,$xi		@ H.lo·Xi[i]
-	vmull.p8	$Qhi,$Hhi,$xi		@ H.hi·Xi[i]
-	vext.8		$IN,$zero,#1		@ IN>>=8
-
-	veor		$Z,$Qpost		@ modulo-scheduled part
-	vshl.i64	`&Dlo("$R")`,#48
-	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
-	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
-
-	veor		`&Dhi("$Z")`,`&Dlo("$R")`
-	vuzp.8		$Qlo,$Qhi
-	vsli.8		$Zo,$T,#1		@ compose the "carry" byte
-	vext.8		$Z,$zero,#1		@ Z>>=8
-
-	vmull.p8	$R,$Zo,$mod		@ "carry"·0xe1
-	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit
-	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8
-	veor		$Z,$Qhi
-	bne		.Linner_neon
-
-	veor		$Z,$Qpost		@ modulo-scheduled artefact
-	vshl.i64	`&Dlo("$R")`,#48
-	veor		`&Dhi("$Z")`,`&Dlo("$R")`
-
-	@ finalization, normalize Z:Zo
-	vand		$Zo,$mod		@ suffices to mask the bit
-	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
-	vshl.i64	$Z,#1
-	subs		$len,#16
-	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1
-	bne		.Louter_neon
-
-#ifdef __ARMEL__
-	vrev64.8	$Z,$Z
-#endif
-	sub		$Xi,#16	
-	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
-	vst1.64		`&Dlo("$Z")`,[$Xi,:64]
-
-	bx	lr
-.size	gcm_ghash_neon,.-gcm_ghash_neon
-#endif
-___
-}
-$code.=<<___;
-.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
-.align  2
-___
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
-print $code;
-close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl
deleted file mode 100644
index 3f98513105..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-parisc.pl
+++ /dev/null
@@ -1,740 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# April 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
-# it processes one byte in 19.6 cycles, which is more than twice as
-# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
-# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
-# processed byte. This is ~2.2x faster than 64-bit code generated by
-# vendor compiler (which used to be very hard to beat:-).
-#
-# Special thanks to polarhome.com for providing HP-UX account.
-
-$flavour = shift;
-$output = shift;
-open STDOUT,">$output";
-
-if ($flavour =~ /64/) {
-	$LEVEL		="2.0W";
-	$SIZE_T		=8;
-	$FRAME_MARKER	=80;
-	$SAVED_RP	=16;
-	$PUSH		="std";
-	$PUSHMA		="std,ma";
-	$POP		="ldd";
-	$POPMB		="ldd,mb";
-	$NREGS		=6;
-} else {
-	$LEVEL		="1.0";	#"\n\t.ALLOW\t2.0";
-	$SIZE_T		=4;
-	$FRAME_MARKER	=48;
-	$SAVED_RP	=20;
-	$PUSH		="stw";
-	$PUSHMA		="stwm";
-	$POP		="ldw";
-	$POPMB		="ldwm";
-	$NREGS		=11;
-}
-
-$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
-				#                 [+ argument transfer]
-
-################# volatile registers
-$Xi="%r26";	# argument block
-$Htbl="%r25";
-$inp="%r24";
-$len="%r23";
-$Hhh=$Htbl;	# variables
-$Hll="%r22";
-$Zhh="%r21";
-$Zll="%r20";
-$cnt="%r19";
-$rem_4bit="%r28";
-$rem="%r29";
-$mask0xf0="%r31";
-
-################# preserved registers
-$Thh="%r1";
-$Tll="%r2";
-$nlo="%r3";
-$nhi="%r4";
-$byte="%r5";
-if ($SIZE_T==4) {
-	$Zhl="%r6";
-	$Zlh="%r7";
-	$Hhl="%r8";
-	$Hlh="%r9";
-	$Thl="%r10";
-	$Tlh="%r11";
-}
-$rem2="%r6";	# used in PA-RISC 2.0 code
-
-$code.=<<___;
-	.LEVEL	$LEVEL
-	.text
-
-	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
-	.ALIGN	64
-gcm_gmult_4bit
-	.PROC
-	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
-	.ENTRY
-	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
-	$PUSHMA	%r3,$FRAME(%sp)
-	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
-	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
-	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
-___
-$code.=<<___ if ($SIZE_T==4);
-	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
-	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
-	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
-	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
-	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
-___
-$code.=<<___;
-	addl	$inp,$len,$len
-#ifdef __PIC__
-	addil	LT'L\$rem_4bit, %r19
-	ldw	RT'L\$rem_4bit(%r1), $rem_4bit
-#else
-	ldil	L'L\$rem_4bit, %t1
-	ldo	R'L\$rem_4bit(%t1), $rem_4bit
-#endif
-	ldi	0xf0,$mask0xf0
-___
-$code.=<<___ if ($SIZE_T==4);
-#ifndef __OpenBSD__
-	ldi	31,$rem
-	mtctl	$rem,%cr11
-	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
-	b	L\$parisc1_gmult
-	nop
-___
-
-$code.=<<___;
-	ldb	15($Xi),$nlo
-	ldo	8($Htbl),$Hll
-
-	and	$mask0xf0,$nlo,$nhi
-	depd,z	$nlo,59,4,$nlo
-
-	ldd	$nlo($Hll),$Zll
-	ldd	$nlo($Hhh),$Zhh
-
-	depd,z	$Zll,60,4,$rem
-	shrpd	$Zhh,$Zll,4,$Zll
-	extrd,u	$Zhh,59,60,$Zhh
-	ldb	14($Xi),$nlo
-
-	ldd	$nhi($Hll),$Tll
-	ldd	$nhi($Hhh),$Thh
-	and	$mask0xf0,$nlo,$nhi
-	depd,z	$nlo,59,4,$nlo
-
-	xor	$Tll,$Zll,$Zll
-	xor	$Thh,$Zhh,$Zhh
-	ldd	$rem($rem_4bit),$rem
-	b	L\$oop_gmult_pa2
-	ldi	13,$cnt
-
-	.ALIGN	8
-L\$oop_gmult_pa2
-	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
-	depd,z	$Zll,60,4,$rem
-
-	shrpd	$Zhh,$Zll,4,$Zll
-	extrd,u	$Zhh,59,60,$Zhh
-	ldd	$nlo($Hll),$Tll
-	ldd	$nlo($Hhh),$Thh
-
-	xor	$Tll,$Zll,$Zll
-	xor	$Thh,$Zhh,$Zhh
-	ldd	$rem($rem_4bit),$rem
-
-	xor	$rem,$Zhh,$Zhh
-	depd,z	$Zll,60,4,$rem
-	ldbx	$cnt($Xi),$nlo
-
-	shrpd	$Zhh,$Zll,4,$Zll
-	extrd,u	$Zhh,59,60,$Zhh
-	ldd	$nhi($Hll),$Tll
-	ldd	$nhi($Hhh),$Thh
-
-	and	$mask0xf0,$nlo,$nhi
-	depd,z	$nlo,59,4,$nlo
-	ldd	$rem($rem_4bit),$rem
-
-	xor	$Tll,$Zll,$Zll
-	addib,uv -1,$cnt,L\$oop_gmult_pa2
-	xor	$Thh,$Zhh,$Zhh
-
-	xor	$rem,$Zhh,$Zhh
-	depd,z	$Zll,60,4,$rem
-
-	shrpd	$Zhh,$Zll,4,$Zll
-	extrd,u	$Zhh,59,60,$Zhh
-	ldd	$nlo($Hll),$Tll
-	ldd	$nlo($Hhh),$Thh
-
-	xor	$Tll,$Zll,$Zll
-	xor	$Thh,$Zhh,$Zhh
-	ldd	$rem($rem_4bit),$rem
-
-	xor	$rem,$Zhh,$Zhh
-	depd,z	$Zll,60,4,$rem
-
-	shrpd	$Zhh,$Zll,4,$Zll
-	extrd,u	$Zhh,59,60,$Zhh
-	ldd	$nhi($Hll),$Tll
-	ldd	$nhi($Hhh),$Thh
-
-	xor	$Tll,$Zll,$Zll
-	xor	$Thh,$Zhh,$Zhh
-	ldd	$rem($rem_4bit),$rem
-
-	xor	$rem,$Zhh,$Zhh
-	std	$Zll,8($Xi)
-	std	$Zhh,0($Xi)
-___
-
-$code.=<<___ if ($SIZE_T==4);
-	b	L\$done_gmult
-	nop
-
-L\$parisc1_gmult
-#endif
-	ldb	15($Xi),$nlo
-	ldo	12($Htbl),$Hll
-	ldo	8($Htbl),$Hlh
-	ldo	4($Htbl),$Hhl
-
-	and	$mask0xf0,$nlo,$nhi
-	zdep	$nlo,27,4,$nlo
-
-	ldwx	$nlo($Hll),$Zll
-	ldwx	$nlo($Hlh),$Zlh
-	ldwx	$nlo($Hhl),$Zhl
-	ldwx	$nlo($Hhh),$Zhh
-	zdep	$Zll,28,4,$rem
-	ldb	14($Xi),$nlo
-	ldwx	$rem($rem_4bit),$rem
-	shrpw	$Zlh,$Zll,4,$Zll
-	ldwx	$nhi($Hll),$Tll
-	shrpw	$Zhl,$Zlh,4,$Zlh
-	ldwx	$nhi($Hlh),$Tlh
-	shrpw	$Zhh,$Zhl,4,$Zhl
-	ldwx	$nhi($Hhl),$Thl
-	extru	$Zhh,27,28,$Zhh
-	ldwx	$nhi($Hhh),$Thh
-	xor	$rem,$Zhh,$Zhh
-	and	$mask0xf0,$nlo,$nhi
-	zdep	$nlo,27,4,$nlo
-
-	xor	$Tll,$Zll,$Zll
-	ldwx	$nlo($Hll),$Tll
-	xor	$Tlh,$Zlh,$Zlh
-	ldwx	$nlo($Hlh),$Tlh
-	xor	$Thl,$Zhl,$Zhl
-	b	L\$oop_gmult_pa1
-	ldi	13,$cnt
-
-	.ALIGN	8
-L\$oop_gmult_pa1
-	zdep	$Zll,28,4,$rem
-	ldwx	$nlo($Hhl),$Thl
-	xor	$Thh,$Zhh,$Zhh
-	ldwx	$rem($rem_4bit),$rem
-	shrpw	$Zlh,$Zll,4,$Zll
-	ldwx	$nlo($Hhh),$Thh
-	shrpw	$Zhl,$Zlh,4,$Zlh
-	ldbx	$cnt($Xi),$nlo
-	xor	$Tll,$Zll,$Zll
-	ldwx	$nhi($Hll),$Tll
-	shrpw	$Zhh,$Zhl,4,$Zhl
-	xor	$Tlh,$Zlh,$Zlh
-	ldwx	$nhi($Hlh),$Tlh
-	extru	$Zhh,27,28,$Zhh
-	xor	$Thl,$Zhl,$Zhl
-	ldwx	$nhi($Hhl),$Thl
-	xor	$rem,$Zhh,$Zhh
-	zdep	$Zll,28,4,$rem
-	xor	$Thh,$Zhh,$Zhh
-	ldwx	$nhi($Hhh),$Thh
-	shrpw	$Zlh,$Zll,4,$Zll
-	ldwx	$rem($rem_4bit),$rem
-	shrpw	$Zhl,$Zlh,4,$Zlh
-	shrpw	$Zhh,$Zhl,4,$Zhl
-	and	$mask0xf0,$nlo,$nhi
-	extru	$Zhh,27,28,$Zhh
-	zdep	$nlo,27,4,$nlo
-	xor	$Tll,$Zll,$Zll
-	ldwx	$nlo($Hll),$Tll
-	xor	$Tlh,$Zlh,$Zlh
-	ldwx	$nlo($Hlh),$Tlh
-	xor	$rem,$Zhh,$Zhh
-	addib,uv -1,$cnt,L\$oop_gmult_pa1
-	xor	$Thl,$Zhl,$Zhl
-
-	zdep	$Zll,28,4,$rem
-	ldwx	$nlo($Hhl),$Thl
-	xor	$Thh,$Zhh,$Zhh
-	ldwx	$rem($rem_4bit),$rem
-	shrpw	$Zlh,$Zll,4,$Zll
-	ldwx	$nlo($Hhh),$Thh
-	shrpw	$Zhl,$Zlh,4,$Zlh
-	xor	$Tll,$Zll,$Zll
-	ldwx	$nhi($Hll),$Tll
-	shrpw	$Zhh,$Zhl,4,$Zhl
-	xor	$Tlh,$Zlh,$Zlh
-	ldwx	$nhi($Hlh),$Tlh
-	extru	$Zhh,27,28,$Zhh
-	xor	$rem,$Zhh,$Zhh
-	xor	$Thl,$Zhl,$Zhl
-	ldwx	$nhi($Hhl),$Thl
-	xor	$Thh,$Zhh,$Zhh
-	ldwx	$nhi($Hhh),$Thh
-	zdep	$Zll,28,4,$rem
-	ldwx	$rem($rem_4bit),$rem
-	shrpw	$Zlh,$Zll,4,$Zll
-	shrpw	$Zhl,$Zlh,4,$Zlh
-	shrpw	$Zhh,$Zhl,4,$Zhl
-	extru	$Zhh,27,28,$Zhh
-	xor	$Tll,$Zll,$Zll
-	xor	$Tlh,$Zlh,$Zlh
-	xor	$rem,$Zhh,$Zhh
-	stw	$Zll,12($Xi)
-	xor	$Thl,$Zhl,$Zhl
-	stw	$Zlh,8($Xi)
-	xor	$Thh,$Zhh,$Zhh
-	stw	$Zhl,4($Xi)
-	stw	$Zhh,0($Xi)
-___
-$code.=<<___;
-L\$done_gmult
-	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
-	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
-	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
-	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
-___
-$code.=<<___ if ($SIZE_T==4);
-	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
-	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
-	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
-	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
-	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
-___
-$code.=<<___;
-	bv	(%r2)
-	.EXIT
-	$POPMB	-$FRAME(%sp),%r3
-	.PROCEND
-
-	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
-	.ALIGN	64
-gcm_ghash_4bit
-	.PROC
-	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
-	.ENTRY
-	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
-	$PUSHMA	%r3,$FRAME(%sp)
-	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
-	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
-	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
-___
-$code.=<<___ if ($SIZE_T==4);
-	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
-	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
-	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
-	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
-	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
-___
-$code.=<<___;
-	addl	$inp,$len,$len
-#ifdef __PIC__
-	addil	LT'L\$rem_4bit, %r19
-	ldw	RT'L\$rem_4bit(%r1), $rem_4bit
-#else
-	ldil	L'L\$rem_4bit, %t1
-	ldo	R'L\$rem_4bit(%t1), $rem_4bit
-#endif
-	ldi	0xf0,$mask0xf0
-___
-$code.=<<___ if ($SIZE_T==4);
-#ifndef __OpenBSD__
-	ldi	31,$rem
-	mtctl	$rem,%cr11
-	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
-	b	L\$parisc1_ghash
-	nop
-___
-
-$code.=<<___;
-	ldb	15($Xi),$nlo
-	ldo	8($Htbl),$Hll
-
-L\$outer_ghash_pa2
-	ldb	15($inp),$nhi
-	xor	$nhi,$nlo,$nlo
-	and	$mask0xf0,$nlo,$nhi
-	depd,z	$nlo,59,4,$nlo
-
-	ldd	$nlo($Hll),$Zll
-	ldd	$nlo($Hhh),$Zhh
-
-	depd,z	$Zll,60,4,$rem
-	shrpd	$Zhh,$Zll,4,$Zll
-	extrd,u	$Zhh,59,60,$Zhh
-	ldb	14($Xi),$nlo
-	ldb	14($inp),$byte
-
-	ldd	$nhi($Hll),$Tll
-	ldd	$nhi($Hhh),$Thh
-	xor	$byte,$nlo,$nlo
-	and	$mask0xf0,$nlo,$nhi
-	depd,z	$nlo,59,4,$nlo
-
-	xor	$Tll,$Zll,$Zll
-	xor	$Thh,$Zhh,$Zhh
-	ldd	$rem($rem_4bit),$rem
-	b	L\$oop_ghash_pa2
-	ldi	13,$cnt
-
-	.ALIGN	8
-L\$oop_ghash_pa2
-	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
-	depd,z	$Zll,60,4,$rem2
-
-	shrpd	$Zhh,$Zll,4,$Zll
-	extrd,u	$Zhh,59,60,$Zhh
-	ldd	$nlo($Hll),$Tll
-	ldd	$nlo($Hhh),$Thh
-
-	xor	$Tll,$Zll,$Zll
-	xor	$Thh,$Zhh,$Zhh
-	ldbx	$cnt($Xi),$nlo
-	ldbx	$cnt($inp),$byte
-
-	depd,z	$Zll,60,4,$rem
-	shrpd	$Zhh,$Zll,4,$Zll
-	ldd	$rem2($rem_4bit),$rem2
-
-	xor	$rem2,$Zhh,$Zhh
-	xor	$byte,$nlo,$nlo
-	ldd	$nhi($Hll),$Tll
-	ldd	$nhi($Hhh),$Thh
-
-	and	$mask0xf0,$nlo,$nhi
-	depd,z	$nlo,59,4,$nlo
-
-	extrd,u	$Zhh,59,60,$Zhh
-	xor	$Tll,$Zll,$Zll
-
-	ldd	$rem($rem_4bit),$rem
-	addib,uv -1,$cnt,L\$oop_ghash_pa2
-	xor	$Thh,$Zhh,$Zhh
-
-	xor	$rem,$Zhh,$Zhh
-	depd,z	$Zll,60,4,$rem2
-
-	shrpd	$Zhh,$Zll,4,$Zll
-	extrd,u	$Zhh,59,60,$Zhh
-	ldd	$nlo($Hll),$Tll
-	ldd	$nlo($Hhh),$Thh
-
-	xor	$Tll,$Zll,$Zll
-	xor	$Thh,$Zhh,$Zhh
-
-	depd,z	$Zll,60,4,$rem
-	shrpd	$Zhh,$Zll,4,$Zll
-	ldd	$rem2($rem_4bit),$rem2
-
-	xor	$rem2,$Zhh,$Zhh
-	ldd	$nhi($Hll),$Tll
-	ldd	$nhi($Hhh),$Thh
-
-	extrd,u	$Zhh,59,60,$Zhh
-	xor	$Tll,$Zll,$Zll
-	xor	$Thh,$Zhh,$Zhh
-	ldd	$rem($rem_4bit),$rem
-
-	xor	$rem,$Zhh,$Zhh
-	std	$Zll,8($Xi)
-	ldo	16($inp),$inp
-	std	$Zhh,0($Xi)
-	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
-	copy	$Zll,$nlo
-___
-
-$code.=<<___ if ($SIZE_T==4);
-	b	L\$done_ghash
-	nop
-
-L\$parisc1_ghash
-#endif
-	ldb	15($Xi),$nlo
-	ldo	12($Htbl),$Hll
-	ldo	8($Htbl),$Hlh
-	ldo	4($Htbl),$Hhl
-
-L\$outer_ghash_pa1
-	ldb	15($inp),$byte
-	xor	$byte,$nlo,$nlo
-	and	$mask0xf0,$nlo,$nhi
-	zdep	$nlo,27,4,$nlo
-
-	ldwx	$nlo($Hll),$Zll
-	ldwx	$nlo($Hlh),$Zlh
-	ldwx	$nlo($Hhl),$Zhl
-	ldwx	$nlo($Hhh),$Zhh
-	zdep	$Zll,28,4,$rem
-	ldb	14($Xi),$nlo
-	ldb	14($inp),$byte
-	ldwx	$rem($rem_4bit),$rem
-	shrpw	$Zlh,$Zll,4,$Zll
-	ldwx	$nhi($Hll),$Tll
-	shrpw	$Zhl,$Zlh,4,$Zlh
-	ldwx	$nhi($Hlh),$Tlh
-	shrpw	$Zhh,$Zhl,4,$Zhl
-	ldwx	$nhi($Hhl),$Thl
-	extru	$Zhh,27,28,$Zhh
-	ldwx	$nhi($Hhh),$Thh
-	xor	$byte,$nlo,$nlo
-	xor	$rem,$Zhh,$Zhh
-	and	$mask0xf0,$nlo,$nhi
-	zdep	$nlo,27,4,$nlo
-
-	xor	$Tll,$Zll,$Zll
-	ldwx	$nlo($Hll),$Tll
-	xor	$Tlh,$Zlh,$Zlh
-	ldwx	$nlo($Hlh),$Tlh
-	xor	$Thl,$Zhl,$Zhl
-	b	L\$oop_ghash_pa1
-	ldi	13,$cnt
-
-	.ALIGN	8
-L\$oop_ghash_pa1
-	zdep	$Zll,28,4,$rem
-	ldwx	$nlo($Hhl),$Thl
-	xor	$Thh,$Zhh,$Zhh
-	ldwx	$rem($rem_4bit),$rem
-	shrpw	$Zlh,$Zll,4,$Zll
-	ldwx	$nlo($Hhh),$Thh
-	shrpw	$Zhl,$Zlh,4,$Zlh
-	ldbx	$cnt($Xi),$nlo
-	xor	$Tll,$Zll,$Zll
-	ldwx	$nhi($Hll),$Tll
-	shrpw	$Zhh,$Zhl,4,$Zhl
-	ldbx	$cnt($inp),$byte
-	xor	$Tlh,$Zlh,$Zlh
-	ldwx	$nhi($Hlh),$Tlh
-	extru	$Zhh,27,28,$Zhh
-	xor	$Thl,$Zhl,$Zhl
-	ldwx	$nhi($Hhl),$Thl
-	xor	$rem,$Zhh,$Zhh
-	zdep	$Zll,28,4,$rem
-	xor	$Thh,$Zhh,$Zhh
-	ldwx	$nhi($Hhh),$Thh
-	shrpw	$Zlh,$Zll,4,$Zll
-	ldwx	$rem($rem_4bit),$rem
-	shrpw	$Zhl,$Zlh,4,$Zlh
-	xor	$byte,$nlo,$nlo
-	shrpw	$Zhh,$Zhl,4,$Zhl
-	and	$mask0xf0,$nlo,$nhi
-	extru	$Zhh,27,28,$Zhh
-	zdep	$nlo,27,4,$nlo
-	xor	$Tll,$Zll,$Zll
-	ldwx	$nlo($Hll),$Tll
-	xor	$Tlh,$Zlh,$Zlh
-	ldwx	$nlo($Hlh),$Tlh
-	xor	$rem,$Zhh,$Zhh
-	addib,uv -1,$cnt,L\$oop_ghash_pa1
-	xor	$Thl,$Zhl,$Zhl
-
-	zdep	$Zll,28,4,$rem
-	ldwx	$nlo($Hhl),$Thl
-	xor	$Thh,$Zhh,$Zhh
-	ldwx	$rem($rem_4bit),$rem
-	shrpw	$Zlh,$Zll,4,$Zll
-	ldwx	$nlo($Hhh),$Thh
-	shrpw	$Zhl,$Zlh,4,$Zlh
-	xor	$Tll,$Zll,$Zll
-	ldwx	$nhi($Hll),$Tll
-	shrpw	$Zhh,$Zhl,4,$Zhl
-	xor	$Tlh,$Zlh,$Zlh
-	ldwx	$nhi($Hlh),$Tlh
-	extru	$Zhh,27,28,$Zhh
-	xor	$rem,$Zhh,$Zhh
-	xor	$Thl,$Zhl,$Zhl
-	ldwx	$nhi($Hhl),$Thl
-	xor	$Thh,$Zhh,$Zhh
-	ldwx	$nhi($Hhh),$Thh
-	zdep	$Zll,28,4,$rem
-	ldwx	$rem($rem_4bit),$rem
-	shrpw	$Zlh,$Zll,4,$Zll
-	shrpw	$Zhl,$Zlh,4,$Zlh
-	shrpw	$Zhh,$Zhl,4,$Zhl
-	extru	$Zhh,27,28,$Zhh
-	xor	$Tll,$Zll,$Zll
-	xor	$Tlh,$Zlh,$Zlh
-	xor	$rem,$Zhh,$Zhh
-	stw	$Zll,12($Xi)
-	xor	$Thl,$Zhl,$Zhl
-	stw	$Zlh,8($Xi)
-	xor	$Thh,$Zhh,$Zhh
-	stw	$Zhl,4($Xi)
-	ldo	16($inp),$inp
-	stw	$Zhh,0($Xi)
-	comb,<>	$inp,$len,L\$outer_ghash_pa1
-	copy	$Zll,$nlo
-___
-$code.=<<___;
-L\$done_ghash
-	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
-	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
-	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
-	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
-___
-$code.=<<___ if ($SIZE_T==4);
-	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
-	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
-	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
-	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
-	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
-___
-$code.=<<___;
-	bv	(%r2)
-	.EXIT
-	$POPMB	-$FRAME(%sp),%r3
-	.PROCEND
-
-	.section .rodata
-	.ALIGN	64
-L\$rem_4bit
-	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
-	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
-	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
-	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
-	.previous
-
-	.ALIGN	64
-___
-
-# Explicitly encode PA-RISC 2.0 instructions used in this module, so
-# that it can be compiled with .LEVEL 1.0. It should be noted that I
-# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
-# directive...
-
-my $ldd = sub {	# encoder for "ldd": yields a raw .WORD line when the operand form is recognised
-  my ($mod,$args) = @_;	# $mod: completer text following the mnemonic; $args: operand text
-  my $orig = "ldd$mod\t$args";	# original instruction text, echoed after ';' in the output
-
-    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4: register(register),register
-    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
-	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
-    }
-    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5: decimal offset(register),register
-    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
-	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
-	$opcode|=(1<<5)  if ($mod =~ /^,m/);			# completer starting with ",m" sets bit 5
-	$opcode|=(1<<13) if ($mod =~ /^,mb/);			# completer starting with ",mb" also sets bit 13
-	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
-    }
-    else { "\t".$orig; }	# any other operand form is passed through as plain text
-};
-
-my $std = sub {	# encoder for "std": yields a raw .WORD line when the operand form is recognised
-  my ($mod,$args) = @_;	# $mod is only echoed in the output comment; it does not affect the encoding here
-  my $orig = "std$mod\t$args";	# original instruction text, echoed after ';' in the output
-
-    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices: register,decimal offset(register)
-    {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
-	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
-    }
-    else { "\t".$orig; }	# any other operand form is passed through as plain text
-};
-
-my $extrd = sub {	# encoder for "extrd": yields a raw .WORD line when the operand form is recognised
-  my ($mod,$args) = @_;	# $mod: completer text following the mnemonic; $args: operand text
-  my $orig = "extrd$mod\t$args";	# original instruction text, echoed after ';' in the output
-
-    # I only have ",u" completer, it's implicitly encoded...
-    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15: register,pos,len,register
-    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
-	my $len=32-$3;						# length field is stored as 32 minus the written length
-	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
-	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
-	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
-    }
-    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12: register,%sar,len,register
-    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
-	my $len=32-$2;						# length field is stored as 32 minus the written length
-	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
-	$opcode |= (1<<13) if ($mod =~ /,\**=/);		# a ",=" or ",*=" completer sets bit 13
-	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
-    }
-    else { "\t".$orig; }	# any other operand form is passed through as plain text
-};
-
-my $shrpd = sub {	# encoder for "shrpd": yields a raw .WORD line when the operand form is recognised
-  my ($mod,$args) = @_;	# $mod is only echoed in the output comment; it does not affect the encoding here
-  my $orig = "shrpd$mod\t$args";	# original instruction text, echoed after ';' in the output
-
-    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14: register,register,shift count,register
-    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
-	my $cpos=63-$3;						# shift count is stored as 63 minus the written value
-	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
-	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
-    }
-    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11: shift count taken from %sar
-    {	sprintf "\t.WORD\t0x%08x\t; %s",
-		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
-    }
-    else { "\t".$orig; }	# any other operand form is passed through as plain text
-};
-
-my $depd = sub {	# encoder for "depd": yields a raw .WORD line when the operand form is recognised
-  my ($mod,$args) = @_;	# $mod is only echoed in the output comment; it does not affect the encoding here
-  my $orig = "depd$mod\t$args";	# original instruction text, echoed after ';' in the output
-
-    # I only have ",z" completer, it's implicitly encoded...
-    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 16: register,pos,len,register
-    {	my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
-    	my $cpos=63-$2;						# position is stored as 63 minus the written value
-	my $len=32-$3;						# length field is stored as 32 minus the written length
-	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode pos
-	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
-	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
-    }
-    else { "\t".$orig; }	# any other operand form is passed through as plain text
-};
-
-sub assemble {	# route one instruction through its hand-written encoder, if one exists
-  my ($mnemonic,$mod,$args)=@_;	# instruction name, completer text, operand text
-  my $opcode = eval("\$$mnemonic");	# string eval looks up the variable named after the mnemonic (e.g. $ldd)
-
-    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";	# no encoder: re-emit the instruction text
-}
-
-foreach (split("\n",$code)) {	# post-process and print the accumulated assembly one line at a time
-	s/\`([^\`]*)\`/eval $1/ge;	# replace each `...` span with the value of the Perl expression inside it
-	if ($SIZE_T==4) {	# 32-bit build only
-		s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;	# split into mnemonic, completer, operands and encode
-		s/cmpb,\*/comb,/;	# rewrite "cmpb,*" as "comb,"
-		s/,\*/,/;	# drop the first remaining ",*" marker on the line
-	}
-	s/\bbv\b/bve/	if ($SIZE_T==8);	# 64-bit build uses "bve" where the source says "bv"
-	print $_,"\n";
-}
-
-close STDOUT or die "error closing STDOUT: $!";	# buffered write errors only surface at close; do not lose them
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
deleted file mode 100644
index ce75045f09..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
+++ /dev/null
@@ -1,351 +0,0 @@
-#!/usr/bin/env perl
-
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-
-# March 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+128 bytes shared table]. Performance
-# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
-# and are expressed in cycles per processed byte, less is better:
-#
-#		gcc 3.3.x	cc 5.2		this assembler
-#
-# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
-# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
-#
-# Here is data collected on UltraSPARC T1 system running Linux:
-#
-#		gcc 4.4.1			this assembler
-#
-# 32-bit build	566				50	(+1000%)
-# 64-bit build	56				50	(+12%)
-#
-# I don't quite understand why difference between 32-bit and 64-bit
-# compiler-generated code is so big. Compilers *were* instructed to
-# generate code for UltraSPARC and should have used 64-bit registers
-# for Z vector (see C code) even in 32-bit build... Oh well, it only
-# means more impressive improvement coefficients for this assembler
-# module;-) Loops are aggressively modulo-scheduled in respect to
-# references to input data and Z.hi updates to achieve 12 cycles
-# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
-# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
-
-$bits=32;	# default: 32-bit build
-for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }	# either flag among the arguments selects 64-bit
-if ($bits==64)  { $bias=2047; $frame=192; }
-else            { $bias=0;    $frame=112; }	# $frame is the size given to "save" in the generated prologues
-
-$output=shift;	# first command-line argument is used as the output file name
-open STDOUT,">$output";	# NOTE(review): 2-arg open whose result is not checked
-
-$Zhi="%o0";	# 64-bit values
-$Zlo="%o1";
-$Thi="%o2";
-$Tlo="%o3";
-$rem="%o4";
-$tmp="%o5";
-
-$nhi="%l0";	# small values and pointers
-$nlo="%l1";
-$xi0="%l2";
-$xi1="%l3";
-$rem_4bit="%l4";
-$remi="%l5";
-$Htblo="%l6";
-$cnt="%l7";
-
-$Xi="%i0";	# input argument block
-$Htbl="%i1";
-$inp="%i2";
-$len="%i3";
-
-$code.=<<___;
-.section	".rodata",#alloc
-
-.align	64
-rem_4bit:
-	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
-	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
-	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
-	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
-.type	rem_4bit,#object
-.size	rem_4bit,(.-rem_4bit)
-
-.section	".text",#alloc,#execinstr
-.globl	gcm_ghash_4bit
-.align	32
-gcm_ghash_4bit:
-	save	%sp,-$frame,%sp
-#ifdef __PIC__
-	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), $tmp
-	rd	%pc, $rem
-	or	$tmp, %lo(_GLOBAL_OFFSET_TABLE_+4), $tmp
-	add	$tmp, $rem, $tmp
-#endif
-
-	ldub	[$inp+15],$nlo
-	ldub	[$Xi+15],$xi0
-	ldub	[$Xi+14],$xi1
-	add	$len,$inp,$len
-	add	$Htbl,8,$Htblo
-
-#ifdef __PIC__
-	set	rem_4bit, $rem_4bit
-	ldx	[$rem_4bit+$tmp], $rem_4bit
-#else
-	set	rem_4bit, $rem_4bit
-#endif
-
-.Louter:
-	xor	$xi0,$nlo,$nlo
-	and	$nlo,0xf0,$nhi
-	and	$nlo,0x0f,$nlo
-	sll	$nlo,4,$nlo
-	ldx	[$Htblo+$nlo],$Zlo
-	ldx	[$Htbl+$nlo],$Zhi
-
-	ldub	[$inp+14],$nlo
-
-	ldx	[$Htblo+$nhi],$Tlo
-	and	$Zlo,0xf,$remi
-	ldx	[$Htbl+$nhi],$Thi
-	sll	$remi,3,$remi
-	ldx	[$rem_4bit+$remi],$rem
-	srlx	$Zlo,4,$Zlo
-	mov	13,$cnt
-	sllx	$Zhi,60,$tmp
-	xor	$Tlo,$Zlo,$Zlo
-	srlx	$Zhi,4,$Zhi
-	xor	$Zlo,$tmp,$Zlo
-
-	xor	$xi1,$nlo,$nlo
-	and	$Zlo,0xf,$remi
-	and	$nlo,0xf0,$nhi
-	and	$nlo,0x0f,$nlo
-	ba	.Lghash_inner
-	sll	$nlo,4,$nlo
-.align	32
-.Lghash_inner:
-	ldx	[$Htblo+$nlo],$Tlo
-	sll	$remi,3,$remi
-	xor	$Thi,$Zhi,$Zhi
-	ldx	[$Htbl+$nlo],$Thi
-	srlx	$Zlo,4,$Zlo
-	xor	$rem,$Zhi,$Zhi
-	ldx	[$rem_4bit+$remi],$rem
-	sllx	$Zhi,60,$tmp
-	xor	$Tlo,$Zlo,$Zlo
-	ldub	[$inp+$cnt],$nlo
-	srlx	$Zhi,4,$Zhi
-	xor	$Zlo,$tmp,$Zlo
-	ldub	[$Xi+$cnt],$xi1
-	xor	$Thi,$Zhi,$Zhi
-	and	$Zlo,0xf,$remi
-
-	ldx	[$Htblo+$nhi],$Tlo
-	sll	$remi,3,$remi
-	xor	$rem,$Zhi,$Zhi
-	ldx	[$Htbl+$nhi],$Thi
-	srlx	$Zlo,4,$Zlo
-	ldx	[$rem_4bit+$remi],$rem
-	sllx	$Zhi,60,$tmp
-	xor	$xi1,$nlo,$nlo
-	srlx	$Zhi,4,$Zhi
-	and	$nlo,0xf0,$nhi
-	addcc	$cnt,-1,$cnt
-	xor	$Zlo,$tmp,$Zlo
-	and	$nlo,0x0f,$nlo
-	xor	$Tlo,$Zlo,$Zlo
-	sll	$nlo,4,$nlo
-	blu	.Lghash_inner
-	and	$Zlo,0xf,$remi
-
-	ldx	[$Htblo+$nlo],$Tlo
-	sll	$remi,3,$remi
-	xor	$Thi,$Zhi,$Zhi
-	ldx	[$Htbl+$nlo],$Thi
-	srlx	$Zlo,4,$Zlo
-	xor	$rem,$Zhi,$Zhi
-	ldx	[$rem_4bit+$remi],$rem
-	sllx	$Zhi,60,$tmp
-	xor	$Tlo,$Zlo,$Zlo
-	srlx	$Zhi,4,$Zhi
-	xor	$Zlo,$tmp,$Zlo
-	xor	$Thi,$Zhi,$Zhi
-
-	add	$inp,16,$inp
-	cmp	$inp,$len
-	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
-	and	$Zlo,0xf,$remi
-
-	ldx	[$Htblo+$nhi],$Tlo
-	sll	$remi,3,$remi
-	xor	$rem,$Zhi,$Zhi
-	ldx	[$Htbl+$nhi],$Thi
-	srlx	$Zlo,4,$Zlo
-	ldx	[$rem_4bit+$remi],$rem
-	sllx	$Zhi,60,$tmp
-	xor	$Tlo,$Zlo,$Zlo
-	ldub	[$inp+15],$nlo
-	srlx	$Zhi,4,$Zhi
-	xor	$Zlo,$tmp,$Zlo
-	xor	$Thi,$Zhi,$Zhi
-	stx	$Zlo,[$Xi+8]
-	xor	$rem,$Zhi,$Zhi
-	stx	$Zhi,[$Xi]
-	srl	$Zlo,8,$xi1
-	and	$Zlo,0xff,$xi0
-	ba	.Louter
-	and	$xi1,0xff,$xi1
-.align	32
-.Ldone:
-	ldx	[$Htblo+$nhi],$Tlo
-	sll	$remi,3,$remi
-	xor	$rem,$Zhi,$Zhi
-	ldx	[$Htbl+$nhi],$Thi
-	srlx	$Zlo,4,$Zlo
-	ldx	[$rem_4bit+$remi],$rem
-	sllx	$Zhi,60,$tmp
-	xor	$Tlo,$Zlo,$Zlo
-	srlx	$Zhi,4,$Zhi
-	xor	$Zlo,$tmp,$Zlo
-	xor	$Thi,$Zhi,$Zhi
-	stx	$Zlo,[$Xi+8]
-	xor	$rem,$Zhi,$Zhi
-	stx	$Zhi,[$Xi]
-
-	ret
-	restore
-.type	gcm_ghash_4bit,#function
-.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
-___
-
-undef $inp;
-undef $len;
-
-$code.=<<___;
-.globl	gcm_gmult_4bit
-.align	32
-gcm_gmult_4bit:
-	save	%sp,-$frame,%sp
-#ifdef __PIC__
-	sethi	%hi(_GLOBAL_OFFSET_TABLE_-4), $tmp
-	rd	%pc, $rem
-	or	$tmp, %lo(_GLOBAL_OFFSET_TABLE_+4), $tmp
-	add	$tmp, $rem, $tmp
-#endif
-
-	ldub	[$Xi+15],$nlo
-	add	$Htbl,8,$Htblo
-
-#ifdef __PIC__
-	set	rem_4bit, $rem_4bit
-	ldx	[$rem_4bit+$tmp], $rem_4bit
-#else
-	set	rem_4bit, $rem_4bit
-#endif
-
-	and	$nlo,0xf0,$nhi
-	and	$nlo,0x0f,$nlo
-	sll	$nlo,4,$nlo
-	ldx	[$Htblo+$nlo],$Zlo
-	ldx	[$Htbl+$nlo],$Zhi
-
-	ldub	[$Xi+14],$nlo
-
-	ldx	[$Htblo+$nhi],$Tlo
-	and	$Zlo,0xf,$remi
-	ldx	[$Htbl+$nhi],$Thi
-	sll	$remi,3,$remi
-	ldx	[$rem_4bit+$remi],$rem
-	srlx	$Zlo,4,$Zlo
-	mov	13,$cnt
-	sllx	$Zhi,60,$tmp
-	xor	$Tlo,$Zlo,$Zlo
-	srlx	$Zhi,4,$Zhi
-	xor	$Zlo,$tmp,$Zlo
-
-	and	$Zlo,0xf,$remi
-	and	$nlo,0xf0,$nhi
-	and	$nlo,0x0f,$nlo
-	ba	.Lgmult_inner
-	sll	$nlo,4,$nlo
-.align	32
-.Lgmult_inner:
-	ldx	[$Htblo+$nlo],$Tlo
-	sll	$remi,3,$remi
-	xor	$Thi,$Zhi,$Zhi
-	ldx	[$Htbl+$nlo],$Thi
-	srlx	$Zlo,4,$Zlo
-	xor	$rem,$Zhi,$Zhi
-	ldx	[$rem_4bit+$remi],$rem
-	sllx	$Zhi,60,$tmp
-	xor	$Tlo,$Zlo,$Zlo
-	ldub	[$Xi+$cnt],$nlo
-	srlx	$Zhi,4,$Zhi
-	xor	$Zlo,$tmp,$Zlo
-	xor	$Thi,$Zhi,$Zhi
-	and	$Zlo,0xf,$remi
-
-	ldx	[$Htblo+$nhi],$Tlo
-	sll	$remi,3,$remi
-	xor	$rem,$Zhi,$Zhi
-	ldx	[$Htbl+$nhi],$Thi
-	srlx	$Zlo,4,$Zlo
-	ldx	[$rem_4bit+$remi],$rem
-	sllx	$Zhi,60,$tmp
-	srlx	$Zhi,4,$Zhi
-	and	$nlo,0xf0,$nhi
-	addcc	$cnt,-1,$cnt
-	xor	$Zlo,$tmp,$Zlo
-	and	$nlo,0x0f,$nlo
-	xor	$Tlo,$Zlo,$Zlo
-	sll	$nlo,4,$nlo
-	blu	.Lgmult_inner
-	and	$Zlo,0xf,$remi
-
-	ldx	[$Htblo+$nlo],$Tlo
-	sll	$remi,3,$remi
-	xor	$Thi,$Zhi,$Zhi
-	ldx	[$Htbl+$nlo],$Thi
-	srlx	$Zlo,4,$Zlo
-	xor	$rem,$Zhi,$Zhi
-	ldx	[$rem_4bit+$remi],$rem
-	sllx	$Zhi,60,$tmp
-	xor	$Tlo,$Zlo,$Zlo
-	srlx	$Zhi,4,$Zhi
-	xor	$Zlo,$tmp,$Zlo
-	xor	$Thi,$Zhi,$Zhi
-	and	$Zlo,0xf,$remi
-
-	ldx	[$Htblo+$nhi],$Tlo
-	sll	$remi,3,$remi
-	xor	$rem,$Zhi,$Zhi
-	ldx	[$Htbl+$nhi],$Thi
-	srlx	$Zlo,4,$Zlo
-	ldx	[$rem_4bit+$remi],$rem
-	sllx	$Zhi,60,$tmp
-	xor	$Tlo,$Zlo,$Zlo
-	srlx	$Zhi,4,$Zhi
-	xor	$Zlo,$tmp,$Zlo
-	xor	$Thi,$Zhi,$Zhi
-	stx	$Zlo,[$Xi+8]
-	xor	$rem,$Zhi,$Zhi
-	stx	$Zhi,[$Xi]
-
-	ret
-	restore
-.type	gcm_gmult_4bit,#function
-.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
-___
-
-$code =~ s/\`([^\`]*)\`/eval $1/gem;	# replace each `...` span with the value of the Perl expression inside it
-print $code;	# write the finished assembly to STDOUT
-close STDOUT or die "error closing STDOUT: $!";	# buffered write errors only surface at close; do not lose them
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl
deleted file mode 100644
index 47833582b6..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86.pl
+++ /dev/null
@@ -1,1326 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# March, May, June 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that it
-# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
-# code paths: vanilla x86 and vanilla MMX. Former will be executed on
-# 486 and Pentium, latter on all others. MMX GHASH features so called
-# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
-# of per-key storage [+512 bytes shared table]. Performance results
-# are for streamed GHASH subroutine and are expressed in cycles per
-# processed byte, less is better:
-#
-#		gcc 2.95.3(*)	MMX assembler	x86 assembler
-#
-# Pentium	105/111(**)	-		50
-# PIII		68 /75		12.2		24
-# P4		125/125		17.8		84(***)
-# Opteron	66 /70		10.1		30
-# Core2		54 /67		8.4		18
-#
-# (*)	gcc 3.4.x was observed to generate few percent slower code,
-#	which is one of reasons why 2.95.3 results were chosen,
-#	another reason is lack of 3.4.x results for older CPUs;
-#	comparison with MMX results is not completely fair, because C
-#	results are for vanilla "256B" implementation, while
-#	assembler results are for "528B";-)
-# (**)	second number is result for code compiled with -fPIC flag,
-#	which is actually more relevant, because assembler code is
-#	position-independent;
-# (***)	see comment in non-MMX routine for further details;
-#
-# To summarize, it's >2-5 times faster than gcc-generated code. To
-# anchor it to something else SHA1 assembler processes one byte in
-# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
-# particular, see comment at the end of the file...
-
-# May 2010
-#
-# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
-# The question is how close is it to theoretical limit? The pclmulqdq
-# instruction latency appears to be 14 cycles and there can't be more
-# than 2 of them executing at any given time. This means that single
-# Karatsuba multiplication would take 28 cycles *plus* few cycles for
-# pre- and post-processing. Then multiplication has to be followed by
-# modulo-reduction. Given that aggregated reduction method [see
-# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
-# white paper by Intel] allows you to perform reduction only once in
-# a while we can assume that asymptotic performance can be estimated
-# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
-# and Naggr is the aggregation factor.
-#
-# Before we proceed to this implementation let's have closer look at
-# the best-performing code suggested by Intel in their white paper.
-# By tracing inter-register dependencies Tmod is estimated as ~19
-# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
-# processed byte. As implied, this is quite optimistic estimate,
-# because it does not account for Karatsuba pre- and post-processing,
-# which for a single multiplication is ~5 cycles. Unfortunately Intel
-# does not provide performance data for GHASH alone. But benchmarking
-# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
-# alone resulted in 2.46 cycles per byte out of 16KB buffer. Note that
-# the result accounts even for pre-computing of degrees of the hash
-# key H, but its portion is negligible at 16KB buffer size.
-#
-# Moving on to the implementation in question. Tmod is estimated as
-# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
-# 2.16. How is it possible that measured performance is better than
-# optimistic theoretical estimate? There is one thing Intel failed
-# to recognize. By serializing GHASH with CTR in same subroutine
-# former's performance is really limited to above (Tmul + Tmod/Naggr)
-# equation. But if GHASH procedure is detached, the modulo-reduction
-# can be interleaved with Naggr-1 multiplications at instruction level
-# and under ideal conditions even disappear from the equation. So that
-# optimistic theoretical estimate for this implementation is ...
-# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
-# at least for such small Naggr. I'd argue that (28+Tproc/Naggr)/16,
-# where Tproc is time required for Karatsuba pre- and post-processing,
-# is more realistic estimate. In this case it gives ... 1.91 cycles.
-# Or in other words, depending on how well we can interleave reduction
-# and one of the two multiplications the performance should be between
-# 1.91 and 2.16. As already mentioned, this implementation processes
-# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
-# - in 2.02. x86_64 performance is better, because larger register
-# bank allows to interleave reduction and multiplication better.
-#
-# Does it make sense to increase Naggr? To start with it's virtually
-# impossible in 32-bit mode, because of limited register bank
-# capacity. Otherwise improvement has to be weighed against slower
-# setup, as well as code size and complexity increase. As even
-# optimistic estimate doesn't promise 30% performance improvement,
-# there are currently no plans to increase Naggr.
-#
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
-
-# January 2010
-#
-# Tweaked to optimize transitions between integer and FP operations
-# on same XMM register, PCLMULQDQ subroutine was measured to process
-# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
-# The minor regression on Westmere is outweighed by ~15% improvement
-# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
-# similar manner resulted in almost 20% degradation on Sandy Bridge,
-# where original 64-bit code processes one byte in 1.95 cycles.
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# $dir = directory part of this script's path
-push(@INC,"${dir}","${dir}../../perlasm");	# search both locations when loading x86asm.pl
-require "x86asm.pl";
-
-&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");	# $x86only is true when the last argument is "386"
-
-$sse2=0;
-for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }	# any argument containing -DOPENSSL_IA32_SSE2 sets $sse2
-
-($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");	# Z held as four 32-bit registers
-$inp  = "edi";
-$Htbl = "esi";
-
-$unroll = 0;	# Affects x86 loop. Folded loop performs ~7% worse
-		# than unrolled, which has to be weighed against
-		# 2.5x x86-specific code size reduction.
-
-sub x86_loop {	# emit the integer-only (non-MMX) 4-bit multiplication sequence
-    my $off = shift;	# stack offset of the 16-byte block; the remainder table is read at $off+16
-    my $rem = "eax";
-
-	&mov	($Zhh,&DWP(4,$Htbl,$Zll));
-	&mov	($Zhl,&DWP(0,$Htbl,$Zll));
-	&mov	($Zlh,&DWP(12,$Htbl,$Zll));
-	&mov	($Zll,&DWP(8,$Htbl,$Zll));	# $Zll is the table index for all four loads, so it is overwritten last
-	&xor	($rem,$rem);	# avoid partial register stalls on PIII
-
-	# shrd practically kills P4, 2.5x deterioration, but P4 has
-	# MMX code-path to execute. shrd runs tad faster [than twice
-	# the shifts, move's and or's] on pre-MMX Pentium (as well as
-	# PIII and Core2), *but* minimizes code size, spares register
-	# and thus allows to fold the loop...
-	if (!$unroll) {
-	my $cnt = $inp;	# folded loop: the $inp register doubles as the byte counter
-	&mov	($cnt,15);
-	&jmp	(&label("x86_loop"));
-	&set_label("x86_loop",16);
-	    for($i=1;$i<=2;$i++) {	# two nibble steps are emitted per loop body
-		&mov	(&LB($rem),&LB($Zll));
-		&shrd	($Zll,$Zlh,4);
-		&and	(&LB($rem),0xf);	# $rem = the four bits about to be shifted out of Z
-		&shrd	($Zlh,$Zhl,4);
-		&shrd	($Zhl,$Zhh,4);
-		&shr	($Zhh,4);
-		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));	# fold in the stack table entry selected by $rem
-
-		&mov	(&LB($rem),&BP($off,"esp",$cnt));	# byte $cnt of the block kept on the stack
-		if ($i&1) {
-			&and	(&LB($rem),0xf0);	# first step: keep the high nibble in place
-		} else {
-			&shl	(&LB($rem),4);	# second step: move the low nibble up
-		}
-
-		&xor	($Zll,&DWP(8,$Htbl,$rem));
-		&xor	($Zlh,&DWP(12,$Htbl,$rem));
-		&xor	($Zhl,&DWP(0,$Htbl,$rem));
-		&xor	($Zhh,&DWP(4,$Htbl,$rem));
-
-		if ($i&1) {
-			&dec	($cnt);
-			&js	(&label("x86_break"));	# counter went negative: leave the loop
-		} else {
-			&jmp	(&label("x86_loop"));
-		}
-	    }
-	&set_label("x86_break",16);
-	} else {
-	    for($i=1;$i<32;$i++) {	# unrolled variant: 31 steps, byte offsets fixed at generation time
-		&comment($i);
-		&mov	(&LB($rem),&LB($Zll));
-		&shrd	($Zll,$Zlh,4);
-		&and	(&LB($rem),0xf);
-		&shrd	($Zlh,$Zhl,4);
-		&shrd	($Zhl,$Zhh,4);
-		&shr	($Zhh,4);
-		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));
-
-		if ($i&1) {
-			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
-			&and	(&LB($rem),0xf0);
-		} else {
-			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
-			&shl	(&LB($rem),4);
-		}
-
-		&xor	($Zll,&DWP(8,$Htbl,$rem));
-		&xor	($Zlh,&DWP(12,$Htbl,$rem));
-		&xor	($Zhl,&DWP(0,$Htbl,$rem));
-		&xor	($Zhh,&DWP(4,$Htbl,$rem));
-	    }
-	}
-	&bswap	($Zll);
-	&bswap	($Zlh);
-	&bswap	($Zhl);
-	if (!$x86only) {
-		&bswap	($Zhh);
-	} else {	# NOTE(review): the "386" build byte-swaps $Zhh via eax; the reason is not visible here
-		&mov	("eax",$Zhh);
-		&bswap	("eax");
-		&mov	($Zhh,"eax");
-	}
-}
-
-if ($unroll) {	# the unrolled sequence is emitted once, as a subroutine reached via "call"
-    &function_begin_B("_x86_gmult_4bit_inner");
-	&x86_loop(4);	# NOTE(review): offset 4 presumably skips the return address pushed by "call" - confirm
-	&ret	();
-    &function_end_B("_x86_gmult_4bit_inner");
-}
-
-sub deposit_rem_4bit {	# emit 16 stores that write the 16-entry remainder table onto the stack
-    my $bias = shift;	# byte offset from esp at which the table starts
-
-	&mov	(&DWP($bias+0, "esp"),0x0000<<16);	# one 32-bit entry every 4 bytes
-	&mov	(&DWP($bias+4, "esp"),0x1C20<<16);
-	&mov	(&DWP($bias+8, "esp"),0x3840<<16);
-	&mov	(&DWP($bias+12,"esp"),0x2460<<16);
-	&mov	(&DWP($bias+16,"esp"),0x7080<<16);
-	&mov	(&DWP($bias+20,"esp"),0x6CA0<<16);
-	&mov	(&DWP($bias+24,"esp"),0x48C0<<16);
-	&mov	(&DWP($bias+28,"esp"),0x54E0<<16);
-	&mov	(&DWP($bias+32,"esp"),0xE100<<16);
-	&mov	(&DWP($bias+36,"esp"),0xFD20<<16);
-	&mov	(&DWP($bias+40,"esp"),0xD940<<16);
-	&mov	(&DWP($bias+44,"esp"),0xC560<<16);
-	&mov	(&DWP($bias+48,"esp"),0x9180<<16);
-	&mov	(&DWP($bias+52,"esp"),0x8DA0<<16);
-	&mov	(&DWP($bias+56,"esp"),0xA9C0<<16);
-	&mov	(&DWP($bias+60,"esp"),0xB5E0<<16);
-}
-
-$suffix = $x86only ? "" : "_x86";	# the "386" build exports the plain names; otherwise they get an "_x86" suffix
-
-&function_begin("gcm_gmult_4bit".$suffix);
-	&stack_push(16+4+1);			# +1 for stack alignment
-	&mov	($inp,&wparam(0));		# load Xi
-	&mov	($Htbl,&wparam(1));		# load Htable
-
-	&mov	($Zhh,&DWP(0,$inp));		# load Xi[16]
-	&mov	($Zhl,&DWP(4,$inp));
-	&mov	($Zlh,&DWP(8,$inp));
-	&mov	($Zll,&DWP(12,$inp));
-
-	&deposit_rem_4bit(16);			# remainder table goes right after the 16-byte Xi copy
-
-	&mov	(&DWP(0,"esp"),$Zhh);		# copy Xi[16] on stack
-	&mov	(&DWP(4,"esp"),$Zhl);
-	&mov	(&DWP(8,"esp"),$Zlh);
-	&mov	(&DWP(12,"esp"),$Zll);
-	&shr	($Zll,20);
-	&and	($Zll,0xf0);			# ($Zll>>20)&0xf0: bits 24..27 of the last dword, times 16
-
-	if ($unroll) {
-		&call	("_x86_gmult_4bit_inner");
-	} else {
-		&x86_loop(0);
-		&mov	($inp,&wparam(0));	# reload Xi: the folded loop used this register as its counter
-	}
-
-	&mov	(&DWP(12,$inp),$Zll);		# store the result back into Xi
-	&mov	(&DWP(8,$inp),$Zlh);
-	&mov	(&DWP(4,$inp),$Zhl);
-	&mov	(&DWP(0,$inp),$Zhh);
-	&stack_pop(16+4+1);
-&function_end("gcm_gmult_4bit".$suffix);
-
-&function_begin("gcm_ghash_4bit".$suffix);
-	&stack_push(16+4+1);			# +1 for 64-bit alignment
-	&mov	($Zll,&wparam(0));		# load Xi
-	&mov	($Htbl,&wparam(1));		# load Htable
-	&mov	($inp,&wparam(2));		# load in
-	&mov	("ecx",&wparam(3));		# load len
-	&add	("ecx",$inp);
-	&mov	(&wparam(3),"ecx");		# the len slot now holds the end-of-input pointer
-
-	&mov	($Zhh,&DWP(0,$Zll));		# load Xi[16]
-	&mov	($Zhl,&DWP(4,$Zll));
-	&mov	($Zlh,&DWP(8,$Zll));
-	&mov	($Zll,&DWP(12,$Zll));
-
-	&deposit_rem_4bit(16);			# remainder table goes right after the 16-byte block copy
-
-    &set_label("x86_outer_loop",16);		# one iteration per 16 input bytes
-	&xor	($Zll,&DWP(12,$inp));		# xor with input
-	&xor	($Zlh,&DWP(8,$inp));
-	&xor	($Zhl,&DWP(4,$inp));
-	&xor	($Zhh,&DWP(0,$inp));
-	&mov	(&DWP(12,"esp"),$Zll);		# dump it on stack
-	&mov	(&DWP(8,"esp"),$Zlh);
-	&mov	(&DWP(4,"esp"),$Zhl);
-	&mov	(&DWP(0,"esp"),$Zhh);
-
-	&shr	($Zll,20);
-	&and	($Zll,0xf0);			# ($Zll>>20)&0xf0: bits 24..27 of the last dword, times 16
-
-	if ($unroll) {
-		&call	("_x86_gmult_4bit_inner");
-	} else {
-		&x86_loop(0);
-		&mov	($inp,&wparam(2));	# reload in: the folded loop used this register as its counter
-	}
-	&lea	($inp,&DWP(16,$inp));		# advance to the next 16-byte block
-	&cmp	($inp,&wparam(3));
-	&mov	(&wparam(2),$inp)	if (!$unroll);	# save the advanced pointer for the reload above
-	&jb	(&label("x86_outer_loop"));	# loop while in is below the end pointer
-
-	&mov	($inp,&wparam(0));	# load Xi
-	&mov	(&DWP(12,$inp),$Zll);
-	&mov	(&DWP(8,$inp),$Zlh);
-	&mov	(&DWP(4,$inp),$Zhl);
-	&mov	(&DWP(0,$inp),$Zhh);
-	&stack_pop(16+4+1);
-&function_end("gcm_ghash_4bit".$suffix);
-
-if (!$x86only) {{{
-
-&static_label("rem_4bit");
-
-if (!$sse2) {{	# pure-MMX "May" version...
-
-$S=12;		# shift factor for rem_4bit
-
-&function_begin_B("_mmx_gmult_4bit_inner");
-# MMX version performs 3.5 times better on P4 (see comment in non-MMX
-# routine for further details), 100% better on Opteron, ~70% better
-# on Core2 and PIII... In other words effort is considered to be well
-# spent... Since initial release the loop was unrolled in order to
-# "liberate" register previously used as loop counter. Instead it's
-# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
-# The path involves move of Z.lo from MMX to integer register,
-# effective address calculation and finally merge of value to Z.hi.
-# Reference to rem_4bit is scheduled so late that I had to >>4
-# rem_4bit elements. This resulted in 20-45% improvement
-# on contemporary micro-archs.
-{
-    my $cnt;
-    my $rem_4bit = "eax";	# callers load the rem_4bit table address into eax before the call
-    my @rem = ($Zhh,$Zll);	# two registers used alternately for the remainder index
-    my $nhi = $Zhl;
-    my $nlo = $Zlh;
-
-    my ($Zlo,$Zhi) = ("mm0","mm1");
-    my $tmp = "mm2";
-
-	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
-	&mov	($nhi,$Zll);
-	&mov	(&LB($nlo),&LB($nhi));
-	&shl	(&LB($nlo),4);
-	&and	($nhi,0xf0);
-	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
-	&movq	($Zhi,&QWP(0,$Htbl,$nlo));
-	&movd	($rem[0],$Zlo);
-
-	for ($cnt=28;$cnt>=-2;$cnt--) {	# generation-time loop: the emitted code is fully unrolled
-	    my $odd = $cnt&1;
-	    my $nix = $odd ? $nlo : $nhi;	# which nibble index this step uses
-
-		&shl	(&LB($nlo),4)			if ($odd);
-		&psrlq	($Zlo,4);
-		&movq	($tmp,$Zhi);
-		&psrlq	($Zhi,4);
-		&pxor	($Zlo,&QWP(8,$Htbl,$nix));
-		&mov	(&LB($nlo),&BP($cnt/2,$inp))	if (!$odd && $cnt>=0);
-		&psllq	($tmp,60);
-		&and	($nhi,0xf0)			if ($odd);
-		&pxor	($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
-		&and	($rem[0],0xf);
-		&pxor	($Zhi,&QWP(0,$Htbl,$nix));
-		&mov	($nhi,$nlo)			if (!$odd && $cnt>=0);
-		&movd	($rem[1],$Zlo);
-		&pxor	($Zlo,$tmp);
-
-		push	(@rem,shift(@rem));		# "rotate" registers
-	}
-
-	&mov	($inp,&DWP(4,$rem_4bit,$rem[1],8));	# last rem_4bit[rem]
-
-	&psrlq	($Zlo,32);	# lower part of Zlo is already there
-	&movd	($Zhl,$Zhi);
-	&psrlq	($Zhi,32);
-	&movd	($Zlh,$Zlo);
-	&movd	($Zhh,$Zhi);
-	&shl	($inp,4);	# compensate for rem_4bit[i] being >>4
-
-	&bswap	($Zll);
-	&bswap	($Zhl);
-	&bswap	($Zlh);
-	&xor	($Zhh,$inp);
-	&bswap	($Zhh);
-
-	&ret	();
-}
-&function_end_B("_mmx_gmult_4bit_inner");
-
-&function_begin("gcm_gmult_4bit_mmx");
-	&mov	($inp,&wparam(0));	# load Xi
-	&mov	($Htbl,&wparam(1));	# load Htable
-
-	&picsetup("eax");
-	&picsymbol("eax", &label("rem_4bit"), "eax");	# NOTE(review): presumably leaves the rem_4bit address in eax - confirm in x86asm.pl
-
-	&movz	($Zll,&BP(15,$inp));	# start from the last byte of Xi
-
-	&call	("_mmx_gmult_4bit_inner");
-
-	&mov	($inp,&wparam(0));	# load Xi
-	&emms	();
-	&mov	(&DWP(12,$inp),$Zll);	# store the result back into Xi
-	&mov	(&DWP(4,$inp),$Zhl);
-	&mov	(&DWP(8,$inp),$Zlh);
-	&mov	(&DWP(0,$inp),$Zhh);
-&function_end("gcm_gmult_4bit_mmx");
-
-# Streamed version performs 20% better on P4, 7% on Opteron,
-# 10% on Core2 and PIII...
-&function_begin("gcm_ghash_4bit_mmx");
-	&mov	($Zhh,&wparam(0));	# load Xi
-	&mov	($Htbl,&wparam(1));	# load Htable
-	&mov	($inp,&wparam(2));	# load in
-	&mov	($Zlh,&wparam(3));	# load len
-
-	&picsetup("eax");
-	&picsymbol("eax", &label("rem_4bit"), "eax");	# NOTE(review): presumably leaves the rem_4bit address in eax - confirm in x86asm.pl
-
-	&add	($Zlh,$inp);
-	&mov	(&wparam(3),$Zlh);	# len to point at the end of input
-	&stack_push(4+1);		# +1 for stack alignment
-
-	&mov	($Zll,&DWP(12,$Zhh));	# load Xi[16]
-	&mov	($Zhl,&DWP(4,$Zhh));
-	&mov	($Zlh,&DWP(8,$Zhh));
-	&mov	($Zhh,&DWP(0,$Zhh));	# $Zhh held the Xi pointer, so it is loaded last
-	&jmp	(&label("mmx_outer_loop"));
-
-    &set_label("mmx_outer_loop",16);	# one iteration per 16 input bytes
-	&xor	($Zll,&DWP(12,$inp));	# xor the running value with the input block
-	&xor	($Zhl,&DWP(4,$inp));
-	&xor	($Zlh,&DWP(8,$inp));
-	&xor	($Zhh,&DWP(0,$inp));
-	&mov	(&wparam(2),$inp);	# save the input pointer; $inp is repointed below
-	&mov	(&DWP(12,"esp"),$Zll);	# keep the 16-byte block on the stack
-	&mov	(&DWP(4,"esp"),$Zhl);
-	&mov	(&DWP(8,"esp"),$Zlh);
-	&mov	(&DWP(0,"esp"),$Zhh);
-
-	&mov	($inp,"esp");	# the inner routine reads its bytes through $inp
-	&shr	($Zll,24);	# leave only the top byte of the last dword in $Zll
-
-	&call	("_mmx_gmult_4bit_inner");
-
-	&mov	($inp,&wparam(2));	# restore the input pointer
-	&lea	($inp,&DWP(16,$inp));	# advance to the next 16-byte block
-	&cmp	($inp,&wparam(3));
-	&jb	(&label("mmx_outer_loop"));	# loop while in is below the end pointer
-
-	&mov	($inp,&wparam(0));	# load Xi
-	&emms	();
-	&mov	(&DWP(12,$inp),$Zll);	# store the result back into Xi
-	&mov	(&DWP(4,$inp),$Zhl);
-	&mov	(&DWP(8,$inp),$Zlh);
-	&mov	(&DWP(0,$inp),$Zhh);
-
-	&stack_pop(4+1);
-&function_end("gcm_ghash_4bit_mmx");
-
-}} else {{	# "June" MMX version...
-		# ... has slower "April" gcm_gmult_4bit_mmx with folded
-		# loop. This is done to conserve code size...
-$S=16;		# shift factor for rem_4bit
-
-sub mmx_loop() {
-# MMX version performs 2.8 times better on P4 (see comment in non-MMX
-# routine for further details), 40% better on Opteron and Core2, 50%
-# better on PIII... In other words effort is considered to be well
-# spent...
-    my $inp = shift;
-    my $rem_4bit = shift;
-    my $cnt = $Zhh;
-    my $nhi = $Zhl;
-    my $nlo = $Zlh;
-    my $rem = $Zll;
-
-    my ($Zlo,$Zhi) = ("mm0","mm1");
-    my $tmp = "mm2";
-
-	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
-	&mov	($nhi,$Zll);
-	&mov	(&LB($nlo),&LB($nhi));
-	&mov	($cnt,14);
-	&shl	(&LB($nlo),4);
-	&and	($nhi,0xf0);
-	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
-	&movq	($Zhi,&QWP(0,$Htbl,$nlo));
-	&movd	($rem,$Zlo);
-	&jmp	(&label("mmx_loop"));
-
-    &set_label("mmx_loop",16);
-	&psrlq	($Zlo,4);
-	&and	($rem,0xf);
-	&movq	($tmp,$Zhi);
-	&psrlq	($Zhi,4);
-	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
-	&mov	(&LB($nlo),&BP(0,$inp,$cnt));
-	&psllq	($tmp,60);
-	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
-	&dec	($cnt);
-	&movd	($rem,$Zlo);
-	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
-	&mov	($nhi,$nlo);
-	&pxor	($Zlo,$tmp);
-	&js	(&label("mmx_break"));
-
-	&shl	(&LB($nlo),4);
-	&and	($rem,0xf);
-	&psrlq	($Zlo,4);
-	&and	($nhi,0xf0);
-	&movq	($tmp,$Zhi);
-	&psrlq	($Zhi,4);
-	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
-	&psllq	($tmp,60);
-	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
-	&movd	($rem,$Zlo);
-	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
-	&pxor	($Zlo,$tmp);
-	&jmp	(&label("mmx_loop"));
-
-    &set_label("mmx_break",16);
-	&shl	(&LB($nlo),4);
-	&and	($rem,0xf);
-	&psrlq	($Zlo,4);
-	&and	($nhi,0xf0);
-	&movq	($tmp,$Zhi);
-	&psrlq	($Zhi,4);
-	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
-	&psllq	($tmp,60);
-	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
-	&movd	($rem,$Zlo);
-	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
-	&pxor	($Zlo,$tmp);
-
-	&psrlq	($Zlo,4);
-	&and	($rem,0xf);
-	&movq	($tmp,$Zhi);
-	&psrlq	($Zhi,4);
-	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
-	&psllq	($tmp,60);
-	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
-	&movd	($rem,$Zlo);
-	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
-	&pxor	($Zlo,$tmp);
-
-	&psrlq	($Zlo,32);	# lower part of Zlo is already there
-	&movd	($Zhl,$Zhi);
-	&psrlq	($Zhi,32);
-	&movd	($Zlh,$Zlo);
-	&movd	($Zhh,$Zhi);
-
-	&bswap	($Zll);
-	&bswap	($Zhl);
-	&bswap	($Zlh);
-	&bswap	($Zhh);
-}
-
-&function_begin("gcm_gmult_4bit_mmx");
-	&mov	($inp,&wparam(0));	# load Xi
-	&mov	($Htbl,&wparam(1));	# load Htable
-
-	&picsetup("eax");
-	&picsymbol("eax", &label("rem_4bit"), "eax");
-
-	&movz	($Zll,&BP(15,$inp));
-
-	&mmx_loop($inp,"eax");
-
-	&emms	();
-	&mov	(&DWP(12,$inp),$Zll);
-	&mov	(&DWP(4,$inp),$Zhl);
-	&mov	(&DWP(8,$inp),$Zlh);
-	&mov	(&DWP(0,$inp),$Zhh);
-&function_end("gcm_gmult_4bit_mmx");
-
-######################################################################
-# The subroutine below is the "528B" variant of the "4-bit" GCM GHASH
-# function (see gcm128.c for details). It provides a further 20-40%
-# performance improvement over the above-mentioned "May" version.
-
-&static_label("rem_8bit");
-
-&function_begin("gcm_ghash_4bit_mmx");
-{ my ($Zlo,$Zhi) = ("mm7","mm6");
-  my $rem_8bit = "esi";
-  my $Htbl = "ebx";
-
-    # parameter block
-    &mov	("eax",&wparam(0));		# Xi
-    &mov	("ebx",&wparam(1));		# Htable
-    &mov	("ecx",&wparam(2));		# inp
-    &mov	("edx",&wparam(3));		# len
-    &mov	("ebp","esp");			# original %esp
-
-    &picsetup($rem_8bit);
-    &picsymbol($rem_8bit, &label("rem_8bit"), $rem_8bit);
-
-    &sub	("esp",512+16+16);		# allocate stack frame...
-    &and	("esp",-64);			# ...and align it
-    &sub	("esp",16);			# place for (u8)(H[]<<4)
-
-    &add	("edx","ecx");			# pointer to the end of input
-    &mov	(&DWP(528+16+0,"esp"),"eax");	# save Xi
-    &mov	(&DWP(528+16+8,"esp"),"edx");	# save inp+len
-    &mov	(&DWP(528+16+12,"esp"),"ebp");	# save original %esp
-
-    { my @lo  = ("mm0","mm1","mm2");
-      my @hi  = ("mm3","mm4","mm5");
-      my @tmp = ("mm6","mm7");
-      my ($off1,$off2,$i) = (0,0,);
-
-      &add	($Htbl,128);			# optimize for size
-      &lea	("edi",&DWP(16+128,"esp"));
-      &lea	("ebp",&DWP(16+256+128,"esp"));
-
-      # decompose Htable (low and high parts are kept separately),
-      # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
-      for ($i=0;$i<18;$i++) {
-
-	&mov	("edx",&DWP(16*$i+8-128,$Htbl))		if ($i<16);
-	&movq	($lo[0],&QWP(16*$i+8-128,$Htbl))	if ($i<16);
-	&psllq	($tmp[1],60)				if ($i>1);
-	&movq	($hi[0],&QWP(16*$i+0-128,$Htbl))	if ($i<16);
-	&por	($lo[2],$tmp[1])			if ($i>1);
-	&movq	(&QWP($off1-128,"edi"),$lo[1])		if ($i>0 && $i<17);
-	&psrlq	($lo[1],4)				if ($i>0 && $i<17);
-	&movq	(&QWP($off1,"edi"),$hi[1])		if ($i>0 && $i<17);
-	&movq	($tmp[0],$hi[1])			if ($i>0 && $i<17);
-	&movq	(&QWP($off2-128,"ebp"),$lo[2])		if ($i>1);
-	&psrlq	($hi[1],4)				if ($i>0 && $i<17);
-	&movq	(&QWP($off2,"ebp"),$hi[2])		if ($i>1);
-	&shl	("edx",4)				if ($i<16);
-	&mov	(&BP($i,"esp"),&LB("edx"))		if ($i<16);
-
-	unshift	(@lo,pop(@lo));			# "rotate" registers
-	unshift	(@hi,pop(@hi));
-	unshift	(@tmp,pop(@tmp));
-	$off1 += 8	if ($i>0);
-	$off2 += 8	if ($i>1);
-      }
-    }
-
-    &movq	($Zhi,&QWP(0,"eax"));
-    &mov	("ebx",&DWP(8,"eax"));
-    &mov	("edx",&DWP(12,"eax"));		# load Xi
-
-&set_label("outer",16);
-  { my $nlo = "eax";
-    my $dat = "edx";
-    my @nhi = ("edi","ebp");
-    my @rem = ("ebx","ecx");
-    my @red = ("mm0","mm1","mm2");
-    my $tmp = "mm3";
-
-    &xor	($dat,&DWP(12,"ecx"));		# merge input data
-    &xor	("ebx",&DWP(8,"ecx"));
-    &pxor	($Zhi,&QWP(0,"ecx"));
-    &lea	("ecx",&DWP(16,"ecx"));		# inp+=16
-    #&mov	(&DWP(528+12,"esp"),$dat);	# save inp^Xi
-    &mov	(&DWP(528+8,"esp"),"ebx");
-    &movq	(&QWP(528+0,"esp"),$Zhi);
-    &mov	(&DWP(528+16+4,"esp"),"ecx");	# save inp
-
-    &xor	($nlo,$nlo);
-    &rol	($dat,8);
-    &mov	(&LB($nlo),&LB($dat));
-    &mov	($nhi[1],$nlo);
-    &and	(&LB($nlo),0x0f);
-    &shr	($nhi[1],4);
-    &pxor	($red[0],$red[0]);
-    &rol	($dat,8);			# next byte
-    &pxor	($red[1],$red[1]);
-    &pxor	($red[2],$red[2]);
-
-    # Just like in "May" version modulo-schedule for critical path in
-    # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
-    # is scheduled so late that rem_8bit[] has to be shifted *right*
-    # by 16, which is why last argument to pinsrw is 2, which
-    # corresponds to <<32=<<48>>16...
-    for ($j=11,$i=0;$i<15;$i++) {
-
-      if ($i>0) {
-	&pxor	($Zlo,&QWP(16,"esp",$nlo,8));		# Z^=H[nlo]
-	&rol	($dat,8);				# next byte
-	&pxor	($Zhi,&QWP(16+128,"esp",$nlo,8));
-
-	&pxor	($Zlo,$tmp);
-	&pxor	($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
-	&xor	(&LB($rem[1]),&BP(0,"esp",$nhi[0]));	# rem^(H[nhi]<<4)
-      } else {
-	&movq	($Zlo,&QWP(16,"esp",$nlo,8));
-	&movq	($Zhi,&QWP(16+128,"esp",$nlo,8));
-      }
-
-	&mov	(&LB($nlo),&LB($dat));
-	&mov	($dat,&DWP(528+$j,"esp"))		if (--$j%4==0 && $j>=0);
-
-	&movd	($rem[0],$Zlo);
-	&movz	($rem[1],&LB($rem[1]))			if ($i>0);
-	&psrlq	($Zlo,8);				# Z>>=8
-
-	&movq	($tmp,$Zhi);
-	&mov	($nhi[0],$nlo);
-	&psrlq	($Zhi,8);
-
-	&pxor	($Zlo,&QWP(16+256+0,"esp",$nhi[1],8));	# Z^=H[nhi]>>4
-	&and	(&LB($nlo),0x0f);
-	&psllq	($tmp,56);
-
-	&pxor	($Zhi,$red[1])				if ($i>1);
-	&shr	($nhi[0],4);
-	&pinsrw	($red[0],&WP(0,$rem_8bit,$rem[1],2),2)	if ($i>0);
-
-	unshift	(@red,pop(@red));			# "rotate" registers
-	unshift	(@rem,pop(@rem));
-	unshift	(@nhi,pop(@nhi));
-    }
-
-    &pxor	($Zlo,&QWP(16,"esp",$nlo,8));		# Z^=H[nlo]
-    &pxor	($Zhi,&QWP(16+128,"esp",$nlo,8));
-    &xor	(&LB($rem[1]),&BP(0,"esp",$nhi[0]));	# rem^(H[nhi]<<4)
-
-    &pxor	($Zlo,$tmp);
-    &pxor	($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
-    &movz	($rem[1],&LB($rem[1]));
-
-    &pxor	($red[2],$red[2]);			# clear 2nd word
-    &psllq	($red[1],4);
-
-    &movd	($rem[0],$Zlo);
-    &psrlq	($Zlo,4);				# Z>>=4
-
-    &movq	($tmp,$Zhi);
-    &psrlq	($Zhi,4);
-    &shl	($rem[0],4);				# rem<<4
-
-    &pxor	($Zlo,&QWP(16,"esp",$nhi[1],8));	# Z^=H[nhi]
-    &psllq	($tmp,60);
-    &movz	($rem[0],&LB($rem[0]));
-
-    &pxor	($Zlo,$tmp);
-    &pxor	($Zhi,&QWP(16+128,"esp",$nhi[1],8));
-
-    &pinsrw	($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
-    &pxor	($Zhi,$red[1]);
-
-    &movd	($dat,$Zlo);
-    &pinsrw	($red[2],&WP(0,$rem_8bit,$rem[0],2),3);	# last is <<48
-
-    &psllq	($red[0],12);				# correct by <<16>>4
-    &pxor	($Zhi,$red[0]);
-    &psrlq	($Zlo,32);
-    &pxor	($Zhi,$red[2]);
-
-    &mov	("ecx",&DWP(528+16+4,"esp"));	# restore inp
-    &movd	("ebx",$Zlo);
-    &movq	($tmp,$Zhi);			# 01234567
-    &psllw	($Zhi,8);			# 1.3.5.7.
-    &psrlw	($tmp,8);			# .0.2.4.6
-    &por	($Zhi,$tmp);			# 10325476
-    &bswap	($dat);
-    &pshufw	($Zhi,$Zhi,0b00011011);		# 76543210
-    &bswap	("ebx");
-    
-    &cmp	("ecx",&DWP(528+16+8,"esp"));	# are we done?
-    &jne	(&label("outer"));
-  }
-
-    &mov	("eax",&DWP(528+16+0,"esp"));	# restore Xi
-    &mov	(&DWP(12,"eax"),"edx");
-    &mov	(&DWP(8,"eax"),"ebx");
-    &movq	(&QWP(0,"eax"),$Zhi);
-
-    &mov	("esp",&DWP(528+16+12,"esp"));	# restore original %esp
-    &emms	();
-}
-&function_end("gcm_ghash_4bit_mmx");
-}}
-
-if ($sse2) {{
-######################################################################
-# PCLMULQDQ version.
-
-$Xip="eax";
-$Htbl="edx";
-$const="ecx";
-$inp="esi";
-$len="ebx";
-
-($Xi,$Xhi)=("xmm0","xmm1");	$Hkey="xmm2";
-($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
-($Xn,$Xhn)=("xmm6","xmm7");
-
-&static_label("bswap");
-
-sub clmul64x64_T2 {	# minimal "register" pressure
-my ($Xhi,$Xi,$Hkey)=@_;
-
-	&movdqa		($Xhi,$Xi);		#
-	&pshufd		($T1,$Xi,0b01001110);
-	&pshufd		($T2,$Hkey,0b01001110);
-	&pxor		($T1,$Xi);		#
-	&pxor		($T2,$Hkey);
-
-	&pclmulqdq	($Xi,$Hkey,0x00);	#######
-	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
-	&pclmulqdq	($T1,$T2,0x00);		#######
-	&xorps		($T1,$Xi);		#
-	&xorps		($T1,$Xhi);		#
-
-	&movdqa		($T2,$T1);		#
-	&psrldq		($T1,8);
-	&pslldq		($T2,8);		#
-	&pxor		($Xhi,$T1);
-	&pxor		($Xi,$T2);		#
-}
-
-sub clmul64x64_T3 {
-# Even though this subroutine offers visually better ILP, it
-# was empirically found to be a tad slower than above version.
-# At least in gcm_ghash_clmul context. But it's just as well,
-# because loop modulo-scheduling is possible only thanks to
-# minimized "register" pressure...
-my ($Xhi,$Xi,$Hkey)=@_;
-
-	&movdqa		($T1,$Xi);		#
-	&movdqa		($Xhi,$Xi);
-	&pclmulqdq	($Xi,$Hkey,0x00);	#######
-	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
-	&pshufd		($T2,$T1,0b01001110);	#
-	&pshufd		($T3,$Hkey,0b01001110);
-	&pxor		($T2,$T1);		#
-	&pxor		($T3,$Hkey);
-	&pclmulqdq	($T2,$T3,0x00);		#######
-	&pxor		($T2,$Xi);		#
-	&pxor		($T2,$Xhi);		#
-
-	&movdqa		($T3,$T2);		#
-	&psrldq		($T2,8);
-	&pslldq		($T3,8);		#
-	&pxor		($Xhi,$T2);
-	&pxor		($Xi,$T3);		#
-}
-
-if (1) {		# Algorithm 9 with <<1 twist.
-			# Reduction is shorter and uses only two
-			# temporary registers, which makes it better
-			# candidate for interleaving with 64x64
-			# multiplication. Pre-modulo-scheduled loop
-			# was found to be ~20% faster than Algorithm 5
-			# below. Algorithm 9 was therefore chosen for
-			# further optimization...
-
-sub reduction_alg9 {	# 17/13 times faster than Intel version
-my ($Xhi,$Xi) = @_;
-
-	# 1st phase
-	&movdqa		($T1,$Xi);		#
-	&psllq		($Xi,1);
-	&pxor		($Xi,$T1);		#
-	&psllq		($Xi,5);		#
-	&pxor		($Xi,$T1);		#
-	&psllq		($Xi,57);		#
-	&movdqa		($T2,$Xi);		#
-	&pslldq		($Xi,8);
-	&psrldq		($T2,8);		#
-	&pxor		($Xi,$T1);
-	&pxor		($Xhi,$T2);		#
-
-	# 2nd phase
-	&movdqa		($T2,$Xi);
-	&psrlq		($Xi,5);
-	&pxor		($Xi,$T2);		#
-	&psrlq		($Xi,1);		#
-	&pxor		($Xi,$T2);		#
-	&pxor		($T2,$Xhi);
-	&psrlq		($Xi,1);		#
-	&pxor		($Xi,$T2);		#
-}
-
-&function_begin_B("gcm_init_clmul");
-	&mov		($Htbl,&wparam(0));
-	&mov		($Xip,&wparam(1));
-
-	&picsetup($const);
-	&picsymbol($const, &label("bswap"), $const);
-
-	&movdqu		($Hkey,&QWP(0,$Xip));
-	&pshufd		($Hkey,$Hkey,0b01001110);# dword swap
-
-	# <<1 twist
-	&pshufd		($T2,$Hkey,0b11111111);	# broadcast uppermost dword
-	&movdqa		($T1,$Hkey);
-	&psllq		($Hkey,1);
-	&pxor		($T3,$T3);		#
-	&psrlq		($T1,63);
-	&pcmpgtd	($T3,$T2);		# broadcast carry bit
-	&pslldq		($T1,8);
-	&por		($Hkey,$T1);		# H<<=1
-
-	# magic reduction
-	&pand		($T3,&QWP(16,$const));	# 0x1c2_polynomial
-	&pxor		($Hkey,$T3);		# if(carry) H^=0x1c2_polynomial
-
-	# calculate H^2
-	&movdqa		($Xi,$Hkey);
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
-	&reduction_alg9	($Xhi,$Xi);
-
-	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H
-	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2
-
-	&ret		();
-&function_end_B("gcm_init_clmul");
-
-&function_begin_B("gcm_gmult_clmul");
-	&mov		($Xip,&wparam(0));
-	&mov		($Htbl,&wparam(1));
-
-	&picsetup($const);
-	&picsymbol($const, &label("bswap"), $const);
-
-	&movdqu		($Xi,&QWP(0,$Xip));
-	&movdqa		($T3,&QWP(0,$const));
-	&movups		($Hkey,&QWP(0,$Htbl));
-	&pshufb		($Xi,$T3);
-
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
-	&reduction_alg9	($Xhi,$Xi);
-
-	&pshufb		($Xi,$T3);
-	&movdqu		(&QWP(0,$Xip),$Xi);
-
-	&ret	();
-&function_end_B("gcm_gmult_clmul");
-
-&function_begin("gcm_ghash_clmul");
-	&mov		($Xip,&wparam(0));
-	&mov		($Htbl,&wparam(1));
-	&mov		($inp,&wparam(2));
-	&mov		($len,&wparam(3));
-
-	&picsetup($const);
-	&picsymbol($const, &label("bswap"), $const);
-
-	&movdqu		($Xi,&QWP(0,$Xip));
-	&movdqa		($T3,&QWP(0,$const));
-	&movdqu		($Hkey,&QWP(0,$Htbl));
-	&pshufb		($Xi,$T3);
-
-	&sub		($len,0x10);
-	&jz		(&label("odd_tail"));
-
-	#######
-	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
-	#	[(H*Ii+1) + (H*Xi+1)] mod P =
-	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
-	#
-	&movdqu		($T1,&QWP(0,$inp));	# Ii
-	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
-	&pshufb		($T1,$T3);
-	&pshufb		($Xn,$T3);
-	&pxor		($Xi,$T1);		# Ii+Xi
-
-	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
-	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
-
-	&lea		($inp,&DWP(32,$inp));	# i+=2
-	&sub		($len,0x20);
-	&jbe		(&label("even_tail"));
-
-&set_label("mod_loop");
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
-	&movdqu		($T1,&QWP(0,$inp));	# Ii
-	&movups		($Hkey,&QWP(0,$Htbl));	# load H
-
-	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
-	&pxor		($Xhi,$Xhn);
-
-	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
-	&pshufb		($T1,$T3);
-	&pshufb		($Xn,$T3);
-
-	&movdqa		($T3,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1
-	&movdqa		($Xhn,$Xn);
-	 &pxor		($Xhi,$T1);		# "Ii+Xi", consume early
-
-	  &movdqa	($T1,$Xi);		#&reduction_alg9($Xhi,$Xi); 1st phase
-	  &psllq	($Xi,1);
-	  &pxor		($Xi,$T1);		#
-	  &psllq	($Xi,5);		#
-	  &pxor		($Xi,$T1);		#
-	&pclmulqdq	($Xn,$Hkey,0x00);	#######
-	  &psllq	($Xi,57);		#
-	  &movdqa	($T2,$Xi);		#
-	  &pslldq	($Xi,8);
-	  &psrldq	($T2,8);		#	
-	  &pxor		($Xi,$T1);
-	&pshufd		($T1,$T3,0b01001110);
-	  &pxor		($Xhi,$T2);		#
-	&pxor		($T1,$T3);
-	&pshufd		($T3,$Hkey,0b01001110);
-	&pxor		($T3,$Hkey);		#
-
-	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
-	  &movdqa	($T2,$Xi);		# 2nd phase
-	  &psrlq	($Xi,5);
-	  &pxor		($Xi,$T2);		#
-	  &psrlq	($Xi,1);		#
-	  &pxor		($Xi,$T2);		#
-	  &pxor		($T2,$Xhi);
-	  &psrlq	($Xi,1);		#
-	  &pxor		($Xi,$T2);		#
-
-	&pclmulqdq	($T1,$T3,0x00);		#######
-	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
-	&xorps		($T1,$Xn);		#
-	&xorps		($T1,$Xhn);		#
-
-	&movdqa		($T3,$T1);		#
-	&psrldq		($T1,8);
-	&pslldq		($T3,8);		#
-	&pxor		($Xhn,$T1);
-	&pxor		($Xn,$T3);		#
-	&movdqa		($T3,&QWP(0,$const));
-
-	&lea		($inp,&DWP(32,$inp));
-	&sub		($len,0x20);
-	&ja		(&label("mod_loop"));
-
-&set_label("even_tail");
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
-
-	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
-	&pxor		($Xhi,$Xhn);
-
-	&reduction_alg9	($Xhi,$Xi);
-
-	&test		($len,$len);
-	&jnz		(&label("done"));
-
-	&movups		($Hkey,&QWP(0,$Htbl));	# load H
-&set_label("odd_tail");
-	&movdqu		($T1,&QWP(0,$inp));	# Ii
-	&pshufb		($T1,$T3);
-	&pxor		($Xi,$T1);		# Ii+Xi
-
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
-	&reduction_alg9	($Xhi,$Xi);
-
-&set_label("done");
-	&pshufb		($Xi,$T3);
-	&movdqu		(&QWP(0,$Xip),$Xi);
-&function_end("gcm_ghash_clmul");
-
-} else {		# Algorithm 5. Kept for reference purposes.
-
-sub reduction_alg5 {	# 19/16 times faster than Intel version
-my ($Xhi,$Xi)=@_;
-
-	# <<1
-	&movdqa		($T1,$Xi);		#
-	&movdqa		($T2,$Xhi);
-	&pslld		($Xi,1);
-	&pslld		($Xhi,1);		#
-	&psrld		($T1,31);
-	&psrld		($T2,31);		#
-	&movdqa		($T3,$T1);
-	&pslldq		($T1,4);
-	&psrldq		($T3,12);		#
-	&pslldq		($T2,4);
-	&por		($Xhi,$T3);		#
-	&por		($Xi,$T1);
-	&por		($Xhi,$T2);		#
-
-	# 1st phase
-	&movdqa		($T1,$Xi);
-	&movdqa		($T2,$Xi);
-	&movdqa		($T3,$Xi);		#
-	&pslld		($T1,31);
-	&pslld		($T2,30);
-	&pslld		($Xi,25);		#
-	&pxor		($T1,$T2);
-	&pxor		($T1,$Xi);		#
-	&movdqa		($T2,$T1);		#
-	&pslldq		($T1,12);
-	&psrldq		($T2,4);		#
-	&pxor		($T3,$T1);
-
-	# 2nd phase
-	&pxor		($Xhi,$T3);		#
-	&movdqa		($Xi,$T3);
-	&movdqa		($T1,$T3);
-	&psrld		($Xi,1);		#
-	&psrld		($T1,2);
-	&psrld		($T3,7);		#
-	&pxor		($Xi,$T1);
-	&pxor		($Xhi,$T2);
-	&pxor		($Xi,$T3);		#
-	&pxor		($Xi,$Xhi);		#
-}
-
-&function_begin_B("gcm_init_clmul");
-	&mov		($Htbl,&wparam(0));
-	&mov		($Xip,&wparam(1));
-
-	&picsetup($const);
-	&picsymbol($const, &label("bswap"), $const);
-
-	&movdqu		($Hkey,&QWP(0,$Xip));
-	&pshufd		($Hkey,$Hkey,0b01001110);# dword swap
-
-	# calculate H^2
-	&movdqa		($Xi,$Hkey);
-	&clmul64x64_T3	($Xhi,$Xi,$Hkey);
-	&reduction_alg5	($Xhi,$Xi);
-
-	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H
-	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2
-
-	&ret		();
-&function_end_B("gcm_init_clmul");
-
-&function_begin_B("gcm_gmult_clmul");
-	&mov		($Xip,&wparam(0));
-	&mov		($Htbl,&wparam(1));
-
-	&picsetup($const);
-	&picsymbol($const, &label("bswap"), $const);
-
-	&movdqu		($Xi,&QWP(0,$Xip));
-	&movdqa		($Xn,&QWP(0,$const));
-	&movdqu		($Hkey,&QWP(0,$Htbl));
-	&pshufb		($Xi,$Xn);
-
-	&clmul64x64_T3	($Xhi,$Xi,$Hkey);
-	&reduction_alg5	($Xhi,$Xi);
-
-	&pshufb		($Xi,$Xn);
-	&movdqu		(&QWP(0,$Xip),$Xi);
-
-	&ret	();
-&function_end_B("gcm_gmult_clmul");
-
-&function_begin("gcm_ghash_clmul");
-	&mov		($Xip,&wparam(0));
-	&mov		($Htbl,&wparam(1));
-	&mov		($inp,&wparam(2));
-	&mov		($len,&wparam(3));
-
-	&picsetup($const);
-	&picsymbol($const, &label("bswap"), $const);
-
-	&movdqu		($Xi,&QWP(0,$Xip));
-	&movdqa		($T3,&QWP(0,$const));
-	&movdqu		($Hkey,&QWP(0,$Htbl));
-	&pshufb		($Xi,$T3);
-
-	&sub		($len,0x10);
-	&jz		(&label("odd_tail"));
-
-	#######
-	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
-	#	[(H*Ii+1) + (H*Xi+1)] mod P =
-	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
-	#
-	&movdqu		($T1,&QWP(0,$inp));	# Ii
-	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
-	&pshufb		($T1,$T3);
-	&pshufb		($Xn,$T3);
-	&pxor		($Xi,$T1);		# Ii+Xi
-
-	&clmul64x64_T3	($Xhn,$Xn,$Hkey);	# H*Ii+1
-	&movdqu		($Hkey,&QWP(16,$Htbl));	# load H^2
-
-	&sub		($len,0x20);
-	&lea		($inp,&DWP(32,$inp));	# i+=2
-	&jbe		(&label("even_tail"));
-
-&set_label("mod_loop");
-	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
-	&movdqu		($Hkey,&QWP(0,$Htbl));	# load H
-
-	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
-	&pxor		($Xhi,$Xhn);
-
-	&reduction_alg5	($Xhi,$Xi);
-
-	#######
-	&movdqa		($T3,&QWP(0,$const));
-	&movdqu		($T1,&QWP(0,$inp));	# Ii
-	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
-	&pshufb		($T1,$T3);
-	&pshufb		($Xn,$T3);
-	&pxor		($Xi,$T1);		# Ii+Xi
-
-	&clmul64x64_T3	($Xhn,$Xn,$Hkey);	# H*Ii+1
-	&movdqu		($Hkey,&QWP(16,$Htbl));	# load H^2
-
-	&sub		($len,0x20);
-	&lea		($inp,&DWP(32,$inp));
-	&ja		(&label("mod_loop"));
-
-&set_label("even_tail");
-	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
-
-	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
-	&pxor		($Xhi,$Xhn);
-
-	&reduction_alg5	($Xhi,$Xi);
-
-	&movdqa		($T3,&QWP(0,$const));
-	&test		($len,$len);
-	&jnz		(&label("done"));
-
-	&movdqu		($Hkey,&QWP(0,$Htbl));	# load H
-&set_label("odd_tail");
-	&movdqu		($T1,&QWP(0,$inp));	# Ii
-	&pshufb		($T1,$T3);
-	&pxor		($Xi,$T1);		# Ii+Xi
-
-	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
-	&reduction_alg5	($Xhi,$Xi);
-
-	&movdqa		($T3,&QWP(0,$const));
-&set_label("done");
-	&pshufb		($Xi,$T3);
-	&movdqu		(&QWP(0,$Xip),$Xi);
-&function_end("gcm_ghash_clmul");
-
-}
-
-	&rodataseg();
-&set_label("bswap",64);
-	&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
-	&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2);	# 0x1c2_polynomial
-	&previous();
-}}	# $sse2
-
-	&rodataseg();
-&set_label("rem_4bit",64);
-	&data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
-	&data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
-	&data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
-	&data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
-&set_label("rem_8bit",64);
-	&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
-	&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
-	&data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
-	&data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
-	&data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
-	&data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
-	&data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
-	&data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
-	&data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
-	&data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
-	&data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
-	&data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
-	&data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
-	&data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
-	&data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
-	&data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
-	&data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
-	&data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
-	&data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
-	&data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
-	&data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
-	&data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
-	&data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
-	&data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
-	&data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
-	&data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
-	&data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
-	&data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
-	&data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
-	&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
-	&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
-	&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
-	&previous();
-}}}	# !$x86only
-
-&asm_finish();
-
-# A question was raised about choice of vanilla MMX. Or rather why wasn't
-# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
-# CPUs such as PIII, "4-bit" MMX version was observed to provide better
-# performance than *corresponding* SSE2 one even on contemporary CPUs.
-# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
-# implementation featuring full range of lookup-table sizes, but with
-# per-invocation lookup table setup. Latter means that table size is
-# chosen depending on how much data is to be hashed in every given call,
-# more data - larger table. Best reported result for Core2 is ~4 cycles
-# per processed byte out of 64KB block. This number accounts even for
-# 64KB table setup overhead. As discussed in gcm128.c we choose to be
-# more conservative in respect to lookup table sizes, but how do the
-# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
-# on same platform. As also discussed in gcm128.c, next in line "8-bit
-# Shoup's" or "4KB" method should deliver twice the performance of
-# "256B" one, in other words not worse than ~6 cycles per byte. It
-# should also be noted that in SSE2 case improvement can be "super-
-# linear," i.e. more than twice, mostly because >>8 maps to single
-# instruction on SSE2 register. This is unlike "4-bit" case when >>4
-# maps to same amount of instructions in both MMX and SSE2 cases.
-# Bottom line is that switch to SSE2 is considered to be justifiable
-# only in case we choose to implement "8-bit" method...
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
deleted file mode 100644
index bf547a041b..0000000000
--- a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
+++ /dev/null
@@ -1,812 +0,0 @@
-#!/usr/bin/env perl
-#
-# ====================================================================
-# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
-# project. The module is, however, dual licensed under OpenSSL and
-# CRYPTOGAMS licenses depending on where you obtain it. For further
-# details see http://www.openssl.org/~appro/cryptogams/.
-# ====================================================================
-#
-# March, June 2010
-#
-# The module implements "4-bit" GCM GHASH function and underlying
-# single multiplication operation in GF(2^128). "4-bit" means that
-# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
-# function features a so-called "528B" variant utilizing additional
-# 256+16 bytes of per-key storage [+512 bytes shared table].
-# Performance results are for this streamed GHASH subroutine and are
-# expressed in cycles per processed byte, less is better:
-#
-#		gcc 3.4.x(*)	assembler
-#
-# P4		28.6		14.0		+100%
-# Opteron	19.3		7.7		+150%
-# Core2		17.8		8.1(**)		+120%
-#
-# (*)	comparison is not completely fair, because C results are
-#	for vanilla "256B" implementation, while assembler results
-#	are for "528B";-)
-# (**)	it's a mystery [to me] why the Core2 result is not the same
-#	as for Opteron;
-
-# May 2010
-#
-# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
-# See ghash-x86.pl for background information and details about coding
-# techniques.
-#
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
-
-$flavour = shift;
-$output  = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
-$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-open OUT,"| \"$^X\" $xlate $flavour $output";
-*STDOUT=*OUT;
-
-# common register layout
-$nlo="%rax";
-$nhi="%rbx";
-$Zlo="%r8";
-$Zhi="%r9";
-$tmp="%r10";
-$rem_4bit = "%r11";
-
-$Xi="%rdi";
-$Htbl="%rsi";
-
-# per-function register layout
-$cnt="%rcx";
-$rem="%rdx";
-
-sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/	or
-			$r =~ s/%[er]([sd]i)/%\1l/	or
-			$r =~ s/%[er](bp)/%\1l/		or
-			$r =~ s/%(r[0-9]+)[d]?/%\1b/;   $r; }
-
-sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
-  my $arg = pop;
-    $arg = "\$$arg" if ($arg*1 eq $arg);
-    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
-}
-
-{ my $N;
-  sub loop() {
-  my $inp = shift;
-
-	$N++;
-$code.=<<___;
-	xor	$nlo,$nlo
-	xor	$nhi,$nhi
-	mov	`&LB("$Zlo")`,`&LB("$nlo")`
-	mov	`&LB("$Zlo")`,`&LB("$nhi")`
-	shl	\$4,`&LB("$nlo")`
-	mov	\$14,$cnt
-	mov	8($Htbl,$nlo),$Zlo
-	mov	($Htbl,$nlo),$Zhi
-	and	\$0xf0,`&LB("$nhi")`
-	mov	$Zlo,$rem
-	jmp	.Loop$N
-
-.align	16
-.Loop$N:
-	shr	\$4,$Zlo
-	and	\$0xf,$rem
-	mov	$Zhi,$tmp
-	mov	($inp,$cnt),`&LB("$nlo")`
-	shr	\$4,$Zhi
-	xor	8($Htbl,$nhi),$Zlo
-	shl	\$60,$tmp
-	xor	($Htbl,$nhi),$Zhi
-	mov	`&LB("$nlo")`,`&LB("$nhi")`
-	xor	($rem_4bit,$rem,8),$Zhi
-	mov	$Zlo,$rem
-	shl	\$4,`&LB("$nlo")`
-	xor	$tmp,$Zlo
-	dec	$cnt
-	js	.Lbreak$N
-
-	shr	\$4,$Zlo
-	and	\$0xf,$rem
-	mov	$Zhi,$tmp
-	shr	\$4,$Zhi
-	xor	8($Htbl,$nlo),$Zlo
-	shl	\$60,$tmp
-	xor	($Htbl,$nlo),$Zhi
-	and	\$0xf0,`&LB("$nhi")`
-	xor	($rem_4bit,$rem,8),$Zhi
-	mov	$Zlo,$rem
-	xor	$tmp,$Zlo
-	jmp	.Loop$N
-
-.align	16
-.Lbreak$N:
-	shr	\$4,$Zlo
-	and	\$0xf,$rem
-	mov	$Zhi,$tmp
-	shr	\$4,$Zhi
-	xor	8($Htbl,$nlo),$Zlo
-	shl	\$60,$tmp
-	xor	($Htbl,$nlo),$Zhi
-	and	\$0xf0,`&LB("$nhi")`
-	xor	($rem_4bit,$rem,8),$Zhi
-	mov	$Zlo,$rem
-	xor	$tmp,$Zlo
-
-	shr	\$4,$Zlo
-	and	\$0xf,$rem
-	mov	$Zhi,$tmp
-	shr	\$4,$Zhi
-	xor	8($Htbl,$nhi),$Zlo
-	shl	\$60,$tmp
-	xor	($Htbl,$nhi),$Zhi
-	xor	$tmp,$Zlo
-	xor	($rem_4bit,$rem,8),$Zhi
-
-	bswap	$Zlo
-	bswap	$Zhi
-___
-}}
-
-$code=<<___;
-.text
-
-.globl	gcm_gmult_4bit
-.type	gcm_gmult_4bit,\@function,2
-.align	16
-gcm_gmult_4bit:
-	_CET_ENDBR
-	push	%rbx
-	push	%rbp		# %rbp and %r12 are pushed exclusively in
-	push	%r12		# order to reuse Win64 exception handler...
-.Lgmult_prologue:
-
-	movzb	15($Xi),$Zlo
-	lea	.Lrem_4bit(%rip),$rem_4bit
-___
-	&loop	($Xi);
-$code.=<<___;
-	mov	$Zlo,8($Xi)
-	mov	$Zhi,($Xi)
-
-	mov	16(%rsp),%rbx
-	lea	24(%rsp),%rsp
-.Lgmult_epilogue:
-	ret
-.size	gcm_gmult_4bit,.-gcm_gmult_4bit
-___
-
-# per-function register layout
-$inp="%rdx";
-$len="%rcx";
-$rem_8bit=$rem_4bit;
-
-$code.=<<___;
-.globl	gcm_ghash_4bit
-.type	gcm_ghash_4bit,\@function,4
-.align	16
-gcm_ghash_4bit:
-	_CET_ENDBR
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	sub	\$280,%rsp
-.Lghash_prologue:
-	mov	$inp,%r14		# reassign couple of args
-	mov	$len,%r15
-___
-{ my $inp="%r14";
-  my $dat="%edx";
-  my $len="%r15";
-  my @nhi=("%ebx","%ecx");
-  my @rem=("%r12","%r13");
-  my $Hshr4="%rbp";
-
-	&sub	($Htbl,-128);		# size optimization
-	&lea	($Hshr4,"16+128(%rsp)");
-	{ my @lo =($nlo,$nhi);
-          my @hi =($Zlo,$Zhi);
-
-	  &xor	($dat,$dat);
-	  for ($i=0,$j=-2;$i<18;$i++,$j++) {
-	    &mov	("$j(%rsp)",&LB($dat))		if ($i>1);
-	    &or		($lo[0],$tmp)			if ($i>1);
-	    &mov	(&LB($dat),&LB($lo[1]))		if ($i>0 && $i<17);
-	    &shr	($lo[1],4)			if ($i>0 && $i<17);
-	    &mov	($tmp,$hi[1])			if ($i>0 && $i<17);
-	    &shr	($hi[1],4)			if ($i>0 && $i<17);
-	    &mov	("8*$j($Hshr4)",$hi[0])		if ($i>1);
-	    &mov	($hi[0],"16*$i+0-128($Htbl)")	if ($i<16);
-	    &shl	(&LB($dat),4)			if ($i>0 && $i<17);
-	    &mov	("8*$j-128($Hshr4)",$lo[0])	if ($i>1);
-	    &mov	($lo[0],"16*$i+8-128($Htbl)")	if ($i<16);
-	    &shl	($tmp,60)			if ($i>0 && $i<17);
-
-	    push	(@lo,shift(@lo));
-	    push	(@hi,shift(@hi));
-	  }
-	}
-	&add	($Htbl,-128);
-	&mov	($Zlo,"8($Xi)");
-	&mov	($Zhi,"0($Xi)");
-	&add	($len,$inp);		# pointer to the end of data
-	&lea	($rem_8bit,".Lrem_8bit(%rip)");
-	&jmp	(".Louter_loop");
-
-$code.=".align	16\n.Louter_loop:\n";
-	&xor	($Zhi,"($inp)");
-	&mov	("%rdx","8($inp)");
-	&lea	($inp,"16($inp)");
-	&xor	("%rdx",$Zlo);
-	&mov	("($Xi)",$Zhi);
-	&mov	("8($Xi)","%rdx");
-	&shr	("%rdx",32);
-
-	&xor	($nlo,$nlo);
-	&rol	($dat,8);
-	&mov	(&LB($nlo),&LB($dat));
-	&movz	($nhi[0],&LB($dat));
-	&shl	(&LB($nlo),4);
-	&shr	($nhi[0],4);
-
-	for ($j=11,$i=0;$i<15;$i++) {
-	    &rol	($dat,8);
-	    &xor	($Zlo,"8($Htbl,$nlo)")			if ($i>0);
-	    &xor	($Zhi,"($Htbl,$nlo)")			if ($i>0);
-	    &mov	($Zlo,"8($Htbl,$nlo)")			if ($i==0);
-	    &mov	($Zhi,"($Htbl,$nlo)")			if ($i==0);
-
-	    &mov	(&LB($nlo),&LB($dat));
-	    &xor	($Zlo,$tmp)				if ($i>0);
-	    &movzw	($rem[1],"($rem_8bit,$rem[1],2)")	if ($i>0);
-
-	    &movz	($nhi[1],&LB($dat));
-	    &shl	(&LB($nlo),4);
-	    &movzb	($rem[0],"(%rsp,$nhi[0])");
-
-	    &shr	($nhi[1],4)				if ($i<14);
-	    &and	($nhi[1],0xf0)				if ($i==14);
-	    &shl	($rem[1],48)				if ($i>0);
-	    &xor	($rem[0],$Zlo);
-
-	    &mov	($tmp,$Zhi);
-	    &xor	($Zhi,$rem[1])				if ($i>0);
-	    &shr	($Zlo,8);
-
-	    &movz	($rem[0],&LB($rem[0]));
-	    &mov	($dat,"$j($Xi)")			if (--$j%4==0 && $j>=0);
-	    &shr	($Zhi,8);
-
-	    &xor	($Zlo,"-128($Hshr4,$nhi[0],8)");
-	    &shl	($tmp,56);
-	    &xor	($Zhi,"($Hshr4,$nhi[0],8)");
-
-	    unshift	(@nhi,pop(@nhi));		# "rotate" registers
-	    unshift	(@rem,pop(@rem));
-	}
-	&movzw	($rem[1],"($rem_8bit,$rem[1],2)");
-	&xor	($Zlo,"8($Htbl,$nlo)");
-	&xor	($Zhi,"($Htbl,$nlo)");
-
-	&shl	($rem[1],48);
-	&xor	($Zlo,$tmp);
-
-	&xor	($Zhi,$rem[1]);
-	&movz	($rem[0],&LB($Zlo));
-	&shr	($Zlo,4);
-
-	&mov	($tmp,$Zhi);
-	&shl	(&LB($rem[0]),4);
-	&shr	($Zhi,4);
-
-	&xor	($Zlo,"8($Htbl,$nhi[0])");
-	&movzw	($rem[0],"($rem_8bit,$rem[0],2)");
-	&shl	($tmp,60);
-
-	&xor	($Zhi,"($Htbl,$nhi[0])");
-	&xor	($Zlo,$tmp);
-	&shl	($rem[0],48);
-
-	&bswap	($Zlo);
-	&xor	($Zhi,$rem[0]);
-
-	&bswap	($Zhi);
-	&cmp	($inp,$len);
-	&jb	(".Louter_loop");
-}
-$code.=<<___;
-	mov	$Zlo,8($Xi)
-	mov	$Zhi,($Xi)
-
-	lea	280(%rsp),%rsi
-	mov	0(%rsi),%r15
-	mov	8(%rsi),%r14
-	mov	16(%rsi),%r13
-	mov	24(%rsi),%r12
-	mov	32(%rsi),%rbp
-	mov	40(%rsi),%rbx
-	lea	48(%rsi),%rsp
-.Lghash_epilogue:
-	ret
-.size	gcm_ghash_4bit,.-gcm_ghash_4bit
-___
-
-######################################################################
-# PCLMULQDQ version.
-
-@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
-		("%rdi","%rsi","%rdx","%rcx");	# Unix order
-
-($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
-($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
-
-sub clmul64x64_T2 {	# minimal register pressure
-my ($Xhi,$Xi,$Hkey,$modulo)=@_;
-
-$code.=<<___ if (!defined($modulo));
-	movdqa		$Xi,$Xhi		#
-	pshufd		\$0b01001110,$Xi,$T1
-	pshufd		\$0b01001110,$Hkey,$T2
-	pxor		$Xi,$T1			#
-	pxor		$Hkey,$T2
-___
-$code.=<<___;
-	pclmulqdq	\$0x00,$Hkey,$Xi	#######
-	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
-	pclmulqdq	\$0x00,$T2,$T1		#######
-	pxor		$Xi,$T1			#
-	pxor		$Xhi,$T1		#
-
-	movdqa		$T1,$T2			#
-	psrldq		\$8,$T1
-	pslldq		\$8,$T2			#
-	pxor		$T1,$Xhi
-	pxor		$T2,$Xi			#
-___
-}
-
-sub reduction_alg9 {	# 17/13 times faster than Intel version
-my ($Xhi,$Xi) = @_;
-
-$code.=<<___;
-	# 1st phase
-	movdqa		$Xi,$T1			#
-	psllq		\$1,$Xi
-	pxor		$T1,$Xi			#
-	psllq		\$5,$Xi			#
-	pxor		$T1,$Xi			#
-	psllq		\$57,$Xi		#
-	movdqa		$Xi,$T2			#
-	pslldq		\$8,$Xi
-	psrldq		\$8,$T2			#	
-	pxor		$T1,$Xi
-	pxor		$T2,$Xhi		#
-
-	# 2nd phase
-	movdqa		$Xi,$T2
-	psrlq		\$5,$Xi
-	pxor		$T2,$Xi			#
-	psrlq		\$1,$Xi			#
-	pxor		$T2,$Xi			#
-	pxor		$Xhi,$T2
-	psrlq		\$1,$Xi			#
-	pxor		$T2,$Xi			#
-___
-}
-
-{ my ($Htbl,$Xip)=@_4args;
-
-$code.=<<___;
-.globl	gcm_init_clmul
-.type	gcm_init_clmul,\@abi-omnipotent
-.align	16
-gcm_init_clmul:
-	_CET_ENDBR
-	movdqu		($Xip),$Hkey
-	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap
-
-	# <<1 twist
-	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
-	movdqa		$Hkey,$T1
-	psllq		\$1,$Hkey
-	pxor		$T3,$T3			#
-	psrlq		\$63,$T1
-	pcmpgtd		$T2,$T3			# broadcast carry bit
-	pslldq		\$8,$T1
-	por		$T1,$Hkey		# H<<=1
-
-	# magic reduction
-	pand		.L0x1c2_polynomial(%rip),$T3
-	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial
-
-	# calculate H^2
-	movdqa		$Hkey,$Xi
-___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
-	&reduction_alg9	($Xhi,$Xi);
-$code.=<<___;
-	movdqu		$Hkey,($Htbl)		# save H
-	movdqu		$Xi,16($Htbl)		# save H^2
-	ret
-.size	gcm_init_clmul,.-gcm_init_clmul
-___
-}
-
-{ my ($Xip,$Htbl)=@_4args;
-
-$code.=<<___;
-.globl	gcm_gmult_clmul
-.type	gcm_gmult_clmul,\@abi-omnipotent
-.align	16
-gcm_gmult_clmul:
-	_CET_ENDBR
-	movdqu		($Xip),$Xi
-	movdqa		.Lbswap_mask(%rip),$T3
-	movdqu		($Htbl),$Hkey
-	pshufb		$T3,$Xi
-___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
-	&reduction_alg9	($Xhi,$Xi);
-$code.=<<___;
-	pshufb		$T3,$Xi
-	movdqu		$Xi,($Xip)
-	ret
-.size	gcm_gmult_clmul,.-gcm_gmult_clmul
-___
-}
-
-{ my ($Xip,$Htbl,$inp,$len)=@_4args;
-  my $Xn="%xmm6";
-  my $Xhn="%xmm7";
-  my $Hkey2="%xmm8";
-  my $T1n="%xmm9";
-  my $T2n="%xmm10";
-
-$code.=<<___;
-.globl	gcm_ghash_clmul
-.type	gcm_ghash_clmul,\@abi-omnipotent
-.align	16
-gcm_ghash_clmul:
-	_CET_ENDBR
-___
-$code.=<<___ if ($win64);
-.LSEH_begin_gcm_ghash_clmul:
-	# I can't trust assembler to use specific encoding:-(
-	.byte	0x48,0x83,0xec,0x58		#sub	\$0x58,%rsp
-	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
-	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
-	.byte	0x44,0x0f,0x29,0x44,0x24,0x20	#movaps	%xmm8,0x20(%rsp)
-	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30	#movaps	%xmm9,0x30(%rsp)
-	.byte	0x44,0x0f,0x29,0x54,0x24,0x40	#movaps	%xmm10,0x40(%rsp)
-___
-$code.=<<___;
-	movdqa		.Lbswap_mask(%rip),$T3
-
-	movdqu		($Xip),$Xi
-	movdqu		($Htbl),$Hkey
-	pshufb		$T3,$Xi
-
-	sub		\$0x10,$len
-	jz		.Lodd_tail
-
-	movdqu		16($Htbl),$Hkey2
-	#######
-	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
-	#	[(H*Ii+1) + (H*Xi+1)] mod P =
-	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
-	#
-	movdqu		($inp),$T1		# Ii
-	movdqu		16($inp),$Xn		# Ii+1
-	pshufb		$T3,$T1
-	pshufb		$T3,$Xn
-	pxor		$T1,$Xi			# Ii+Xi
-___
-	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
-$code.=<<___;
-	movdqa		$Xi,$Xhi		#
-	pshufd		\$0b01001110,$Xi,$T1
-	pshufd		\$0b01001110,$Hkey2,$T2
-	pxor		$Xi,$T1			#
-	pxor		$Hkey2,$T2
-
-	lea		32($inp),$inp		# i+=2
-	sub		\$0x20,$len
-	jbe		.Leven_tail
-
-.Lmod_loop:
-___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
-$code.=<<___;
-	movdqu		($inp),$T1		# Ii
-	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
-	pxor		$Xhn,$Xhi
-
-	movdqu		16($inp),$Xn		# Ii+1
-	pshufb		$T3,$T1
-	pshufb		$T3,$Xn
-
-	movdqa		$Xn,$Xhn		#
-	pshufd		\$0b01001110,$Xn,$T1n
-	pshufd		\$0b01001110,$Hkey,$T2n
-	pxor		$Xn,$T1n		#
-	pxor		$Hkey,$T2n
-	 pxor		$T1,$Xhi		# "Ii+Xi", consume early
-
-	  movdqa	$Xi,$T1			# 1st phase
-	  psllq		\$1,$Xi
-	  pxor		$T1,$Xi			#
-	  psllq		\$5,$Xi			#
-	  pxor		$T1,$Xi			#
-	pclmulqdq	\$0x00,$Hkey,$Xn	#######
-	  psllq		\$57,$Xi		#
-	  movdqa	$Xi,$T2			#
-	  pslldq	\$8,$Xi
-	  psrldq	\$8,$T2			#	
-	  pxor		$T1,$Xi
-	  pxor		$T2,$Xhi		#
-
-	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
-	  movdqa	$Xi,$T2			# 2nd phase
-	  psrlq		\$5,$Xi
-	  pxor		$T2,$Xi			#
-	  psrlq		\$1,$Xi			#
-	  pxor		$T2,$Xi			#
-	  pxor		$Xhi,$T2
-	  psrlq		\$1,$Xi			#
-	  pxor		$T2,$Xi			#
-
-	pclmulqdq	\$0x00,$T2n,$T1n	#######
-	 movdqa		$Xi,$Xhi		#
-	 pshufd		\$0b01001110,$Xi,$T1
-	 pshufd		\$0b01001110,$Hkey2,$T2
-	 pxor		$Xi,$T1			#
-	 pxor		$Hkey2,$T2
-
-	pxor		$Xn,$T1n		#
-	pxor		$Xhn,$T1n		#
-	movdqa		$T1n,$T2n		#
-	psrldq		\$8,$T1n
-	pslldq		\$8,$T2n		#
-	pxor		$T1n,$Xhn
-	pxor		$T2n,$Xn		#
-
-	lea		32($inp),$inp
-	sub		\$0x20,$len
-	ja		.Lmod_loop
-
-.Leven_tail:
-___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
-$code.=<<___;
-	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
-	pxor		$Xhn,$Xhi
-___
-	&reduction_alg9	($Xhi,$Xi);
-$code.=<<___;
-	test		$len,$len
-	jnz		.Ldone
-
-.Lodd_tail:
-	movdqu		($inp),$T1		# Ii
-	pshufb		$T3,$T1
-	pxor		$T1,$Xi			# Ii+Xi
-___
-	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
-	&reduction_alg9	($Xhi,$Xi);
-$code.=<<___;
-.Ldone:
-	pshufb		$T3,$Xi
-	movdqu		$Xi,($Xip)
-___
-$code.=<<___ if ($win64);
-	movaps	(%rsp),%xmm6
-	movaps	0x10(%rsp),%xmm7
-	movaps	0x20(%rsp),%xmm8
-	movaps	0x30(%rsp),%xmm9
-	movaps	0x40(%rsp),%xmm10
-	add	\$0x58,%rsp
-___
-$code.=<<___;
-	ret
-.LSEH_end_gcm_ghash_clmul:
-.size	gcm_ghash_clmul,.-gcm_ghash_clmul
-___
-}
-
-$code.=<<___;
-.section .rodata
-.align	64
-.Lbswap_mask:
-	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
-.L0x1c2_polynomial:
-	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
-.align	64
-.type	.Lrem_4bit,\@object
-.Lrem_4bit:
-	.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
-	.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
-	.long	0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
-	.long	0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
-.type	.Lrem_8bit,\@object
-.Lrem_8bit:
-	.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
-	.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
-	.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
-	.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
-	.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
-	.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
-	.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
-	.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
-	.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
-	.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
-	.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
-	.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
-	.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
-	.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
-	.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
-	.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
-	.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
-	.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
-	.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
-	.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
-	.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
-	.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
-	.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
-	.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
-	.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
-	.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
-	.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
-	.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
-	.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
-	.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
-	.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
-	.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
-.align	64
-.text
-___
-
-# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
-#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
-$rec="%rcx";
-$frame="%rdx";
-$context="%r8";
-$disp="%r9";
-
-$code.=<<___;
-.extern	__imp_RtlVirtualUnwind
-.type	se_handler,\@abi-omnipotent
-.align	16
-se_handler:
-	_CET_ENDBR
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	120($context),%rax	# pull context->Rax
-	mov	248($context),%rbx	# pull context->Rip
-
-	mov	8($disp),%rsi		# disp->ImageBase
-	mov	56($disp),%r11		# disp->HandlerData
-
-	mov	0(%r11),%r10d		# HandlerData[0]
-	lea	(%rsi,%r10),%r10	# prologue label
-	cmp	%r10,%rbx		# context->Rip<prologue label
-	jb	.Lin_prologue
-
-	mov	152($context),%rax	# pull context->Rsp
-
-	mov	4(%r11),%r10d		# HandlerData[1]
-	lea	(%rsi,%r10),%r10	# epilogue label
-	cmp	%r10,%rbx		# context->Rip>=epilogue label
-	jae	.Lin_prologue
-
-	lea	24(%rax),%rax		# adjust "rsp"
-
-	mov	-8(%rax),%rbx
-	mov	-16(%rax),%rbp
-	mov	-24(%rax),%r12
-	mov	%rbx,144($context)	# restore context->Rbx
-	mov	%rbp,160($context)	# restore context->Rbp
-	mov	%r12,216($context)	# restore context->R12
-
-.Lin_prologue:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
-	mov	40($disp),%rdi		# disp->ContextRecord
-	mov	$context,%rsi		# context
-	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
-	.long	0xa548f3fc		# cld; rep movsq
-
-	mov	$disp,%rsi
-	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
-	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
-	mov	0(%rsi),%r8		# arg3, disp->ControlPc
-	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
-	mov	40(%rsi),%r10		# disp->ContextRecord
-	lea	56(%rsi),%r11		# &disp->HandlerData
-	lea	24(%rsi),%r12		# &disp->EstablisherFrame
-	mov	%r10,32(%rsp)		# arg5
-	mov	%r11,40(%rsp)		# arg6
-	mov	%r12,48(%rsp)		# arg7
-	mov	%rcx,56(%rsp)		# arg8, (NULL)
-	call	*__imp_RtlVirtualUnwind(%rip)
-
-	mov	\$1,%eax		# ExceptionContinueSearch
-	add	\$64,%rsp
-	popfq
-	pop	%r15
-	pop	%r14
-	pop	%r13
-	pop	%r12
-	pop	%rbp
-	pop	%rbx
-	pop	%rdi
-	pop	%rsi
-	ret
-.size	se_handler,.-se_handler
-
-.section	.pdata
-.align	4
-	.rva	.LSEH_begin_gcm_gmult_4bit
-	.rva	.LSEH_end_gcm_gmult_4bit
-	.rva	.LSEH_info_gcm_gmult_4bit
-
-	.rva	.LSEH_begin_gcm_ghash_4bit
-	.rva	.LSEH_end_gcm_ghash_4bit
-	.rva	.LSEH_info_gcm_ghash_4bit
-
-	.rva	.LSEH_begin_gcm_ghash_clmul
-	.rva	.LSEH_end_gcm_ghash_clmul
-	.rva	.LSEH_info_gcm_ghash_clmul
-
-.section	.xdata
-.align	8
-.LSEH_info_gcm_gmult_4bit:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
-.LSEH_info_gcm_ghash_4bit:
-	.byte	9,0,0,0
-	.rva	se_handler
-	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
-.LSEH_info_gcm_ghash_clmul:
-	.byte	0x01,0x1f,0x0b,0x00
-	.byte	0x1f,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
-	.byte	0x19,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
-	.byte	0x13,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
-	.byte	0x0d,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
-	.byte	0x08,0x68,0x00,0x00	#movaps (rsp),xmm6
-	.byte	0x04,0xa2,0x00,0x00	#sub	rsp,0x58
-___
-}
-
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-print $code;
-
-close STDOUT;
diff --git a/src/lib/libcrypto/modes/cbc128.c b/src/lib/libcrypto/modes/cbc128.c
deleted file mode 100644
index f8ebf79a87..0000000000
--- a/src/lib/libcrypto/modes/cbc128.c
+++ /dev/null
@@ -1,214 +0,0 @@
-/* $OpenBSD: cbc128.c,v 1.8 2023/07/08 14:56:54 beck Exp $ */
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#include <openssl/crypto.h>
-#include "modes_local.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-
-#undef STRICT_ALIGNMENT
-#ifdef __STRICT_ALIGNMENT
-#define STRICT_ALIGNMENT 1
-#else
-#define STRICT_ALIGNMENT 0
-#endif
-
-void
-CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], block128_f block)
-{
-	size_t n;
-	const unsigned char *iv = ivec;
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (STRICT_ALIGNMENT &&
-	    ((size_t)in|(size_t)out|(size_t)ivec) % sizeof(size_t) != 0) {
-		while (len >= 16) {
-			for (n = 0; n < 16; ++n)
-				out[n] = in[n] ^ iv[n];
-			(*block)(out, out, key);
-			iv = out;
-			len -= 16;
-			in += 16;
-			out += 16;
-		}
-	} else {
-		while (len >= 16) {
-			for (n = 0; n < 16; n += sizeof(size_t))
-				*(size_t *)(out + n) =
-				    *(size_t *)(in + n) ^ *(size_t *)(iv + n);
-			(*block)(out, out, key);
-			iv = out;
-			len -= 16;
-			in += 16;
-			out += 16;
-		}
-	}
-#endif
-	while (len) {
-		for (n = 0; n < 16 && n < len; ++n)
-			out[n] = in[n] ^ iv[n];
-		for (; n < 16; ++n)
-			out[n] = iv[n];
-		(*block)(out, out, key);
-		iv = out;
-		if (len <= 16)
-			break;
-		len -= 16;
-		in += 16;
-		out += 16;
-	}
-	memmove(ivec, iv, 16);
-}
-LCRYPTO_ALIAS(CRYPTO_cbc128_encrypt);
-
-void
-CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], block128_f block)
-{
-	size_t n;
-	union {
-		size_t t[16/sizeof(size_t)];
-		unsigned char c[16];
-	} tmp;
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (in != out) {
-		const unsigned char *iv = ivec;
-
-		if (STRICT_ALIGNMENT &&
-		    ((size_t)in|(size_t)out|(size_t)ivec) % sizeof(size_t) !=
-		    0) {
-			while (len >= 16) {
-				(*block)(in, out, key);
-				for (n = 0; n < 16; ++n)
-					out[n] ^= iv[n];
-				iv = in;
-				len -= 16;
-				in += 16;
-				out += 16;
-			}
-		} else if (16 % sizeof(size_t) == 0) { /* always true */
-			while (len >= 16) {
-				size_t *out_t = (size_t *)out,
-				       *iv_t = (size_t *)iv;
-
-				(*block)(in, out, key);
-				for (n = 0; n < 16/sizeof(size_t); n++)
-					out_t[n] ^= iv_t[n];
-				iv = in;
-				len -= 16;
-				in += 16;
-				out += 16;
-			}
-		}
-		memmove(ivec, iv, 16);
-	} else {
-		if (STRICT_ALIGNMENT &&
-		    ((size_t)in|(size_t)out|(size_t)ivec) % sizeof(size_t) !=
-		    0) {
-			unsigned char c;
-			while (len >= 16) {
-				(*block)(in, tmp.c, key);
-				for (n = 0; n < 16; ++n) {
-					c = in[n];
-					out[n] = tmp.c[n] ^ ivec[n];
-					ivec[n] = c;
-				}
-				len -= 16;
-				in += 16;
-				out += 16;
-			}
-		} else if (16 % sizeof(size_t) == 0) { /* always true */
-			while (len >= 16) {
-				size_t c, *out_t = (size_t *)out,
-				       *ivec_t = (size_t *)ivec;
-				const size_t *in_t = (const size_t *)in;
-
-				(*block)(in, tmp.c, key);
-				for (n = 0; n < 16/sizeof(size_t); n++) {
-					c = in_t[n];
-					out_t[n] = tmp.t[n] ^ ivec_t[n];
-					ivec_t[n] = c;
-				}
-				len -= 16;
-				in += 16;
-				out += 16;
-			}
-		}
-	}
-#endif
-	while (len) {
-		unsigned char c;
-		(*block)(in, tmp.c, key);
-		for (n = 0; n < 16 && n < len; ++n) {
-			c = in[n];
-			out[n] = tmp.c[n] ^ ivec[n];
-			ivec[n] = c;
-		}
-		if (len <= 16) {
-			for (; n < 16; ++n)
-				ivec[n] = in[n];
-			break;
-		}
-		len -= 16;
-		in += 16;
-		out += 16;
-	}
-}
-LCRYPTO_ALIAS(CRYPTO_cbc128_decrypt);
diff --git a/src/lib/libcrypto/modes/ccm128.c b/src/lib/libcrypto/modes/ccm128.c
deleted file mode 100644
index 68c5cce5da..0000000000
--- a/src/lib/libcrypto/modes/ccm128.c
+++ /dev/null
@@ -1,498 +0,0 @@
-/* $OpenBSD: ccm128.c,v 1.8 2023/07/08 14:56:54 beck Exp $ */
-/* ====================================================================
- * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- */
-
-#include <openssl/crypto.h>
-#include "modes_local.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-
-/* First you setup M and L parameters and pass the key schedule.
- * This is called once per session setup... */
-void
-CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
-    unsigned int M, unsigned int L, void *key, block128_f block)
-{
-	memset(ctx->nonce.c, 0, sizeof(ctx->nonce.c));
-	ctx->nonce.c[0] = ((u8)(L - 1) & 7) | (u8)(((M - 2)/2) & 7) << 3;
-	ctx->blocks = 0;
-	ctx->block = block;
-	ctx->key = key;
-}
-LCRYPTO_ALIAS(CRYPTO_ccm128_init);
-
-/* !!! Following interfaces are to be called *once* per packet !!! */
-
-/* Then you setup per-message nonce and pass the length of the message */
-int
-CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
-    const unsigned char *nonce, size_t nlen, size_t mlen)
-{
-	unsigned int L = ctx->nonce.c[0] & 7;	/* the L parameter */
-
-	if (nlen < (14 - L))
-		return -1;		/* nonce is too short */
-
-	if (sizeof(mlen) == 8 && L >= 3) {
-		ctx->nonce.c[8] = (u8)(mlen >> (56 % (sizeof(mlen)*8)));
-		ctx->nonce.c[9] = (u8)(mlen >> (48 % (sizeof(mlen)*8)));
-		ctx->nonce.c[10] = (u8)(mlen >> (40 % (sizeof(mlen)*8)));
-		ctx->nonce.c[11] = (u8)(mlen >> (32 % (sizeof(mlen)*8)));
-	} else
-		ctx->nonce.u[1] = 0;
-
-	ctx->nonce.c[12] = (u8)(mlen >> 24);
-	ctx->nonce.c[13] = (u8)(mlen >> 16);
-	ctx->nonce.c[14] = (u8)(mlen >> 8);
-	ctx->nonce.c[15] = (u8)mlen;
-
-	ctx->nonce.c[0] &= ~0x40;	/* clear Adata flag */
-	memcpy(&ctx->nonce.c[1], nonce, 14 - L);
-
-	return 0;
-}
-LCRYPTO_ALIAS(CRYPTO_ccm128_setiv);
-
-/* Then you pass additional authentication data, this is optional */
-void
-CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
-    const unsigned char *aad, size_t alen)
-{
-	unsigned int i;
-	block128_f block = ctx->block;
-
-	if (alen == 0)
-		return;
-
-	ctx->nonce.c[0] |= 0x40;	/* set Adata flag */
-	(*block)(ctx->nonce.c, ctx->cmac.c, ctx->key),
-	    ctx->blocks++;
-
-	if (alen < (0x10000 - 0x100)) {
-		ctx->cmac.c[0] ^= (u8)(alen >> 8);
-		ctx->cmac.c[1] ^= (u8)alen;
-		i = 2;
-	} else if (sizeof(alen) == 8 &&
-	    alen >= (size_t)1 << (32 % (sizeof(alen)*8))) {
-		ctx->cmac.c[0] ^= 0xFF;
-		ctx->cmac.c[1] ^= 0xFF;
-		ctx->cmac.c[2] ^= (u8)(alen >> (56 % (sizeof(alen)*8)));
-		ctx->cmac.c[3] ^= (u8)(alen >> (48 % (sizeof(alen)*8)));
-		ctx->cmac.c[4] ^= (u8)(alen >> (40 % (sizeof(alen)*8)));
-		ctx->cmac.c[5] ^= (u8)(alen >> (32 % (sizeof(alen)*8)));
-		ctx->cmac.c[6] ^= (u8)(alen >> 24);
-		ctx->cmac.c[7] ^= (u8)(alen >> 16);
-		ctx->cmac.c[8] ^= (u8)(alen >> 8);
-		ctx->cmac.c[9] ^= (u8)alen;
-		i = 10;
-	} else {
-		ctx->cmac.c[0] ^= 0xFF;
-		ctx->cmac.c[1] ^= 0xFE;
-		ctx->cmac.c[2] ^= (u8)(alen >> 24);
-		ctx->cmac.c[3] ^= (u8)(alen >> 16);
-		ctx->cmac.c[4] ^= (u8)(alen >> 8);
-		ctx->cmac.c[5] ^= (u8)alen;
-		i = 6;
-	}
-
-	do {
-		for (; i < 16 && alen; ++i, ++aad, --alen)
-			ctx->cmac.c[i] ^= *aad;
-		(*block)(ctx->cmac.c, ctx->cmac.c, ctx->key),
-		    ctx->blocks++;
-		i = 0;
-	} while (alen);
-}
-LCRYPTO_ALIAS(CRYPTO_ccm128_aad);
-
-/* Finally you encrypt or decrypt the message */
-
-/* counter part of nonce may not be larger than L*8 bits,
- * L is not larger than 8, therefore 64-bit counter... */
-static void
-ctr64_inc(unsigned char *counter)
-{
-	unsigned int n = 8;
-	u8 c;
-
-	counter += 8;
-	do {
-		--n;
-		c = counter[n];
-		++c;
-		counter[n] = c;
-		if (c)
-			return;
-	} while (n);
-}
-
-int
-CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
-    const unsigned char *inp, unsigned char *out,
-    size_t len)
-{
-	size_t		 n;
-	unsigned int	 i, L;
-	unsigned char	 flags0 = ctx->nonce.c[0];
-	block128_f	 block = ctx->block;
-	void		*key = ctx->key;
-	union {
-		u64 u[2];
-		u8 c[16];
-	} scratch;
-
-	if (!(flags0 & 0x40))
-		(*block)(ctx->nonce.c, ctx->cmac.c, key),
-		    ctx->blocks++;
-
-	ctx->nonce.c[0] = L = flags0 & 7;
-	for (n = 0, i = 15 - L; i < 15; ++i) {
-		n |= ctx->nonce.c[i];
-		ctx->nonce.c[i] = 0;
-		n <<= 8;
-	}
-	n |= ctx->nonce.c[15];	/* reconstructed length */
-	ctx->nonce.c[15] = 1;
-
-	if (n != len)
-		return -1;	/* length mismatch */
-
-	ctx->blocks += ((len + 15) >> 3)|1;
-	if (ctx->blocks > (U64(1) << 61))
-		return -2; /* too much data */
-
-	while (len >= 16) {
-#ifdef __STRICT_ALIGNMENT
-		union {
-			u64 u[2];
-			u8 c[16];
-		} temp;
-
-		memcpy(temp.c, inp, 16);
-		ctx->cmac.u[0] ^= temp.u[0];
-		ctx->cmac.u[1] ^= temp.u[1];
-#else
-		ctx->cmac.u[0] ^= ((u64 *)inp)[0];
-		ctx->cmac.u[1] ^= ((u64 *)inp)[1];
-#endif
-		(*block)(ctx->cmac.c, ctx->cmac.c, key);
-		(*block)(ctx->nonce.c, scratch.c, key);
-		ctr64_inc(ctx->nonce.c);
-#ifdef __STRICT_ALIGNMENT
-		temp.u[0] ^= scratch.u[0];
-		temp.u[1] ^= scratch.u[1];
-		memcpy(out, temp.c, 16);
-#else
-		((u64 *)out)[0] = scratch.u[0] ^ ((u64 *)inp)[0];
-		((u64 *)out)[1] = scratch.u[1] ^ ((u64 *)inp)[1];
-#endif
-		inp += 16;
-		out += 16;
-		len -= 16;
-	}
-
-	if (len) {
-		for (i = 0; i < len; ++i)
-			ctx->cmac.c[i] ^= inp[i];
-		(*block)(ctx->cmac.c, ctx->cmac.c, key);
-		(*block)(ctx->nonce.c, scratch.c, key);
-		for (i = 0; i < len; ++i)
-			out[i] = scratch.c[i] ^ inp[i];
-	}
-
-	for (i = 15 - L; i < 16; ++i)
-		ctx->nonce.c[i] = 0;
-
-	(*block)(ctx->nonce.c, scratch.c, key);
-	ctx->cmac.u[0] ^= scratch.u[0];
-	ctx->cmac.u[1] ^= scratch.u[1];
-
-	ctx->nonce.c[0] = flags0;
-
-	return 0;
-}
-LCRYPTO_ALIAS(CRYPTO_ccm128_encrypt);
-
-int
-CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
-    const unsigned char *inp, unsigned char *out,
-    size_t len)
-{
-	size_t		 n;
-	unsigned int	 i, L;
-	unsigned char	 flags0 = ctx->nonce.c[0];
-	block128_f	 block = ctx->block;
-	void		*key = ctx->key;
-	union {
-		u64 u[2];
-		u8 c[16];
-	} scratch;
-
-	if (!(flags0 & 0x40))
-		(*block)(ctx->nonce.c, ctx->cmac.c, key);
-
-	ctx->nonce.c[0] = L = flags0 & 7;
-	for (n = 0, i = 15 - L; i < 15; ++i) {
-		n |= ctx->nonce.c[i];
-		ctx->nonce.c[i] = 0;
-		n <<= 8;
-	}
-	n |= ctx->nonce.c[15];	/* reconstructed length */
-	ctx->nonce.c[15] = 1;
-
-	if (n != len)
-		return -1;
-
-	while (len >= 16) {
-#ifdef __STRICT_ALIGNMENT
-		union {
-			u64 u[2];
-			u8 c[16];
-		} temp;
-#endif
-		(*block)(ctx->nonce.c, scratch.c, key);
-		ctr64_inc(ctx->nonce.c);
-#ifdef __STRICT_ALIGNMENT
-		memcpy(temp.c, inp, 16);
-		ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
-		ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
-		memcpy(out, scratch.c, 16);
-#else
-		ctx->cmac.u[0] ^= (((u64 *)out)[0] = scratch.u[0] ^
-		    ((u64 *)inp)[0]);
-		ctx->cmac.u[1] ^= (((u64 *)out)[1] = scratch.u[1] ^
-		    ((u64 *)inp)[1]);
-#endif
-		(*block)(ctx->cmac.c, ctx->cmac.c, key);
-
-		inp += 16;
-		out += 16;
-		len -= 16;
-	}
-
-	if (len) {
-		(*block)(ctx->nonce.c, scratch.c, key);
-		for (i = 0; i < len; ++i)
-			ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
-		(*block)(ctx->cmac.c, ctx->cmac.c, key);
-	}
-
-	for (i = 15 - L; i < 16; ++i)
-		ctx->nonce.c[i] = 0;
-
-	(*block)(ctx->nonce.c, scratch.c, key);
-	ctx->cmac.u[0] ^= scratch.u[0];
-	ctx->cmac.u[1] ^= scratch.u[1];
-
-	ctx->nonce.c[0] = flags0;
-
-	return 0;
-}
-LCRYPTO_ALIAS(CRYPTO_ccm128_decrypt);
-
-static void
-ctr64_add(unsigned char *counter, size_t inc)
-{
-	size_t n = 8, val = 0;
-
-	counter += 8;
-	do {
-		--n;
-		val += counter[n] + (inc & 0xff);
-		counter[n] = (unsigned char)val;
-		val >>= 8;	/* carry bit */
-		inc >>= 8;
-	} while (n && (inc || val));
-}
-
-int
-CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
-    const unsigned char *inp, unsigned char *out,
-    size_t len, ccm128_f stream)
-{
-	size_t		 n;
-	unsigned int	 i, L;
-	unsigned char	 flags0 = ctx->nonce.c[0];
-	block128_f	 block = ctx->block;
-	void		*key = ctx->key;
-	union {
-		u64 u[2];
-		u8 c[16];
-	} scratch;
-
-	if (!(flags0 & 0x40))
-		(*block)(ctx->nonce.c, ctx->cmac.c, key),
-		    ctx->blocks++;
-
-	ctx->nonce.c[0] = L = flags0 & 7;
-	for (n = 0, i = 15 - L; i < 15; ++i) {
-		n |= ctx->nonce.c[i];
-		ctx->nonce.c[i] = 0;
-		n <<= 8;
-	}
-	n |= ctx->nonce.c[15];	/* reconstructed length */
-	ctx->nonce.c[15] = 1;
-
-	if (n != len)
-		return -1;	/* length mismatch */
-
-	ctx->blocks += ((len + 15) >> 3)|1;
-	if (ctx->blocks > (U64(1) << 61))
-		return -2; /* too much data */
-
-	if ((n = len/16)) {
-		(*stream)(inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
-		n *= 16;
-		inp += n;
-		out += n;
-		len -= n;
-		if (len)
-			ctr64_add(ctx->nonce.c, n/16);
-	}
-
-	if (len) {
-		for (i = 0; i < len; ++i)
-			ctx->cmac.c[i] ^= inp[i];
-		(*block)(ctx->cmac.c, ctx->cmac.c, key);
-		(*block)(ctx->nonce.c, scratch.c, key);
-		for (i = 0; i < len; ++i)
-			out[i] = scratch.c[i] ^ inp[i];
-	}
-
-	for (i = 15 - L; i < 16; ++i)
-		ctx->nonce.c[i] = 0;
-
-	(*block)(ctx->nonce.c, scratch.c, key);
-	ctx->cmac.u[0] ^= scratch.u[0];
-	ctx->cmac.u[1] ^= scratch.u[1];
-
-	ctx->nonce.c[0] = flags0;
-
-	return 0;
-}
-LCRYPTO_ALIAS(CRYPTO_ccm128_encrypt_ccm64);
-
-int
-CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
-    const unsigned char *inp, unsigned char *out,
-    size_t len, ccm128_f stream)
-{
-	size_t		 n;
-	unsigned int	 i, L;
-	unsigned char	 flags0 = ctx->nonce.c[0];
-	block128_f	 block = ctx->block;
-	void		*key = ctx->key;
-	union {
-		u64 u[2];
-		u8 c[16];
-	} scratch;
-
-	if (!(flags0 & 0x40))
-		(*block)(ctx->nonce.c, ctx->cmac.c, key);
-
-	ctx->nonce.c[0] = L = flags0 & 7;
-	for (n = 0, i = 15 - L; i < 15; ++i) {
-		n |= ctx->nonce.c[i];
-		ctx->nonce.c[i] = 0;
-		n <<= 8;
-	}
-	n |= ctx->nonce.c[15];	/* reconstructed length */
-	ctx->nonce.c[15] = 1;
-
-	if (n != len)
-		return -1;
-
-	if ((n = len/16)) {
-		(*stream)(inp, out, n, key, ctx->nonce.c, ctx->cmac.c);
-		n *= 16;
-		inp += n;
-		out += n;
-		len -= n;
-		if (len)
-			ctr64_add(ctx->nonce.c, n/16);
-	}
-
-	if (len) {
-		(*block)(ctx->nonce.c, scratch.c, key);
-		for (i = 0; i < len; ++i)
-			ctx->cmac.c[i] ^= (out[i] = scratch.c[i] ^ inp[i]);
-		(*block)(ctx->cmac.c, ctx->cmac.c, key);
-	}
-
-	for (i = 15 - L; i < 16; ++i)
-		ctx->nonce.c[i] = 0;
-
-	(*block)(ctx->nonce.c, scratch.c, key);
-	ctx->cmac.u[0] ^= scratch.u[0];
-	ctx->cmac.u[1] ^= scratch.u[1];
-
-	ctx->nonce.c[0] = flags0;
-
-	return 0;
-}
-LCRYPTO_ALIAS(CRYPTO_ccm128_decrypt_ccm64);
-
-size_t
-CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
-{
-	unsigned int M = (ctx->nonce.c[0] >> 3) & 7;	/* the M parameter */
-
-	M *= 2;
-	M += 2;
-	if (len != M)
-		return 0;
-	memcpy(tag, ctx->cmac.c, M);
-	return M;
-}
-LCRYPTO_ALIAS(CRYPTO_ccm128_tag);
diff --git a/src/lib/libcrypto/modes/cfb128.c b/src/lib/libcrypto/modes/cfb128.c
deleted file mode 100644
index 931353a620..0000000000
--- a/src/lib/libcrypto/modes/cfb128.c
+++ /dev/null
@@ -1,251 +0,0 @@
-/* $OpenBSD: cfb128.c,v 1.7 2023/07/08 14:56:54 beck Exp $ */
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#include <openssl/crypto.h>
-#include "modes_local.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-
-/* The input and output encrypted as though 128bit cfb mode is being
- * used.  The extra state information to record how much of the
- * 128bit block we have used is contained in *num;
- */
-void
-CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], int *num,
-    int enc, block128_f block)
-{
-	unsigned int n;
-	size_t l = 0;
-
-	n = *num;
-
-	if (enc) {
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-		if (16 % sizeof(size_t) == 0)
-			do {	/* always true actually */
-				while (n && len) {
-					*(out++) = ivec[n] ^= *(in++);
-					--len;
-					n = (n + 1) % 16;
-				}
-#ifdef __STRICT_ALIGNMENT
-				if (((size_t)in|(size_t)out|(size_t)ivec) %
-				    sizeof(size_t) != 0)
-					break;
-#endif
-				while (len >= 16) {
-					(*block)(ivec, ivec, key);
-					for (; n < 16; n += sizeof(size_t)) {
-						*(size_t *)(out + n) =
-						    *(size_t *)(ivec + n) ^= *(size_t *)(in +
-						    n);
-					}
-					len -= 16;
-					out += 16;
-					in += 16;
-					n = 0;
-				}
-				if (len) {
-					(*block)(ivec, ivec, key);
-					while (len--) {
-						out[n] = ivec[n] ^= in[n];
-						++n;
-					}
-				}
-				*num = n;
-				return;
-			} while (0);
-	/* the rest would be commonly eliminated by x86* compiler */
-#endif
-		while (l < len) {
-			if (n == 0) {
-				(*block)(ivec, ivec, key);
-			}
-			out[l] = ivec[n] ^= in[l];
-			++l;
-			n = (n + 1) % 16;
-		}
-		*num = n;
-	} else {
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-		if (16 % sizeof(size_t) == 0)
-			do {	/* always true actually */
-				while (n && len) {
-					unsigned char c;
-					*(out++) = ivec[n] ^ (c = *(in++));
-					ivec[n] = c;
-					--len;
-					n = (n + 1) % 16;
-				}
-#ifdef __STRICT_ALIGNMENT
-				if (((size_t)in|(size_t)out|(size_t)ivec) %
-				    sizeof(size_t) != 0)
-					break;
-#endif
-				while (len >= 16) {
-					(*block)(ivec, ivec, key);
-					for (; n < 16; n += sizeof(size_t)) {
-						size_t t = *(size_t *)(in + n);
-						*(size_t *)(out + n) = *(size_t *)(ivec +
-						    n) ^ t;
-						*(size_t *)(ivec + n) = t;
-					}
-					len -= 16;
-					out += 16;
-					in += 16;
-					n = 0;
-				}
-				if (len) {
-					(*block)(ivec, ivec, key);
-					while (len--) {
-						unsigned char c;
-						out[n] = ivec[n] ^ (c = in[n]);
-						ivec[n] = c;
-						++n;
-					}
-				}
-				*num = n;
-				return;
-			} while (0);
-	/* the rest would be commonly eliminated by x86* compiler */
-#endif
-		while (l < len) {
-			unsigned char c;
-			if (n == 0) {
-				(*block)(ivec, ivec, key);
-			}
-			out[l] = ivec[n] ^ (c = in[l]);
-			ivec[n] = c;
-			++l;
-			n = (n + 1) % 16;
-		}
-		*num = n;
-	}
-}
-LCRYPTO_ALIAS(CRYPTO_cfb128_encrypt);
-
-/* This expects a single block of size nbits for both in and out. Note that
-   it corrupts any extra bits in the last byte of out */
-static void
-cfbr_encrypt_block(const unsigned char *in, unsigned char *out,
-    int nbits, const void *key,
-    unsigned char ivec[16], int enc,
-    block128_f block)
-{
-	int n, rem, num;
-	unsigned char ovec[16*2 + 1];  /* +1 because we dererefence (but don't use) one byte off the end */
-
-	if (nbits <= 0 || nbits > 128)
-		return;
-
-	/* fill in the first half of the new IV with the current IV */
-	memcpy(ovec, ivec, 16);
-	/* construct the new IV */
-	(*block)(ivec, ivec, key);
-	num = (nbits + 7)/8;
-	if (enc)	/* encrypt the input */
-		for (n = 0; n < num; ++n)
-			out[n] = (ovec[16 + n] = in[n] ^ ivec[n]);
-	else		/* decrypt the input */
-		for (n = 0; n < num; ++n)
-			out[n] = (ovec[16 + n] = in[n]) ^ ivec[n];
-	/* shift ovec left... */
-	rem = nbits % 8;
-	num = nbits/8;
-	if (rem == 0)
-		memcpy(ivec, ovec + num, 16);
-	else
-		for (n = 0; n < 16; ++n)
-			ivec[n] = ovec[n + num] << rem |
-			    ovec[n + num + 1] >> (8 - rem);
-
-    /* it is not necessary to cleanse ovec, since the IV is not secret */
-}
-
-/* N.B. This expects the input to be packed, MS bit first */
-void
-CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
-    size_t bits, const void *key,
-    unsigned char ivec[16], int *num,
-    int enc, block128_f block)
-{
-	size_t n;
-	unsigned char c[1], d[1];
-
-	for (n = 0; n < bits; ++n)
-	{
-		c[0] = (in[n/8] & (1 << (7 - n % 8))) ? 0x80 : 0;
-		cfbr_encrypt_block(c, d, 1, key, ivec, enc, block);
-		out[n/8] = (out[n/8] & ~(1 << (unsigned int)(7 - n % 8))) |
-		    ((d[0] & 0x80) >> (unsigned int)(n % 8));
-	}
-}
-LCRYPTO_ALIAS(CRYPTO_cfb128_1_encrypt);
-
-void
-CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
-    size_t length, const void *key,
-    unsigned char ivec[16], int *num,
-    int enc, block128_f block)
-{
-	size_t n;
-
-	for (n = 0; n < length; ++n)
-		cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block);
-}
-LCRYPTO_ALIAS(CRYPTO_cfb128_8_encrypt);
diff --git a/src/lib/libcrypto/modes/ctr128.c b/src/lib/libcrypto/modes/ctr128.c
deleted file mode 100644
index 6d507dfc3a..0000000000
--- a/src/lib/libcrypto/modes/ctr128.c
+++ /dev/null
@@ -1,267 +0,0 @@
-/* $OpenBSD: ctr128.c,v 1.11 2023/07/08 14:56:54 beck Exp $ */
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#include <openssl/crypto.h>
-#include "modes_local.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-#include <assert.h>
-
-/* NOTE: the IV/counter CTR mode is big-endian.  The code itself
- * is endian-neutral. */
-
-/* increment counter (128-bit int) by 1 */
-static void
-ctr128_inc(unsigned char *counter)
-{
-	u32 n = 16;
-	u8  c;
-
-	do {
-		--n;
-		c = counter[n];
-		++c;
-		counter[n] = c;
-		if (c)
-			return;
-	} while (n);
-}
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-static void
-ctr128_inc_aligned(unsigned char *counter)
-{
-#if BYTE_ORDER == LITTLE_ENDIAN
-	ctr128_inc(counter);
-#else
-	size_t *data, c, n;
-	data = (size_t *)counter;
-	n = 16 / sizeof(size_t);
-	do {
-		--n;
-		c = data[n];
-		++c;
-		data[n] = c;
-		if (c)
-			return;
-	} while (n);
-#endif
-}
-#endif
-
-/* The input encrypted as though 128bit counter mode is being
- * used.  The extra state information to record how much of the
- * 128bit block we have used is contained in *num, and the
- * encrypted counter is kept in ecount_buf.  Both *num and
- * ecount_buf must be initialised with zeros before the first
- * call to CRYPTO_ctr128_encrypt().
- *
- * This algorithm assumes that the counter is in the x lower bits
- * of the IV (ivec), and that the application has full control over
- * overflow and the rest of the IV.  This implementation takes NO
- * responsibility for checking that the counter doesn't overflow
- * into the rest of the IV when incremented.
- */
-void
-CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], unsigned char ecount_buf[16],
-    unsigned int *num, block128_f block)
-{
-	unsigned int n;
-	size_t l = 0;
-
-	assert(*num < 16);
-
-	n = *num;
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (16 % sizeof(size_t) == 0)
-		do { /* always true actually */
-			while (n && len) {
-				*(out++) = *(in++) ^ ecount_buf[n];
-				--len;
-				n = (n + 1) % 16;
-			}
-
-#ifdef __STRICT_ALIGNMENT
-			if (((size_t)in|(size_t)out|(size_t)ivec) %
-			    sizeof(size_t) != 0)
-				break;
-#endif
-			while (len >= 16) {
-				(*block)(ivec, ecount_buf, key);
-				ctr128_inc_aligned(ivec);
-				for (; n < 16; n += sizeof(size_t))
-					*(size_t *)(out + n) =
-					    *(size_t *)(in + n) ^ *(size_t *)(ecount_buf +
-					    n);
-				len -= 16;
-				out += 16;
-				in += 16;
-				n = 0;
-			}
-			if (len) {
-				(*block)(ivec, ecount_buf, key);
-				ctr128_inc_aligned(ivec);
-				while (len--) {
-					out[n] = in[n] ^ ecount_buf[n];
-					++n;
-				}
-			}
-			*num = n;
-			return;
-		} while (0);
-	/* the rest would be commonly eliminated by x86* compiler */
-#endif
-	while (l < len) {
-		if (n == 0) {
-			(*block)(ivec, ecount_buf, key);
-			ctr128_inc(ivec);
-		}
-		out[l] = in[l] ^ ecount_buf[n];
-		++l;
-		n = (n + 1) % 16;
-	}
-
-	*num = n;
-}
-LCRYPTO_ALIAS(CRYPTO_ctr128_encrypt);
-
-/* increment upper 96 bits of 128-bit counter by 1 */
-static void
-ctr96_inc(unsigned char *counter)
-{
-	u32 n = 12;
-	u8  c;
-
-	do {
-		--n;
-		c = counter[n];
-		++c;
-		counter[n] = c;
-		if (c)
-			return;
-	} while (n);
-}
-
-void
-CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], unsigned char ecount_buf[16],
-    unsigned int *num, ctr128_f func)
-{
-	unsigned int n, ctr32;
-
-	assert(*num < 16);
-
-	n = *num;
-
-	while (n && len) {
-		*(out++) = *(in++) ^ ecount_buf[n];
-		--len;
-		n = (n + 1) % 16;
-	}
-
-	ctr32 = GETU32(ivec + 12);
-	while (len >= 16) {
-		size_t blocks = len/16;
-		/*
-		 * 1<<28 is just a not-so-small yet not-so-large number...
-		 * Below condition is practically never met, but it has to
-		 * be checked for code correctness.
-		 */
-		if (sizeof(size_t) > sizeof(unsigned int) &&
-		    blocks > (1U << 28))
-			blocks = (1U << 28);
-		/*
-		 * As (*func) operates on 32-bit counter, caller
-		 * has to handle overflow. 'if' below detects the
-		 * overflow, which is then handled by limiting the
-		 * amount of blocks to the exact overflow point...
-		 */
-		ctr32 += (u32)blocks;
-		if (ctr32 < blocks) {
-			blocks -= ctr32;
-			ctr32 = 0;
-		}
-		(*func)(in, out, blocks, key, ivec);
-		/* (*ctr) does not update ivec, caller does: */
-		PUTU32(ivec + 12, ctr32);
-		/* ... overflow was detected, propagate carry. */
-		if (ctr32 == 0)
-			ctr96_inc(ivec);
-		blocks *= 16;
-		len -= blocks;
-		out += blocks;
-		in += blocks;
-	}
-	if (len) {
-		memset(ecount_buf, 0, 16);
-		(*func)(ecount_buf, ecount_buf, 1, key, ivec);
-		++ctr32;
-		PUTU32(ivec + 12, ctr32);
-		if (ctr32 == 0)
-			ctr96_inc(ivec);
-		while (len--) {
-			out[n] = in[n] ^ ecount_buf[n];
-			++n;
-		}
-	}
-
-	*num = n;
-}
-LCRYPTO_ALIAS(CRYPTO_ctr128_encrypt_ctr32);
diff --git a/src/lib/libcrypto/modes/gcm128.c b/src/lib/libcrypto/modes/gcm128.c
deleted file mode 100644
index 6c89bd44b7..0000000000
--- a/src/lib/libcrypto/modes/gcm128.c
+++ /dev/null
@@ -1,1358 +0,0 @@
-/* $OpenBSD: gcm128.c,v 1.27 2024/09/06 09:57:32 tb Exp $ */
-/* ====================================================================
- * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- */
-
-#define OPENSSL_FIPSAPI
-
-#include <string.h>
-
-#include <openssl/crypto.h>
-
-#include "crypto_internal.h"
-#include "modes_local.h"
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-
-#if defined(BSWAP4) && defined(__STRICT_ALIGNMENT)
-/* redefine, because alignment is ensured */
-#undef	GETU32
-#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
-#endif
-
-#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
-#define REDUCE1BIT(V)							\
-	do {								\
-		if (sizeof(size_t)==8) {				\
-			u64 T = U64(0xe100000000000000) & (0-(V.lo&1));	\
-			V.lo  = (V.hi<<63)|(V.lo>>1);			\
-			V.hi  = (V.hi>>1 )^T;				\
-		} else {						\
-			u32 T = 0xe1000000U & (0-(u32)(V.lo&1));	\
-			V.lo  = (V.hi<<63)|(V.lo>>1);			\
-			V.hi  = (V.hi>>1 )^((u64)T<<32);		\
-		}							\
-	} while(0)
-
-/*
- * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
- * never be set to 8. 8 is effectively reserved for testing purposes.
- * TABLE_BITS>1 are lookup-table-driven implementations referred to as
- * "Shoup's" in GCM specification. In other words OpenSSL does not cover
- * whole spectrum of possible table driven implementations. Why? In
- * non-"Shoup's" case memory access pattern is segmented in such manner,
- * that it's trivial to see that cache timing information can reveal
- * fair portion of intermediate hash value. Given that ciphertext is
- * always available to attacker, it's possible for him to attempt to
- * deduce secret parameter H and if successful, tamper with messages
- * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
- * not as trivial, but there is no reason to believe that it's resistant
- * to cache-timing attack. And the thing about "8-bit" implementation is
- * that it consumes 16 (sixteen) times more memory, 4KB per individual
- * key + 1KB shared. Well, on pros side it should be twice as fast as
- * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
- * was observed to run ~75% faster, closer to 100% for commercial
- * compilers... Yet "4-bit" procedure is preferred, because it's
- * believed to provide better security-performance balance and adequate
- * all-round performance. "All-round" refers to things like:
- *
- * - shorter setup time effectively improves overall timing for
- *   handling short messages;
- * - larger table allocation can become unbearable because of VM
- *   subsystem penalties (for example on Windows large enough free
- *   results in VM working set trimming, meaning that consequent
- *   malloc would immediately incur working set expansion);
- * - larger table has larger cache footprint, which can affect
- *   performance of other code paths (not necessarily even from same
- *   thread in Hyper-Threading world);
- *
- * Value of 1 is not appropriate for performance reasons.
- */
-#if	TABLE_BITS==8
-
-static void
-gcm_init_8bit(u128 Htable[256], u64 H[2])
-{
-	int  i, j;
-	u128 V;
-
-	Htable[0].hi = 0;
-	Htable[0].lo = 0;
-	V.hi = H[0];
-	V.lo = H[1];
-
-	for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
-		REDUCE1BIT(V);
-		Htable[i] = V;
-	}
-
-	for (i = 2; i < 256; i <<= 1) {
-		u128 *Hi = Htable + i, H0 = *Hi;
-		for (j = 1; j < i; ++j) {
-			Hi[j].hi = H0.hi ^ Htable[j].hi;
-			Hi[j].lo = H0.lo ^ Htable[j].lo;
-		}
-	}
-}
-
-static void
-gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
-{
-	u128 Z = { 0, 0};
-	const u8 *xi = (const u8 *)Xi + 15;
-	size_t rem, n = *xi;
-	static const size_t rem_8bit[256] = {
-		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
-		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
-		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
-		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
-		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
-		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
-		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
-		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
-		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
-		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
-		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
-		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
-		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
-		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
-		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
-		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
-		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
-		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
-		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
-		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
-		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
-		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
-		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
-		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
-		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
-		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
-		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
-		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
-		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
-		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
-		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
-		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
-		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
-		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
-		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
-		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
-		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
-		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
-		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
-		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
-		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
-		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
-		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
-		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
-		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
-		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
-		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
-		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
-		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
-		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
-		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
-		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
-		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
-		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
-		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
-		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
-		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
-		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
-		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
-		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
-		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
-		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
-		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
-		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
-
-	while (1) {
-		Z.hi ^= Htable[n].hi;
-		Z.lo ^= Htable[n].lo;
-
-		if ((u8 *)Xi == xi)
-			break;
-
-		n = *(--xi);
-
-		rem = (size_t)Z.lo & 0xff;
-		Z.lo = (Z.hi << 56)|(Z.lo >> 8);
-		Z.hi = (Z.hi >> 8);
-#if SIZE_MAX == 0xffffffffffffffff
-		Z.hi ^= rem_8bit[rem];
-#else
-		Z.hi ^= (u64)rem_8bit[rem] << 32;
-#endif
-	}
-
-	Xi[0] = htobe64(Z.hi);
-	Xi[1] = htobe64(Z.lo);
-}
-#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
-
-#elif	TABLE_BITS==4
-
-static void
-gcm_init_4bit(u128 Htable[16], u64 H[2])
-{
-	u128 V;
-#if defined(OPENSSL_SMALL_FOOTPRINT)
-	int  i;
-#endif
-
-	Htable[0].hi = 0;
-	Htable[0].lo = 0;
-	V.hi = H[0];
-	V.lo = H[1];
-
-#if defined(OPENSSL_SMALL_FOOTPRINT)
-	for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
-		REDUCE1BIT(V);
-		Htable[i] = V;
-	}
-
-	for (i = 2; i < 16; i <<= 1) {
-		u128 *Hi = Htable + i;
-		int   j;
-		for (V = *Hi, j = 1; j < i; ++j) {
-			Hi[j].hi = V.hi ^ Htable[j].hi;
-			Hi[j].lo = V.lo ^ Htable[j].lo;
-		}
-	}
-#else
-	Htable[8] = V;
-	REDUCE1BIT(V);
-	Htable[4] = V;
-	REDUCE1BIT(V);
-	Htable[2] = V;
-	REDUCE1BIT(V);
-	Htable[1] = V;
-	Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
-	V = Htable[4];
-	Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
-	Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
-	Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
-	V = Htable[8];
-	Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
-	Htable[10].hi = V.hi ^ Htable[2].hi,
-	    Htable[10].lo = V.lo ^ Htable[2].lo;
-	Htable[11].hi = V.hi ^ Htable[3].hi,
-	    Htable[11].lo = V.lo ^ Htable[3].lo;
-	Htable[12].hi = V.hi ^ Htable[4].hi,
-	    Htable[12].lo = V.lo ^ Htable[4].lo;
-	Htable[13].hi = V.hi ^ Htable[5].hi,
-	    Htable[13].lo = V.lo ^ Htable[5].lo;
-	Htable[14].hi = V.hi ^ Htable[6].hi,
-	    Htable[14].lo = V.lo ^ Htable[6].lo;
-	Htable[15].hi = V.hi ^ Htable[7].hi,
-	    Htable[15].lo = V.lo ^ Htable[7].lo;
-#endif
-#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
-	/*
-	 * ARM assembler expects specific dword order in Htable.
-	 */
-	{
-		int j;
-#if BYTE_ORDER == LITTLE_ENDIAN
-		for (j = 0; j < 16; ++j) {
-			V = Htable[j];
-			Htable[j].hi = V.lo;
-			Htable[j].lo = V.hi;
-		}
-#else /* BIG_ENDIAN */
-		for (j = 0; j < 16; ++j) {
-			V = Htable[j];
-			Htable[j].hi = V.lo << 32|V.lo >> 32;
-			Htable[j].lo = V.hi << 32|V.hi >> 32;
-		}
-#endif
-	}
-#endif
-}
-
-#ifndef GHASH_ASM
-static const size_t rem_4bit[16] = {
-	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
-	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
-	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
-	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
-
-static void
-gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
-{
-	u128 Z;
-	int cnt = 15;
-	size_t rem, nlo, nhi;
-
-	nlo = ((const u8 *)Xi)[15];
-	nhi = nlo >> 4;
-	nlo &= 0xf;
-
-	Z.hi = Htable[nlo].hi;
-	Z.lo = Htable[nlo].lo;
-
-	while (1) {
-		rem = (size_t)Z.lo & 0xf;
-		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
-		Z.hi = (Z.hi >> 4);
-#if SIZE_MAX == 0xffffffffffffffff
-		Z.hi ^= rem_4bit[rem];
-#else
-		Z.hi ^= (u64)rem_4bit[rem] << 32;
-#endif
-		Z.hi ^= Htable[nhi].hi;
-		Z.lo ^= Htable[nhi].lo;
-
-		if (--cnt < 0)
-			break;
-
-		nlo = ((const u8 *)Xi)[cnt];
-		nhi = nlo >> 4;
-		nlo &= 0xf;
-
-		rem = (size_t)Z.lo & 0xf;
-		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
-		Z.hi = (Z.hi >> 4);
-#if SIZE_MAX == 0xffffffffffffffff
-		Z.hi ^= rem_4bit[rem];
-#else
-		Z.hi ^= (u64)rem_4bit[rem] << 32;
-#endif
-		Z.hi ^= Htable[nlo].hi;
-		Z.lo ^= Htable[nlo].lo;
-	}
-
-	Xi[0] = htobe64(Z.hi);
-	Xi[1] = htobe64(Z.lo);
-}
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-/*
- * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
- * details... Compiler-generated code doesn't seem to give any
- * performance improvement, at least not on x86[_64]. It's here
- * mostly as reference and a placeholder for possible future
- * non-trivial optimization[s]...
- */
-static void
-gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
-    const u8 *inp, size_t len)
-{
-	u128 Z;
-	int cnt;
-	size_t rem, nlo, nhi;
-
-#if 1
-	do {
-		cnt = 15;
-		nlo = ((const u8 *)Xi)[15];
-		nlo ^= inp[15];
-		nhi = nlo >> 4;
-		nlo &= 0xf;
-
-		Z.hi = Htable[nlo].hi;
-		Z.lo = Htable[nlo].lo;
-
-		while (1) {
-			rem = (size_t)Z.lo & 0xf;
-			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
-			Z.hi = (Z.hi >> 4);
-#if SIZE_MAX == 0xffffffffffffffff
-			Z.hi ^= rem_4bit[rem];
-#else
-			Z.hi ^= (u64)rem_4bit[rem] << 32;
-#endif
-			Z.hi ^= Htable[nhi].hi;
-			Z.lo ^= Htable[nhi].lo;
-
-			if (--cnt < 0)
-				break;
-
-			nlo = ((const u8 *)Xi)[cnt];
-			nlo ^= inp[cnt];
-			nhi = nlo >> 4;
-			nlo &= 0xf;
-
-			rem = (size_t)Z.lo & 0xf;
-			Z.lo = (Z.hi << 60)|(Z.lo >> 4);
-			Z.hi = (Z.hi >> 4);
-#if SIZE_MAX == 0xffffffffffffffff
-			Z.hi ^= rem_4bit[rem];
-#else
-			Z.hi ^= (u64)rem_4bit[rem] << 32;
-#endif
-			Z.hi ^= Htable[nlo].hi;
-			Z.lo ^= Htable[nlo].lo;
-		}
-#else
-    /*
-     * Extra 256+16 bytes per-key plus 512 bytes shared tables
-     * [should] give ~50% improvement... One could have PACK()-ed
-     * the rem_8bit even here, but the priority is to minimize
-     * cache footprint...
-     */
-	u128 Hshr4[16];	/* Htable shifted right by 4 bits */
-	u8 Hshl4[16];	/* Htable shifted left  by 4 bits */
-	static const unsigned short rem_8bit[256] = {
-		0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
-		0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
-		0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
-		0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
-		0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
-		0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
-		0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
-		0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
-		0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
-		0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
-		0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
-		0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
-		0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
-		0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
-		0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
-		0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
-		0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
-		0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
-		0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
-		0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
-		0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
-		0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
-		0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
-		0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
-		0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
-		0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
-		0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
-		0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
-		0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
-		0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
-		0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
-		0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
-    /*
-     * This pre-processing phase slows down procedure by approximately
-     * same time as it makes each loop spin faster. In other words
-     * single block performance is approximately same as straightforward
-     * "4-bit" implementation, and then it goes only faster...
-     */
-	for (cnt = 0; cnt < 16; ++cnt) {
-		Z.hi = Htable[cnt].hi;
-		Z.lo = Htable[cnt].lo;
-		Hshr4[cnt].lo = (Z.hi << 60)|(Z.lo >> 4);
-		Hshr4[cnt].hi = (Z.hi >> 4);
-		Hshl4[cnt] = (u8)(Z.lo << 4);
-	}
-
-	do {
-		for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
-			nlo = ((const u8 *)Xi)[cnt];
-			nlo ^= inp[cnt];
-			nhi = nlo >> 4;
-			nlo &= 0xf;
-
-			Z.hi ^= Htable[nlo].hi;
-			Z.lo ^= Htable[nlo].lo;
-
-			rem = (size_t)Z.lo & 0xff;
-
-			Z.lo = (Z.hi << 56)|(Z.lo >> 8);
-			Z.hi = (Z.hi >> 8);
-
-			Z.hi ^= Hshr4[nhi].hi;
-			Z.lo ^= Hshr4[nhi].lo;
-			Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
-		}
-
-		nlo = ((const u8 *)Xi)[0];
-		nlo ^= inp[0];
-		nhi = nlo >> 4;
-		nlo &= 0xf;
-
-		Z.hi ^= Htable[nlo].hi;
-		Z.lo ^= Htable[nlo].lo;
-
-		rem = (size_t)Z.lo & 0xf;
-
-		Z.lo = (Z.hi << 60)|(Z.lo >> 4);
-		Z.hi = (Z.hi >> 4);
-
-		Z.hi ^= Htable[nhi].hi;
-		Z.lo ^= Htable[nhi].lo;
-		Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
-#endif
-
-		Xi[0] = htobe64(Z.hi);
-		Xi[1] = htobe64(Z.lo);
-	} while (inp += 16, len -= 16);
-}
-#endif
-#else
-void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
-void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
-    size_t len);
-#endif
-
-#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
-#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
-#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
-/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
- * trashing effect. In other words idea is to hash data while it's
- * still in L1 cache after encryption pass... */
-#define GHASH_CHUNK       (3*1024)
-#endif
-
-#else	/* TABLE_BITS */
-
-static void
-gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
-{
-	u128 V, Z = { 0,0 };
-	long X;
-	int i, j;
-	const long *xi = (const long *)Xi;
-
-	V.hi = H[0];	/* H is in host byte order, no byte swapping */
-	V.lo = H[1];
-
-	for (j = 0; j < 16/sizeof(long); ++j) {
-#if BYTE_ORDER == LITTLE_ENDIAN
-#if SIZE_MAX == 0xffffffffffffffff
-#ifdef BSWAP8
-		X = (long)(BSWAP8(xi[j]));
-#else
-		const u8 *p = (const u8 *)(xi + j);
-		X = (long)((u64)GETU32(p) << 32|GETU32(p + 4));
-#endif
-#else
-		const u8 *p = (const u8 *)(xi + j);
-		X = (long)GETU32(p);
-#endif
-#else /* BIG_ENDIAN */
-		X = xi[j];
-#endif
-
-		for (i = 0; i < 8*sizeof(long); ++i, X <<= 1) {
-			u64 M = (u64)(X >> (8*sizeof(long) - 1));
-			Z.hi ^= V.hi & M;
-			Z.lo ^= V.lo & M;
-
-			REDUCE1BIT(V);
-		}
-	}
-
-	Xi[0] = htobe64(Z.hi);
-	Xi[1] = htobe64(Z.lo);
-}
-#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
-
-#endif
-
-#if	defined(GHASH_ASM) &&						\
-	(defined(__i386)	|| defined(__i386__)	||		\
-	 defined(__x86_64)	|| defined(__x86_64__)	||		\
-	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
-#include "x86_arch.h"
-#endif
-
-#if	TABLE_BITS==4 && defined(GHASH_ASM)
-# if	(defined(__i386)	|| defined(__i386__)	||		\
-	 defined(__x86_64)	|| defined(__x86_64__)	||		\
-	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
-#  define GHASH_ASM_X86_OR_64
-#  define GCM_FUNCREF_4BIT
-
-void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
-void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
-void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
-    size_t len);
-
-#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
-#   define GHASH_ASM_X86
-void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
-void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
-    size_t len);
-
-void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
-void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
-    size_t len);
-#  endif
-# elif defined(__arm__) || defined(__arm)
-#  include "arm_arch.h"
-#  if __ARM_ARCH__>=7 && !defined(__STRICT_ALIGNMENT)
-#   define GHASH_ASM_ARM
-#   define GCM_FUNCREF_4BIT
-void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
-void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
-    size_t len);
-#  endif
-# endif
-#endif
-
-#ifdef GCM_FUNCREF_4BIT
-# undef  GCM_MUL
-# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
-# ifdef GHASH
-#  undef  GHASH
-#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
-# endif
-#endif
-
-void
-CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
-{
-	memset(ctx, 0, sizeof(*ctx));
-	ctx->block = block;
-	ctx->key = key;
-
-	(*block)(ctx->H.c, ctx->H.c, key);
-
-	/* H is stored in host byte order */
-	ctx->H.u[0] = be64toh(ctx->H.u[0]);
-	ctx->H.u[1] = be64toh(ctx->H.u[1]);
-
-#if	TABLE_BITS==8
-	gcm_init_8bit(ctx->Htable, ctx->H.u);
-#elif	TABLE_BITS==4
-# if	defined(GHASH_ASM_X86_OR_64)
-#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
-	/* check FXSR and PCLMULQDQ bits */
-	if ((crypto_cpu_caps_ia32() & (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) ==
-	    (CPUCAP_MASK_FXSR | CPUCAP_MASK_PCLMUL)) {
-		gcm_init_clmul(ctx->Htable, ctx->H.u);
-		ctx->gmult = gcm_gmult_clmul;
-		ctx->ghash = gcm_ghash_clmul;
-		return;
-	}
-#  endif
-	gcm_init_4bit(ctx->Htable, ctx->H.u);
-#  if	defined(GHASH_ASM_X86)			/* x86 only */
-#   if	defined(OPENSSL_IA32_SSE2)
-	if (crypto_cpu_caps_ia32() & CPUCAP_MASK_SSE) {	/* check SSE bit */
-#   else
-	if (crypto_cpu_caps_ia32() & CPUCAP_MASK_MMX) {	/* check MMX bit */
-#   endif
-		ctx->gmult = gcm_gmult_4bit_mmx;
-		ctx->ghash = gcm_ghash_4bit_mmx;
-	} else {
-		ctx->gmult = gcm_gmult_4bit_x86;
-		ctx->ghash = gcm_ghash_4bit_x86;
-	}
-#  else
-	ctx->gmult = gcm_gmult_4bit;
-	ctx->ghash = gcm_ghash_4bit;
-#  endif
-# elif	defined(GHASH_ASM_ARM)
-	if (OPENSSL_armcap_P & ARMV7_NEON) {
-		ctx->gmult = gcm_gmult_neon;
-		ctx->ghash = gcm_ghash_neon;
-	} else {
-		gcm_init_4bit(ctx->Htable, ctx->H.u);
-		ctx->gmult = gcm_gmult_4bit;
-		ctx->ghash = gcm_ghash_4bit;
-	}
-# else
-	gcm_init_4bit(ctx->Htable, ctx->H.u);
-# endif
-#endif
-}
-LCRYPTO_ALIAS(CRYPTO_gcm128_init);
-
-void
-CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, size_t len)
-{
-	unsigned int ctr;
-#ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
-#endif
-
-	ctx->Yi.u[0] = 0;
-	ctx->Yi.u[1] = 0;
-	ctx->Xi.u[0] = 0;
-	ctx->Xi.u[1] = 0;
-	ctx->len.u[0] = 0;	/* AAD length */
-	ctx->len.u[1] = 0;	/* message length */
-	ctx->ares = 0;
-	ctx->mres = 0;
-
-	if (len == 12) {
-		memcpy(ctx->Yi.c, iv, 12);
-		ctx->Yi.c[15] = 1;
-		ctr = 1;
-	} else {
-		size_t i;
-		u64 len0 = len;
-
-		while (len >= 16) {
-			for (i = 0; i < 16; ++i)
-				ctx->Yi.c[i] ^= iv[i];
-			GCM_MUL(ctx, Yi);
-			iv += 16;
-			len -= 16;
-		}
-		if (len) {
-			for (i = 0; i < len; ++i)
-				ctx->Yi.c[i] ^= iv[i];
-			GCM_MUL(ctx, Yi);
-		}
-		len0 <<= 3;
-		ctx->Yi.u[1] ^= htobe64(len0);
-
-		GCM_MUL(ctx, Yi);
-
-		ctr = be32toh(ctx->Yi.d[3]);
-	}
-
-	(*ctx->block)(ctx->Yi.c, ctx->EK0.c, ctx->key);
-	++ctr;
-	ctx->Yi.d[3] = htobe32(ctr);
-}
-LCRYPTO_ALIAS(CRYPTO_gcm128_setiv);
-
-int
-CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, size_t len)
-{
-	size_t i;
-	unsigned int n;
-	u64 alen = ctx->len.u[0];
-#ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
-	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
-	    const u8 *inp, size_t len) = ctx->ghash;
-# endif
-#endif
-
-	if (ctx->len.u[1])
-		return -2;
-
-	alen += len;
-	if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
-		return -1;
-	ctx->len.u[0] = alen;
-
-	n = ctx->ares;
-	if (n) {
-		while (n && len) {
-			ctx->Xi.c[n] ^= *(aad++);
-			--len;
-			n = (n + 1) % 16;
-		}
-		if (n == 0)
-			GCM_MUL(ctx, Xi);
-		else {
-			ctx->ares = n;
-			return 0;
-		}
-	}
-
-#ifdef GHASH
-	if ((i = (len & (size_t)-16))) {
-		GHASH(ctx, aad, i);
-		aad += i;
-		len -= i;
-	}
-#else
-	while (len >= 16) {
-		for (i = 0; i < 16; ++i)
-			ctx->Xi.c[i] ^= aad[i];
-		GCM_MUL(ctx, Xi);
-		aad += 16;
-		len -= 16;
-	}
-#endif
-	if (len) {
-		n = (unsigned int)len;
-		for (i = 0; i < len; ++i)
-			ctx->Xi.c[i] ^= aad[i];
-	}
-
-	ctx->ares = n;
-	return 0;
-}
-LCRYPTO_ALIAS(CRYPTO_gcm128_aad);
-
-int
-CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
-    const unsigned char *in, unsigned char *out,
-    size_t len)
-{
-	unsigned int n, ctr;
-	size_t i;
-	u64 mlen = ctx->len.u[1];
-	block128_f block = ctx->block;
-	void *key = ctx->key;
-#ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
-	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
-	    const u8 *inp, size_t len) = ctx->ghash;
-# endif
-#endif
-
-	mlen += len;
-	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
-		return -1;
-	ctx->len.u[1] = mlen;
-
-	if (ctx->ares) {
-		/* First call to encrypt finalizes GHASH(AAD) */
-		GCM_MUL(ctx, Xi);
-		ctx->ares = 0;
-	}
-
-	ctr = be32toh(ctx->Yi.d[3]);
-
-	n = ctx->mres;
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (16 % sizeof(size_t) == 0)
-		do {	/* always true actually */
-			if (n) {
-				while (n && len) {
-					ctx->Xi.c[n] ^= *(out++) = *(in++) ^
-					    ctx->EKi.c[n];
-					--len;
-					n = (n + 1) % 16;
-				}
-				if (n == 0)
-					GCM_MUL(ctx, Xi);
-				else {
-					ctx->mres = n;
-					return 0;
-				}
-			}
-#ifdef __STRICT_ALIGNMENT
-			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
-				break;
-#endif
-#if defined(GHASH) && defined(GHASH_CHUNK)
-			while (len >= GHASH_CHUNK) {
-				size_t j = GHASH_CHUNK;
-
-				while (j) {
-					size_t *out_t = (size_t *)out;
-					const size_t *in_t = (const size_t *)in;
-
-					(*block)(ctx->Yi.c, ctx->EKi.c, key);
-					++ctr;
-					ctx->Yi.d[3] = htobe32(ctr);
-
-					for (i = 0; i < 16/sizeof(size_t); ++i)
-						out_t[i] = in_t[i] ^
-						    ctx->EKi.t[i];
-					out += 16;
-					in += 16;
-					j -= 16;
-				}
-				GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
-				len -= GHASH_CHUNK;
-			}
-			if ((i = (len & (size_t)-16))) {
-				size_t j = i;
-
-				while (len >= 16) {
-					size_t *out_t = (size_t *)out;
-					const size_t *in_t = (const size_t *)in;
-
-					(*block)(ctx->Yi.c, ctx->EKi.c, key);
-					++ctr;
-					ctx->Yi.d[3] = htobe32(ctr);
-
-					for (i = 0; i < 16/sizeof(size_t); ++i)
-						out_t[i] = in_t[i] ^
-						    ctx->EKi.t[i];
-					out += 16;
-					in += 16;
-					len -= 16;
-				}
-				GHASH(ctx, out - j, j);
-			}
-#else
-			while (len >= 16) {
-				size_t *out_t = (size_t *)out;
-				const size_t *in_t = (const size_t *)in;
-
-				(*block)(ctx->Yi.c, ctx->EKi.c, key);
-				++ctr;
-				ctx->Yi.d[3] = htobe32(ctr);
-
-				for (i = 0; i < 16/sizeof(size_t); ++i)
-					ctx->Xi.t[i] ^=
-					    out_t[i] = in_t[i] ^ ctx->EKi.t[i];
-				GCM_MUL(ctx, Xi);
-				out += 16;
-				in += 16;
-				len -= 16;
-			}
-#endif
-			if (len) {
-				(*block)(ctx->Yi.c, ctx->EKi.c, key);
-				++ctr;
-				ctx->Yi.d[3] = htobe32(ctr);
-
-				while (len--) {
-					ctx->Xi.c[n] ^= out[n] = in[n] ^
-					    ctx->EKi.c[n];
-					++n;
-				}
-			}
-
-			ctx->mres = n;
-			return 0;
-		} while (0);
-#endif
-	for (i = 0; i < len; ++i) {
-		if (n == 0) {
-			(*block)(ctx->Yi.c, ctx->EKi.c, key);
-			++ctr;
-			ctx->Yi.d[3] = htobe32(ctr);
-		}
-		ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
-		n = (n + 1) % 16;
-		if (n == 0)
-			GCM_MUL(ctx, Xi);
-	}
-
-	ctx->mres = n;
-	return 0;
-}
-LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt);
-
-int
-CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
-    const unsigned char *in, unsigned char *out,
-    size_t len)
-{
-	unsigned int n, ctr;
-	size_t i;
-	u64 mlen = ctx->len.u[1];
-	block128_f block = ctx->block;
-	void *key = ctx->key;
-#ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
-	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
-	    const u8 *inp, size_t len) = ctx->ghash;
-# endif
-#endif
-
-	mlen += len;
-	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
-		return -1;
-	ctx->len.u[1] = mlen;
-
-	if (ctx->ares) {
-		/* First call to decrypt finalizes GHASH(AAD) */
-		GCM_MUL(ctx, Xi);
-		ctx->ares = 0;
-	}
-
-	ctr = be32toh(ctx->Yi.d[3]);
-
-	n = ctx->mres;
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (16 % sizeof(size_t) == 0)
-		do {	/* always true actually */
-			if (n) {
-				while (n && len) {
-					u8 c = *(in++);
-					*(out++) = c ^ ctx->EKi.c[n];
-					ctx->Xi.c[n] ^= c;
-					--len;
-					n = (n + 1) % 16;
-				}
-				if (n == 0)
-					GCM_MUL(ctx, Xi);
-				else {
-					ctx->mres = n;
-					return 0;
-				}
-			}
-#ifdef __STRICT_ALIGNMENT
-			if (((size_t)in|(size_t)out) % sizeof(size_t) != 0)
-				break;
-#endif
-#if defined(GHASH) && defined(GHASH_CHUNK)
-			while (len >= GHASH_CHUNK) {
-				size_t j = GHASH_CHUNK;
-
-				GHASH(ctx, in, GHASH_CHUNK);
-				while (j) {
-					size_t *out_t = (size_t *)out;
-					const size_t *in_t = (const size_t *)in;
-
-					(*block)(ctx->Yi.c, ctx->EKi.c, key);
-					++ctr;
-					ctx->Yi.d[3] = htobe32(ctr);
-
-					for (i = 0; i < 16/sizeof(size_t); ++i)
-						out_t[i] = in_t[i] ^
-						    ctx->EKi.t[i];
-					out += 16;
-					in += 16;
-					j -= 16;
-				}
-				len -= GHASH_CHUNK;
-			}
-			if ((i = (len & (size_t)-16))) {
-				GHASH(ctx, in, i);
-				while (len >= 16) {
-					size_t *out_t = (size_t *)out;
-					const size_t *in_t = (const size_t *)in;
-
-					(*block)(ctx->Yi.c, ctx->EKi.c, key);
-					++ctr;
-					ctx->Yi.d[3] = htobe32(ctr);
-
-					for (i = 0; i < 16/sizeof(size_t); ++i)
-						out_t[i] = in_t[i] ^
-						    ctx->EKi.t[i];
-					out += 16;
-					in += 16;
-					len -= 16;
-				}
-			}
-#else
-			while (len >= 16) {
-				size_t *out_t = (size_t *)out;
-				const size_t *in_t = (const size_t *)in;
-
-				(*block)(ctx->Yi.c, ctx->EKi.c, key);
-				++ctr;
-				ctx->Yi.d[3] = htobe32(ctr);
-
-				for (i = 0; i < 16/sizeof(size_t); ++i) {
-					size_t c = in[i];
-					out[i] = c ^ ctx->EKi.t[i];
-					ctx->Xi.t[i] ^= c;
-				}
-				GCM_MUL(ctx, Xi);
-				out += 16;
-				in += 16;
-				len -= 16;
-			}
-#endif
-			if (len) {
-				(*block)(ctx->Yi.c, ctx->EKi.c, key);
-				++ctr;
-				ctx->Yi.d[3] = htobe32(ctr);
-
-				while (len--) {
-					u8 c = in[n];
-					ctx->Xi.c[n] ^= c;
-					out[n] = c ^ ctx->EKi.c[n];
-					++n;
-				}
-			}
-
-			ctx->mres = n;
-			return 0;
-		} while (0);
-#endif
-	for (i = 0; i < len; ++i) {
-		u8 c;
-		if (n == 0) {
-			(*block)(ctx->Yi.c, ctx->EKi.c, key);
-			++ctr;
-			ctx->Yi.d[3] = htobe32(ctr);
-		}
-		c = in[i];
-		out[i] = c ^ ctx->EKi.c[n];
-		ctx->Xi.c[n] ^= c;
-		n = (n + 1) % 16;
-		if (n == 0)
-			GCM_MUL(ctx, Xi);
-	}
-
-	ctx->mres = n;
-	return 0;
-}
-LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt);
-
-int
-CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
-    const unsigned char *in, unsigned char *out,
-    size_t len, ctr128_f stream)
-{
-	unsigned int n, ctr;
-	size_t i;
-	u64 mlen = ctx->len.u[1];
-	void *key = ctx->key;
-#ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
-	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
-	    const u8 *inp, size_t len) = ctx->ghash;
-# endif
-#endif
-
-	mlen += len;
-	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
-		return -1;
-	ctx->len.u[1] = mlen;
-
-	if (ctx->ares) {
-		/* First call to encrypt finalizes GHASH(AAD) */
-		GCM_MUL(ctx, Xi);
-		ctx->ares = 0;
-	}
-
-	ctr = be32toh(ctx->Yi.d[3]);
-
-	n = ctx->mres;
-	if (n) {
-		while (n && len) {
-			ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
-			--len;
-			n = (n + 1) % 16;
-		}
-		if (n == 0)
-			GCM_MUL(ctx, Xi);
-		else {
-			ctx->mres = n;
-			return 0;
-		}
-	}
-#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
-	while (len >= GHASH_CHUNK) {
-		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
-		ctr += GHASH_CHUNK/16;
-		ctx->Yi.d[3] = htobe32(ctr);
-		GHASH(ctx, out, GHASH_CHUNK);
-		out += GHASH_CHUNK;
-		in += GHASH_CHUNK;
-		len -= GHASH_CHUNK;
-	}
-#endif
-	if ((i = (len & (size_t)-16))) {
-		size_t j = i/16;
-
-		(*stream)(in, out, j, key, ctx->Yi.c);
-		ctr += (unsigned int)j;
-		ctx->Yi.d[3] = htobe32(ctr);
-		in += i;
-		len -= i;
-#if defined(GHASH)
-		GHASH(ctx, out, i);
-		out += i;
-#else
-		while (j--) {
-			for (i = 0; i < 16; ++i)
-				ctx->Xi.c[i] ^= out[i];
-			GCM_MUL(ctx, Xi);
-			out += 16;
-		}
-#endif
-	}
-	if (len) {
-		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
-		++ctr;
-		ctx->Yi.d[3] = htobe32(ctr);
-		while (len--) {
-			ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
-			++n;
-		}
-	}
-
-	ctx->mres = n;
-	return 0;
-}
-LCRYPTO_ALIAS(CRYPTO_gcm128_encrypt_ctr32);
-
-int
-CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
-    const unsigned char *in, unsigned char *out,
-    size_t len, ctr128_f stream)
-{
-	unsigned int n, ctr;
-	size_t i;
-	u64 mlen = ctx->len.u[1];
-	void *key = ctx->key;
-#ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
-# ifdef GHASH
-	void (*gcm_ghash_p)(u64 Xi[2], const u128 Htable[16],
-	    const u8 *inp, size_t len) = ctx->ghash;
-# endif
-#endif
-
-	mlen += len;
-	if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
-		return -1;
-	ctx->len.u[1] = mlen;
-
-	if (ctx->ares) {
-		/* First call to decrypt finalizes GHASH(AAD) */
-		GCM_MUL(ctx, Xi);
-		ctx->ares = 0;
-	}
-
-	ctr = be32toh(ctx->Yi.d[3]);
-
-	n = ctx->mres;
-	if (n) {
-		while (n && len) {
-			u8 c = *(in++);
-			*(out++) = c ^ ctx->EKi.c[n];
-			ctx->Xi.c[n] ^= c;
-			--len;
-			n = (n + 1) % 16;
-		}
-		if (n == 0)
-			GCM_MUL(ctx, Xi);
-		else {
-			ctx->mres = n;
-			return 0;
-		}
-	}
-#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
-	while (len >= GHASH_CHUNK) {
-		GHASH(ctx, in, GHASH_CHUNK);
-		(*stream)(in, out, GHASH_CHUNK/16, key, ctx->Yi.c);
-		ctr += GHASH_CHUNK/16;
-		ctx->Yi.d[3] = htobe32(ctr);
-		out += GHASH_CHUNK;
-		in += GHASH_CHUNK;
-		len -= GHASH_CHUNK;
-	}
-#endif
-	if ((i = (len & (size_t)-16))) {
-		size_t j = i/16;
-
-#if defined(GHASH)
-		GHASH(ctx, in, i);
-#else
-		while (j--) {
-			size_t k;
-			for (k = 0; k < 16; ++k)
-				ctx->Xi.c[k] ^= in[k];
-			GCM_MUL(ctx, Xi);
-			in += 16;
-		}
-		j = i/16;
-		in -= i;
-#endif
-		(*stream)(in, out, j, key, ctx->Yi.c);
-		ctr += (unsigned int)j;
-		ctx->Yi.d[3] = htobe32(ctr);
-		out += i;
-		in += i;
-		len -= i;
-	}
-	if (len) {
-		(*ctx->block)(ctx->Yi.c, ctx->EKi.c, key);
-		++ctr;
-		ctx->Yi.d[3] = htobe32(ctr);
-		while (len--) {
-			u8 c = in[n];
-			ctx->Xi.c[n] ^= c;
-			out[n] = c ^ ctx->EKi.c[n];
-			++n;
-		}
-	}
-
-	ctx->mres = n;
-	return 0;
-}
-LCRYPTO_ALIAS(CRYPTO_gcm128_decrypt_ctr32);
-
-int
-CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
-    size_t len)
-{
-	u64 alen = ctx->len.u[0] << 3;
-	u64 clen = ctx->len.u[1] << 3;
-#ifdef GCM_FUNCREF_4BIT
-	void (*gcm_gmult_p)(u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
-#endif
-
-	if (ctx->mres || ctx->ares)
-		GCM_MUL(ctx, Xi);
-
-	ctx->Xi.u[0] ^= htobe64(alen);
-	ctx->Xi.u[1] ^= htobe64(clen);
-	GCM_MUL(ctx, Xi);
-
-	ctx->Xi.u[0] ^= ctx->EK0.u[0];
-	ctx->Xi.u[1] ^= ctx->EK0.u[1];
-
-	if (tag && len <= sizeof(ctx->Xi))
-		return memcmp(ctx->Xi.c, tag, len);
-	else
-		return -1;
-}
-LCRYPTO_ALIAS(CRYPTO_gcm128_finish);
-
-void
-CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
-{
-	CRYPTO_gcm128_finish(ctx, NULL, 0);
-	memcpy(tag, ctx->Xi.c,
-	    len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
-}
-LCRYPTO_ALIAS(CRYPTO_gcm128_tag);
-
-GCM128_CONTEXT *
-CRYPTO_gcm128_new(void *key, block128_f block)
-{
-	GCM128_CONTEXT *ret;
-
-	if ((ret = malloc(sizeof(GCM128_CONTEXT))))
-		CRYPTO_gcm128_init(ret, key, block);
-
-	return ret;
-}
-LCRYPTO_ALIAS(CRYPTO_gcm128_new);
-
-void
-CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
-{
-	freezero(ctx, sizeof(*ctx));
-}
-LCRYPTO_ALIAS(CRYPTO_gcm128_release);
diff --git a/src/lib/libcrypto/modes/modes.h b/src/lib/libcrypto/modes/modes.h
deleted file mode 100644
index 53fa9afb0d..0000000000
--- a/src/lib/libcrypto/modes/modes.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/* $OpenBSD: modes.h,v 1.6 2023/07/08 14:55:36 beck Exp $ */
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project. All rights reserved.
- *
- * Rights for redistribution and usage in source and binary
- * forms are granted according to the OpenSSL license.
- */
-
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*block128_f)(const unsigned char in[16],
-    unsigned char out[16],
-    const void *key);
-
-typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], int enc);
-
-typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
-    size_t blocks, const void *key,
-    const unsigned char ivec[16]);
-
-typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
-    size_t blocks, const void *key,
-    const unsigned char ivec[16], unsigned char cmac[16]);
-
-void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], block128_f block);
-void CRYPTO_cbc128_decrypt(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], block128_f block);
-
-void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], unsigned char ecount_buf[16],
-    unsigned int *num, block128_f block);
-
-void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], unsigned char ecount_buf[16],
-    unsigned int *num, ctr128_f ctr);
-
-void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], int *num,
-    block128_f block);
-
-void CRYPTO_cfb128_encrypt(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], int *num,
-    int enc, block128_f block);
-void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out,
-    size_t length, const void *key,
-    unsigned char ivec[16], int *num,
-    int enc, block128_f block);
-void CRYPTO_cfb128_1_encrypt(const unsigned char *in, unsigned char *out,
-    size_t bits, const void *key,
-    unsigned char ivec[16], int *num,
-    int enc, block128_f block);
-
-typedef struct gcm128_context GCM128_CONTEXT;
-
-GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
-void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block);
-void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
-    size_t len);
-int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
-    size_t len);
-int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
-    const unsigned char *in, unsigned char *out,
-    size_t len);
-int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
-    const unsigned char *in, unsigned char *out,
-    size_t len);
-int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
-    const unsigned char *in, unsigned char *out,
-    size_t len, ctr128_f stream);
-int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
-    const unsigned char *in, unsigned char *out,
-    size_t len, ctr128_f stream);
-int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
-    size_t len);
-void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
-void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
-
-typedef struct ccm128_context CCM128_CONTEXT;
-
-void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
-    unsigned int M, unsigned int L, void *key, block128_f block);
-int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
-    const unsigned char *nonce, size_t nlen, size_t mlen);
-void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
-    const unsigned char *aad, size_t alen);
-int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
-    const unsigned char *inp, unsigned char *out, size_t len);
-int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
-    const unsigned char *inp, unsigned char *out, size_t len);
-int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
-    const unsigned char *inp, unsigned char *out, size_t len,
-    ccm128_f stream);
-int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
-    const unsigned char *inp, unsigned char *out, size_t len,
-    ccm128_f stream);
-size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
-
-typedef struct xts128_context XTS128_CONTEXT;
-
-int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
-    const unsigned char *inp, unsigned char *out, size_t len, int enc);
-
-#ifdef __cplusplus
-}
-#endif
diff --git a/src/lib/libcrypto/modes/modes_local.h b/src/lib/libcrypto/modes/modes_local.h
deleted file mode 100644
index 511855f2e0..0000000000
--- a/src/lib/libcrypto/modes/modes_local.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/* $OpenBSD: modes_local.h,v 1.2 2023/07/08 14:55:36 beck Exp $ */
-/* ====================================================================
- * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use is governed by OpenSSL license.
- * ====================================================================
- */
-
-#include <endian.h>
-
-#include <openssl/opensslconf.h>
-
-#include <openssl/modes.h>
-
-__BEGIN_HIDDEN_DECLS
-
-#if defined(_LP64)
-typedef long i64;
-typedef unsigned long u64;
-#define U64(C) C##UL
-#else
-typedef long long i64;
-typedef unsigned long long u64;
-#define U64(C) C##ULL
-#endif
-
-typedef unsigned int u32;
-typedef unsigned char u8;
-
-#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
-#if defined(__GNUC__) && __GNUC__>=2
-# if defined(__x86_64) || defined(__x86_64__)
-#  define BSWAP8(x) ({	u64 ret=(x);					\
-			asm ("bswapq %0"				\
-			: "+r"(ret));	ret;		})
-#  define BSWAP4(x) ({	u32 ret=(x);					\
-			asm ("bswapl %0"				\
-			: "+r"(ret));	ret;		})
-# elif (defined(__i386) || defined(__i386__))
-#  define BSWAP8(x) ({	u32 lo=(u64)(x)>>32,hi=(x);			\
-			asm ("bswapl %0; bswapl %1"			\
-			: "+r"(hi),"+r"(lo));				\
-			(u64)hi<<32|lo;			})
-#  define BSWAP4(x) ({	u32 ret=(x);					\
-			asm ("bswapl %0"				\
-			: "+r"(ret));	ret;		})
-# elif (defined(__arm__) || defined(__arm)) && !defined(__STRICT_ALIGNMENT)
-#  define BSWAP8(x) ({	u32 lo=(u64)(x)>>32,hi=(x);			\
-			asm ("rev %0,%0; rev %1,%1"			\
-			: "+r"(hi),"+r"(lo));				\
-			(u64)hi<<32|lo;			})
-#  define BSWAP4(x) ({	u32 ret;					\
-			asm ("rev %0,%1"				\
-			: "=r"(ret) : "r"((u32)(x)));			\
-			ret;				})
-# endif
-#endif
-#endif
-
-#if defined(BSWAP4) && !defined(__STRICT_ALIGNMENT)
-#define GETU32(p)	BSWAP4(*(const u32 *)(p))
-#define PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
-#else
-#define GETU32(p)	((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
-#define PUTU32(p,v)	((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
-#endif
-
-/* GCM definitions */
-
-typedef struct {
-	u64 hi, lo;
-} u128;
-
-#ifdef	TABLE_BITS
-#undef	TABLE_BITS
-#endif
-/*
- * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
- * never be set to 8 [or 1]. For further information see gcm128.c.
- */
-#define	TABLE_BITS 4
-
-struct gcm128_context {
-	/* Following 6 names follow names in GCM specification */
-	union {
-		u64 u[2];
-		u32 d[4];
-		u8 c[16];
-		size_t t[16/sizeof(size_t)];
-	} Yi, EKi, EK0, len, Xi, H;
-	/* Relative position of Xi, H and pre-computed Htable is used
-	 * in some assembler modules, i.e. don't change the order! */
-#if TABLE_BITS==8
-	u128 Htable[256];
-#else
-	u128 Htable[16];
-	void (*gmult)(u64 Xi[2], const u128 Htable[16]);
-	void (*ghash)(u64 Xi[2], const u128 Htable[16], const u8 *inp,
-	    size_t len);
-#endif
-	unsigned int mres, ares;
-	block128_f block;
-	void *key;
-};
-
-struct xts128_context {
-	void      *key1, *key2;
-	block128_f block1, block2;
-};
-
-struct ccm128_context {
-	union {
-		u64 u[2];
-		u8 c[16];
-	} nonce, cmac;
-	u64 blocks;
-	block128_f block;
-	void *key;
-};
-
-__END_HIDDEN_DECLS
diff --git a/src/lib/libcrypto/modes/ofb128.c b/src/lib/libcrypto/modes/ofb128.c
deleted file mode 100644
index 42afd29d58..0000000000
--- a/src/lib/libcrypto/modes/ofb128.c
+++ /dev/null
@@ -1,124 +0,0 @@
-/* $OpenBSD: ofb128.c,v 1.7 2023/07/08 14:56:54 beck Exp $ */
-/* ====================================================================
- * Copyright (c) 2008 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- */
-
-#include <openssl/crypto.h>
-#include "modes_local.h"
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-
-/*
- * OFB mode with a 128-bit block cipher.
- *
- * XORs len bytes from in with the keystream and stores the result in out.
- * The keystream is produced by applying block() to ivec in place, over and
- * over, and never depends on the data, so the same call both encrypts and
- * decrypts.
- *
- * *num is the offset of the next unused keystream byte in ivec.  It is
- * read on entry and written back on return, which lets a stream be
- * processed in pieces of any size.
- *
- * *num must lie in [0, 15].  Any other value would index outside ivec[];
- * as there is no return value to report that, *num is set to -1 and
- * nothing is processed.
- */
-void
-CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
-    size_t len, const void *key,
-    unsigned char ivec[16], int *num,
-    block128_f block)
-{
-	unsigned int n;
-	size_t l = 0;
-
-	if (*num < 0 || *num > 15) {
-		/* There is no good way to signal an error return from here. */
-		*num = -1;
-		return;
-	}
-	n = *num;
-
-#if !defined(OPENSSL_SMALL_FOOTPRINT)
-	if (16 % sizeof(size_t) == 0)
-		do { /* always true actually */
-			/* Use up what is left of the current keystream block. */
-			while (n && len) {
-				*(out++) = *(in++) ^ ivec[n];
-				--len;
-				n = (n + 1) % 16;
-			}
-#ifdef __STRICT_ALIGNMENT
-			/* Word access needs aligned pointers, else go bytewise. */
-			if (((size_t)in|(size_t)out|(size_t)ivec) %
-			    sizeof(size_t) != 0)
-				break;
-#endif
-			/* Whole blocks, one size_t at a time; n is 0 here. */
-			while (len >= 16) {
-				(*block)(ivec, ivec, key);
-				for (; n < 16; n += sizeof(size_t))
-					*(size_t *)(out + n) =
-					    *(const size_t *)(in + n) ^
-					    *(const size_t *)(ivec + n);
-				len -= 16;
-				out += 16;
-				in += 16;
-				n = 0;
-			}
-			/* Trailing partial block: fresh keystream, bytewise. */
-			if (len) {
-				(*block)(ivec, ivec, key);
-				while (len--) {
-					out[n] = in[n] ^ ivec[n];
-					++n;
-				}
-			}
-			*num = n;
-			return;
-		} while (0);
-	/* the rest would be commonly eliminated by x86* compiler */
-#endif
-	/* Bytewise path; a new keystream block is made whenever n wraps to 0. */
-	while (l < len) {
-		if (n == 0) {
-			(*block)(ivec, ivec, key);
-		}
-		out[l] = in[l] ^ ivec[n];
-		++l;
-		n = (n + 1) % 16;
-	}
-
-	*num = n;
-}
-LCRYPTO_ALIAS(CRYPTO_ofb128_encrypt);
diff --git a/src/lib/libcrypto/modes/xts128.c b/src/lib/libcrypto/modes/xts128.c
deleted file mode 100644
index 7516acf850..0000000000
--- a/src/lib/libcrypto/modes/xts128.c
+++ /dev/null
@@ -1,197 +0,0 @@
-/* $OpenBSD: xts128.c,v 1.12 2023/07/08 14:56:54 beck Exp $ */
-/* ====================================================================
- * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    openssl-core@openssl.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- */
-
-#include <openssl/crypto.h>
-#include "modes_local.h"
-
-#include <endian.h>
-#include <string.h>
-
-#ifndef MODES_DEBUG
-# ifndef NDEBUG
-#  define NDEBUG
-# endif
-#endif
-/*
- * XTS with a 128-bit block cipher: processes len bytes from inp into out,
- * encrypting when enc is non-zero and decrypting otherwise.
- *
- * ctx->block2 with ctx->key2 turns iv into the initial tweak, ctx->block1
- * with ctx->key1 processes the data.  A trailing partial block is handled
- * by ciphertext stealing.
- *
- * Returns -1 if len is less than 16, otherwise 0.
- */
-int
-CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
-    const unsigned char *inp, unsigned char *out,
-    size_t len, int enc)
-{
-	union {
-		u64 u[2];
-		u32 d[4];
-		u8 c[16];
-	} tweak, scratch;
-	unsigned int i;
-	/* At least one full block is required. */
-	if (len < 16)
-		return -1;
-	/* Initial tweak: the IV, encrypted in place by block2 under key2. */
-	memcpy(tweak.c, iv, 16);
-
-	(*ctx->block2)(tweak.c, tweak.c, ctx->key2);
-	/*
-	 * When decrypting a length that is not a multiple of 16, hold back
-	 * one full block so that the loop stops before the last full block;
-	 * that block and the partial tail are handled together below.
-	 */
-	if (!enc && (len % 16))
-		len -= 16;
-	/*
-	 * Full blocks: XOR the input with the tweak, run block1, XOR the
-	 * result with the tweak again.  scratch keeps the output block.
-	 */
-	while (len >= 16) {
-#ifdef __STRICT_ALIGNMENT
-		memcpy(scratch.c, inp, 16);
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
-#else
-		scratch.u[0] = ((u64 *)inp)[0] ^ tweak.u[0];
-		scratch.u[1] = ((u64 *)inp)[1] ^ tweak.u[1];
-#endif
-		(*ctx->block1)(scratch.c, scratch.c, ctx->key1);
-#ifdef __STRICT_ALIGNMENT
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
-		memcpy(out, scratch.c, 16);
-#else
-		((u64 *)out)[0] = scratch.u[0] ^= tweak.u[0];
-		((u64 *)out)[1] = scratch.u[1] ^= tweak.u[1];
-#endif
-		inp += 16;
-		out += 16;
-		len -= 16;
-		/* Nothing left: the input was a whole number of blocks. */
-		if (len == 0)
-			return 0;
-		/*
-		 * Advance the tweak for the next block: shift the 128-bit
-		 * value (c[0] is its least significant byte) left by one bit
-		 * and, if a bit is shifted out of the top, XOR 0x87 into the
-		 * lowest byte.
-		 * NOTE(review): the little-endian variant builds its mask by
-		 * right-shifting a possibly negative int, which is
-		 * implementation-defined in C.
-		 */
-#if BYTE_ORDER == LITTLE_ENDIAN
-		unsigned int carry, res;
-
-		res = 0x87 & (((int)tweak.d[3]) >> 31);
-		carry = (unsigned int)(tweak.u[0] >> 63);
-		tweak.u[0] = (tweak.u[0] << 1) ^ res;
-		tweak.u[1] = (tweak.u[1] << 1)|carry;
-#else /* BIG_ENDIAN */
-		size_t c;
-
-		for (c = 0, i = 0; i < 16; ++i) {
-			/*+ substitutes for |, because c is 1 bit */
-			c += ((size_t)tweak.c[i]) << 1;
-			tweak.c[i] = (u8)c;
-			c = c >> 8;
-		}
-		tweak.c[0] ^= (u8)(0x87 & (0 - c));
-#endif
-	}
-	if (enc) {	/* encrypt a trailing partial block of len bytes */
-		for (i = 0; i < len; ++i) {
-			u8 ch = inp[i];
-			out[i] = scratch.c[i];	/* head of the previous output block */
-			scratch.c[i] = ch;	/* replaced by the remaining input */
-		}
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
-		(*ctx->block1)(scratch.c, scratch.c, ctx->key1);
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
-		memcpy(out - 16, scratch.c, 16);	/* overwrites the previous block */
-	} else {	/* decrypt the held-back full block plus len tail bytes */
-		union {
-			u64 u[2];
-			u8 c[16];
-		} tweak1;
-		/*
-		 * tweak1 is tweak advanced by one step (same update as in the
-		 * loop above); tweak itself is kept for the final block.
-		 */
-#if BYTE_ORDER == LITTLE_ENDIAN
-		unsigned int carry, res;
-
-		res = 0x87 & (((int)tweak.d[3]) >> 31);
-		carry = (unsigned int)(tweak.u[0] >> 63);
-		tweak1.u[0] = (tweak.u[0] << 1) ^ res;
-		tweak1.u[1] = (tweak.u[1] << 1)|carry;
-#else
-		size_t c;
-
-		for (c = 0, i = 0; i < 16; ++i) {
-			/*+ substitutes for |, because c is 1 bit */
-			c += ((size_t)tweak.c[i]) << 1;
-			tweak1.c[i] = (u8)c;
-			c = c >> 8;
-		}
-		tweak1.c[0] ^= (u8)(0x87 & (0 - c));
-#endif
-#ifdef __STRICT_ALIGNMENT
-		memcpy(scratch.c, inp, 16);
-		scratch.u[0] ^= tweak1.u[0];
-		scratch.u[1] ^= tweak1.u[1];
-#else
-		scratch.u[0] = ((u64 *)inp)[0] ^ tweak1.u[0];
-		scratch.u[1] = ((u64 *)inp)[1] ^ tweak1.u[1];
-#endif
-		(*ctx->block1)(scratch.c, scratch.c, ctx->key1);
-		scratch.u[0] ^= tweak1.u[0];
-		scratch.u[1] ^= tweak1.u[1];
-		/*
-		 * The first len bytes of that result are the tail of the
-		 * output; the input tail takes their place in scratch, which
-		 * is then processed with the earlier tweak.
-		 */
-		for (i = 0; i < len; ++i) {
-			u8 ch = inp[16 + i];
-			out[16 + i] = scratch.c[i];
-			scratch.c[i] = ch;
-		}
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
-		(*ctx->block1)(scratch.c, scratch.c, ctx->key1);
-#ifdef __STRICT_ALIGNMENT
-		scratch.u[0] ^= tweak.u[0];
-		scratch.u[1] ^= tweak.u[1];
-		memcpy(out, scratch.c, 16);
-#else
-		((u64 *)out)[0] = scratch.u[0] ^ tweak.u[0];
-		((u64 *)out)[1] = scratch.u[1] ^ tweak.u[1];
-#endif
-	}
-
-	return 0;
-}
-LCRYPTO_ALIAS(CRYPTO_xts128_encrypt);
-- 
cgit v1.2.3-55-g6feb