From 228cae30b117c2493f69ad3c195341cd6ec8d430 Mon Sep 17 00:00:00 2001
From: djm <>
Date: Sat, 13 Oct 2012 21:23:50 +0000
Subject: import OpenSSL-1.0.1c

---
 src/lib/libcrypto/aes/aes.h                    |    5 +
 src/lib/libcrypto/aes/aes_core.c               |   12 +-
 src/lib/libcrypto/aes/aes_misc.c               |   21 +
 src/lib/libcrypto/aes/asm/aes-586.pl           |   14 +-
 src/lib/libcrypto/aes/asm/aes-armv4.pl         |  182 +-
 src/lib/libcrypto/aes/asm/aes-mips.pl          | 1611 +++++++++++++
 src/lib/libcrypto/aes/asm/aes-parisc.pl        | 1021 ++++++++
 src/lib/libcrypto/aes/asm/aes-ppc.pl           |  444 ++--
 src/lib/libcrypto/aes/asm/aes-s390x.pl         | 1071 ++++++++-
 src/lib/libcrypto/aes/asm/aes-sparcv9.pl       |    3 +-
 src/lib/libcrypto/aes/asm/aes-x86_64.pl        |   45 +-
 src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl | 1249 ++++++++++
 src/lib/libcrypto/aes/asm/aesni-x86.pl         | 2189 +++++++++++++++++
 src/lib/libcrypto/aes/asm/aesni-x86_64.pl      | 2498 +++++++++++++++++--
 src/lib/libcrypto/aes/asm/bsaes-x86_64.pl      | 3044 ++++++++++++++++++++++++
 src/lib/libcrypto/aes/asm/vpaes-x86.pl         |  903 +++++++
 src/lib/libcrypto/aes/asm/vpaes-x86_64.pl      | 1206 ++++++++++
 src/lib/libcrypto/arm_arch.h                   |   51 +
 src/lib/libcrypto/armcap.c                     |   80 +
 src/lib/libcrypto/armv4cpuid.S                 |  154 ++
 src/lib/libcrypto/asn1/a_d2i_fp.c              |   54 +-
 src/lib/libcrypto/asn1/a_digest.c              |    6 +-
 src/lib/libcrypto/asn1/a_int.c                 |    4 +-
 src/lib/libcrypto/asn1/a_sign.c                |  111 +-
 src/lib/libcrypto/asn1/a_verify.c              |   77 +-
 src/lib/libcrypto/asn1/ameth_lib.c             |   12 +-
 src/lib/libcrypto/asn1/asn1.h                  |    8 +-
 src/lib/libcrypto/asn1/asn1_err.c              |    5 +-
 src/lib/libcrypto/asn1/asn1_locl.h             |   11 +
 src/lib/libcrypto/asn1/asn_mime.c              |   23 +-
 src/lib/libcrypto/asn1/n_pkey.c                |   38 +-
 src/lib/libcrypto/asn1/p5_pbev2.c              |  143 +-
 src/lib/libcrypto/asn1/t_crl.c                 |    3 +-
 src/lib/libcrypto/asn1/t_x509.c                |   55 +-
 src/lib/libcrypto/asn1/tasn_prn.c              |   12 +-
 src/lib/libcrypto/asn1/x_algor.c               |   14 +
 src/lib/libcrypto/asn1/x_name.c                |    3 +-
 src/lib/libcrypto/asn1/x_pubkey.c              |   11 +-
 src/lib/libcrypto/bf/bf_skey.c                 |    8 +
 src/lib/libcrypto/bf/blowfish.h                |    4 +-
 src/lib/libcrypto/bio/b_sock.c                 |    2 +-
 src/lib/libcrypto/bio/bio.h                    |   70 +-
 src/lib/libcrypto/bio/bio_err.c                |    3 +-
 src/lib/libcrypto/bio/bio_lib.c                |   28 +-
 src/lib/libcrypto/bio/bss_bio.c                |   18 +-
 src/lib/libcrypto/bio/bss_dgram.c              |  996 ++++++++
 src/lib/libcrypto/bn/asm/armv4-gf2m.pl         |  278 +++
 src/lib/libcrypto/bn/asm/armv4-mont.pl         |   23 +-
 src/lib/libcrypto/bn/asm/ia64-mont.pl          |  851 +++++++
 src/lib/libcrypto/bn/asm/mips-mont.pl          |  426 ++++
 src/lib/libcrypto/bn/asm/mips.pl               | 2585 ++++++++++++++++++++
 src/lib/libcrypto/bn/asm/modexp512-x86_64.pl   | 1496 ++++++++++++
 src/lib/libcrypto/bn/asm/parisc-mont.pl        |  993 ++++++++
 src/lib/libcrypto/bn/asm/ppc-mont.pl           |  107 +-
 src/lib/libcrypto/bn/asm/ppc.pl                |   43 +-
 src/lib/libcrypto/bn/asm/ppc64-mont.pl         |  338 ++-
 src/lib/libcrypto/bn/asm/s390x-gf2m.pl         |  221 ++
 src/lib/libcrypto/bn/asm/s390x-mont.pl         |  102 +-
 src/lib/libcrypto/bn/asm/x86-gf2m.pl           |  313 +++
 src/lib/libcrypto/bn/asm/x86_64-gf2m.pl        |  389 +++
 src/lib/libcrypto/bn/asm/x86_64-mont.pl        | 1486 +++++++++++-
 src/lib/libcrypto/bn/asm/x86_64-mont5.pl       | 1070 +++++++++
 src/lib/libcrypto/bn/bn.h                      |   15 +
 src/lib/libcrypto/bn/bn_div.c                  |  272 +--
 src/lib/libcrypto/bn/bn_exp.c                  |  240 +-
 src/lib/libcrypto/bn/bn_gf2m.c                 |  114 +-
 src/lib/libcrypto/bn/bn_lcl.h                  |   23 +-
 src/lib/libcrypto/bn/bn_lib.c                  |   19 -
 src/lib/libcrypto/bn/bn_mont.c                 |  116 +-
 src/lib/libcrypto/bn/bn_nist.c                 |  338 ++-
 src/lib/libcrypto/bn/bn_print.c                |   19 +
 src/lib/libcrypto/bn/bn_shift.c                |   27 +-
 src/lib/libcrypto/buffer/buf_str.c             |   99 +-
 src/lib/libcrypto/buffer/buffer.c              |   75 +-
 src/lib/libcrypto/camellia/asm/cmll-x86.pl     |    6 +-
 src/lib/libcrypto/camellia/camellia.h          |    4 +
 src/lib/libcrypto/camellia/cmll_locl.h         |    5 +-
 src/lib/libcrypto/camellia/cmll_misc.c         |    3 +-
 src/lib/libcrypto/cast/c_skey.c                |    9 +-
 src/lib/libcrypto/cast/cast.h                  |    4 +-
 src/lib/libcrypto/cmac/cm_ameth.c              |   97 +
 src/lib/libcrypto/cmac/cm_pmeth.c              |  224 ++
 src/lib/libcrypto/cmac/cmac.c                  |  308 +++
 src/lib/libcrypto/cmac/cmac.h                  |   82 +
 src/lib/libcrypto/cms/cms.h                    |   22 +
 src/lib/libcrypto/cms/cms_asn1.c               |    9 +
 src/lib/libcrypto/cms/cms_enc.c                |   60 +-
 src/lib/libcrypto/cms/cms_env.c                |   22 +-
 src/lib/libcrypto/cms/cms_err.c                |   13 +-
 src/lib/libcrypto/cms/cms_lcl.h                |   12 +
 src/lib/libcrypto/cms/cms_lib.c                |    3 +-
 src/lib/libcrypto/cms/cms_pwri.c               |  454 ++++
 src/lib/libcrypto/cms/cms_sd.c                 |    3 +-
 src/lib/libcrypto/cms/cms_smime.c              |   61 +-
 src/lib/libcrypto/comp/c_rle.c                 |    4 +-
 src/lib/libcrypto/cpt_err.c                    |    4 +-
 src/lib/libcrypto/cryptlib.c                   |   40 +-
 src/lib/libcrypto/cryptlib.h                   |    2 +-
 src/lib/libcrypto/crypto.h                     |   29 +
 src/lib/libcrypto/des/des.h                    |    3 +
 src/lib/libcrypto/des/set_key.c                |    9 +
 src/lib/libcrypto/dh/dh.h                      |   20 +
 src/lib/libcrypto/dh/dh_ameth.c                |    1 +
 src/lib/libcrypto/dh/dh_err.c                  |    7 +-
 src/lib/libcrypto/dh/dh_gen.c                  |   17 +
 src/lib/libcrypto/dh/dh_key.c                  |   33 +-
 src/lib/libcrypto/dh/dh_lib.c                  |   15 +-
 src/lib/libcrypto/doc/EVP_DigestInit.pod       |   66 +-
 src/lib/libcrypto/dsa/dsa.h                    |   20 +
 src/lib/libcrypto/dsa/dsa_ameth.c              |   47 +
 src/lib/libcrypto/dsa/dsa_asn1.c               |   40 +-
 src/lib/libcrypto/dsa/dsa_err.c                |    7 +-
 src/lib/libcrypto/dsa/dsa_gen.c                |   35 +-
 src/lib/libcrypto/dsa/dsa_key.c                |   16 +
 src/lib/libcrypto/dsa/dsa_lib.c                |   22 +-
 src/lib/libcrypto/dsa/dsa_locl.h               |    1 +
 src/lib/libcrypto/dsa/dsa_ossl.c               |   16 +-
 src/lib/libcrypto/dsa/dsa_pmeth.c              |    6 +-
 src/lib/libcrypto/dsa/dsa_sign.c               |   50 +-
 src/lib/libcrypto/dsa/dsa_vrf.c                |   29 +-
 src/lib/libcrypto/dso/dso_dlfcn.c              |    3 +-
 src/lib/libcrypto/ec/ec.h                      |   69 +-
 src/lib/libcrypto/ec/ec2_mult.c                |    4 +
 src/lib/libcrypto/ec/ec2_oct.c                 |  407 ++++
 src/lib/libcrypto/ec/ec2_smpl.c                |  351 +--
 src/lib/libcrypto/ec/ec_ameth.c                |    1 +
 src/lib/libcrypto/ec/ec_asn1.c                 |   24 +-
 src/lib/libcrypto/ec/ec_curve.c                |  197 +-
 src/lib/libcrypto/ec/ec_cvt.c                  |   28 +-
 src/lib/libcrypto/ec/ec_err.c                  |   20 +-
 src/lib/libcrypto/ec/ec_key.c                  |  102 +-
 src/lib/libcrypto/ec/ec_lcl.h                  |   55 +-
 src/lib/libcrypto/ec/ec_lib.c                  |   80 +-
 src/lib/libcrypto/ec/ec_oct.c                  |  199 ++
 src/lib/libcrypto/ec/ec_pmeth.c                |    1 +
 src/lib/libcrypto/ec/eck_prn.c                 |    3 +-
 src/lib/libcrypto/ec/ecp_mont.c                |   14 +-
 src/lib/libcrypto/ec/ecp_nist.c                |   13 +-
 src/lib/libcrypto/ec/ecp_nistp224.c            | 1658 +++++++++++++
 src/lib/libcrypto/ec/ecp_nistp256.c            | 2171 +++++++++++++++++
 src/lib/libcrypto/ec/ecp_nistp521.c            | 2025 ++++++++++++++++
 src/lib/libcrypto/ec/ecp_nistputil.c           |  197 ++
 src/lib/libcrypto/ec/ecp_oct.c                 |  433 ++++
 src/lib/libcrypto/ec/ecp_smpl.c                |  379 +--
 src/lib/libcrypto/ecdh/ecdh.h                  |    2 +
 src/lib/libcrypto/ecdh/ech_err.c               |    4 +-
 src/lib/libcrypto/ecdh/ech_lib.c               |   20 +
 src/lib/libcrypto/ecdh/ech_locl.h              |    8 +
 src/lib/libcrypto/ecdsa/ecdsa.h                |    2 +
 src/lib/libcrypto/ecdsa/ecs_err.c              |    4 +-
 src/lib/libcrypto/ecdsa/ecs_lib.c              |   21 +-
 src/lib/libcrypto/ecdsa/ecs_locl.h             |    8 +
 src/lib/libcrypto/ecdsa/ecs_ossl.c             |    5 +-
 src/lib/libcrypto/engine/eng_all.c             |    9 +
 src/lib/libcrypto/engine/eng_fat.c             |    3 +-
 src/lib/libcrypto/engine/engine.h              |    9 +
 src/lib/libcrypto/err/err.c                    |   13 +-
 src/lib/libcrypto/err/err.h                    |    3 +-
 src/lib/libcrypto/err/err_all.c                |    7 +
 src/lib/libcrypto/evp/bio_md.c                 |   11 +-
 src/lib/libcrypto/evp/digest.c                 |   28 +-
 src/lib/libcrypto/evp/e_aes.c                  | 1273 +++++++++-
 src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c    |  406 ++++
 src/lib/libcrypto/evp/e_des3.c                 |    3 +
 src/lib/libcrypto/evp/e_null.c                 |    4 +-
 src/lib/libcrypto/evp/e_rc2.c                  |    3 +-
 src/lib/libcrypto/evp/e_rc4.c                  |    1 +
 src/lib/libcrypto/evp/e_rc4_hmac_md5.c         |  298 +++
 src/lib/libcrypto/evp/evp.h                    |   94 +-
 src/lib/libcrypto/evp/evp_enc.c                |   95 +-
 src/lib/libcrypto/evp/evp_err.c                |   20 +-
 src/lib/libcrypto/evp/evp_key.c                |   27 +-
 src/lib/libcrypto/evp/evp_lib.c                |    4 +
 src/lib/libcrypto/evp/evp_locl.h               |   40 +
 src/lib/libcrypto/evp/evp_pbe.c                |    5 +
 src/lib/libcrypto/evp/m_dss.c                  |    2 +
 src/lib/libcrypto/evp/m_dss1.c                 |    3 +
 src/lib/libcrypto/evp/m_ecdsa.c                |    3 +
 src/lib/libcrypto/evp/m_md4.c                  |    2 +
 src/lib/libcrypto/evp/m_md5.c                  |    1 +
 src/lib/libcrypto/evp/m_ripemd.c               |    1 +
 src/lib/libcrypto/evp/m_sha1.c                 |    5 +
 src/lib/libcrypto/evp/m_wp.c                   |    1 +
 src/lib/libcrypto/evp/names.c                  |    5 +
 src/lib/libcrypto/evp/p5_crpt.c                |   33 +-
 src/lib/libcrypto/evp/p5_crpt2.c               |   89 +-
 src/lib/libcrypto/evp/p_open.c                 |    3 +-
 src/lib/libcrypto/evp/p_seal.c                 |    3 +-
 src/lib/libcrypto/evp/p_sign.c                 |   10 +-
 src/lib/libcrypto/evp/p_verify.c               |   10 +-
 src/lib/libcrypto/evp/pmeth_gn.c               |    5 +-
 src/lib/libcrypto/evp/pmeth_lib.c              |   55 +-
 src/lib/libcrypto/hmac/hm_ameth.c              |    2 +-
 src/lib/libcrypto/hmac/hm_pmeth.c              |   14 +-
 src/lib/libcrypto/hmac/hmac.c                  |   37 +
 src/lib/libcrypto/ia64cpuid.S                  |    2 +-
 src/lib/libcrypto/idea/i_cbc.c                 |  168 ++
 src/lib/libcrypto/idea/i_cfb64.c               |  122 +
 src/lib/libcrypto/idea/i_ecb.c                 |   85 +
 src/lib/libcrypto/idea/i_ofb64.c               |  111 +
 src/lib/libcrypto/idea/i_skey.c                |  164 ++
 src/lib/libcrypto/idea/idea.h                  |    3 +
 src/lib/libcrypto/idea/idea_lcl.h              |  215 ++
 src/lib/libcrypto/md4/md4.h                    |    3 +
 src/lib/libcrypto/md4/md4_dgst.c               |    5 +-
 src/lib/libcrypto/md5/md5.h                    |    3 +
 src/lib/libcrypto/md5/md5_dgst.c               |    3 +-
 src/lib/libcrypto/modes/asm/ghash-alpha.pl     |  451 ++++
 src/lib/libcrypto/modes/asm/ghash-armv4.pl     |  429 ++++
 src/lib/libcrypto/modes/asm/ghash-ia64.pl      |  463 ++++
 src/lib/libcrypto/modes/asm/ghash-parisc.pl    |  730 ++++++
 src/lib/libcrypto/modes/asm/ghash-s390x.pl     |  262 ++
 src/lib/libcrypto/modes/asm/ghash-sparcv9.pl   |  330 +++
 src/lib/libcrypto/modes/asm/ghash-x86.pl       | 1342 +++++++++++
 src/lib/libcrypto/modes/asm/ghash-x86_64.pl    |  805 +++++++
 src/lib/libcrypto/modes/cbc128.c               |   10 +-
 src/lib/libcrypto/modes/ccm128.c               |  441 ++++
 src/lib/libcrypto/modes/cfb128.c               |   11 +-
 src/lib/libcrypto/modes/ctr128.c               |   92 +-
 src/lib/libcrypto/modes/cts128.c               |  226 +-
 src/lib/libcrypto/modes/gcm128.c               | 1757 ++++++++++++++
 src/lib/libcrypto/modes/modes.h                |   76 +
 src/lib/libcrypto/modes/modes_lcl.h            |  131 +
 src/lib/libcrypto/modes/ofb128.c               |   11 +-
 src/lib/libcrypto/modes/xts128.c               |  187 ++
 src/lib/libcrypto/o_init.c                     |   34 +-
 src/lib/libcrypto/objects/obj_mac.num          |   27 +
 src/lib/libcrypto/objects/obj_xref.c           |    9 +-
 src/lib/libcrypto/objects/obj_xref.h           |    2 +
 src/lib/libcrypto/objects/obj_xref.txt         |    4 +
 src/lib/libcrypto/objects/objects.txt          |   41 +-
 src/lib/libcrypto/ocsp/ocsp_lib.c              |    3 +-
 src/lib/libcrypto/opensslv.h                   |    6 +-
 src/lib/libcrypto/ossl_typ.h                   |    2 +
 src/lib/libcrypto/pariscid.pl                  |  224 ++
 src/lib/libcrypto/pem/pvkfmt.c                 |   58 +-
 src/lib/libcrypto/perlasm/ppc-xlate.pl         |   13 +-
 src/lib/libcrypto/perlasm/x86_64-xlate.pl      |  221 +-
 src/lib/libcrypto/perlasm/x86asm.pl            |   55 +-
 src/lib/libcrypto/perlasm/x86gas.pl            |   34 +-
 src/lib/libcrypto/pkcs12/p12_decr.c            |    9 +-
 src/lib/libcrypto/pkcs12/p12_key.c             |   16 +-
 src/lib/libcrypto/pkcs12/p12_kiss.c            |    2 +-
 src/lib/libcrypto/pkcs12/p12_mutl.c            |   12 +-
 src/lib/libcrypto/pkcs7/pk7_doit.c             |  101 +-
 src/lib/libcrypto/pkcs7/pk7_smime.c            |   25 +-
 src/lib/libcrypto/ppccap.c                     |  115 +
 src/lib/libcrypto/ppccpuid.pl                  |   48 +-
 src/lib/libcrypto/rand/rand.h                  |    9 +
 src/lib/libcrypto/rand/rand_err.c              |    6 +-
 src/lib/libcrypto/rand/rand_lib.c              |  119 +
 src/lib/libcrypto/rand/randfile.c              |    2 +-
 src/lib/libcrypto/rc2/rc2.h                    |    4 +-
 src/lib/libcrypto/rc2/rc2_skey.c               |    8 +
 src/lib/libcrypto/rc4/asm/rc4-586.pl           |  162 +-
 src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl    |  631 +++++
 src/lib/libcrypto/rc4/asm/rc4-parisc.pl        |  313 +++
 src/lib/libcrypto/rc4/asm/rc4-s390x.pl         |   47 +-
 src/lib/libcrypto/rc4/asm/rc4-x86_64.pl        |  290 ++-
 src/lib/libcrypto/rc4/rc4.h                    |    1 +
 src/lib/libcrypto/rc4/rc4_skey.c               |   36 +-
 src/lib/libcrypto/ripemd/ripemd.h              |    3 +
 src/lib/libcrypto/ripemd/rmd_dgst.c            |    3 +-
 src/lib/libcrypto/rsa/rsa.h                    |   79 +
 src/lib/libcrypto/rsa/rsa_ameth.c              |  351 ++-
 src/lib/libcrypto/rsa/rsa_asn1.c               |   10 +
 src/lib/libcrypto/rsa/rsa_crpt.c               |  257 ++
 src/lib/libcrypto/rsa/rsa_err.c                |   21 +-
 src/lib/libcrypto/rsa/rsa_gen.c                |   15 +
 src/lib/libcrypto/rsa/rsa_lib.c                |  172 +-
 src/lib/libcrypto/rsa/rsa_oaep.c               |    6 +-
 src/lib/libcrypto/rsa/rsa_pmeth.c              |  154 +-
 src/lib/libcrypto/rsa/rsa_pss.c                |   81 +-
 src/lib/libcrypto/rsa/rsa_sign.c               |   33 +
 src/lib/libcrypto/s390xcap.c                   |   12 +-
 src/lib/libcrypto/s390xcpuid.S                 |   17 +-
 src/lib/libcrypto/sha/asm/sha1-586.pl          | 1107 ++++++++-
 src/lib/libcrypto/sha/asm/sha1-alpha.pl        |  322 +++
 src/lib/libcrypto/sha/asm/sha1-armv4-large.pl  |   38 +-
 src/lib/libcrypto/sha/asm/sha1-ia64.pl         |  192 +-
 src/lib/libcrypto/sha/asm/sha1-mips.pl         |  354 +++
 src/lib/libcrypto/sha/asm/sha1-parisc.pl       |  259 ++
 src/lib/libcrypto/sha/asm/sha1-ppc.pl          |   83 +-
 src/lib/libcrypto/sha/asm/sha1-s390x.pl        |   50 +-
 src/lib/libcrypto/sha/asm/sha1-x86_64.pl       | 1185 +++++++--
 src/lib/libcrypto/sha/asm/sha256-586.pl        |   52 +-
 src/lib/libcrypto/sha/asm/sha256-armv4.pl      |   55 +-
 src/lib/libcrypto/sha/asm/sha512-armv4.pl      |  357 ++-
 src/lib/libcrypto/sha/asm/sha512-mips.pl       |  455 ++++
 src/lib/libcrypto/sha/asm/sha512-parisc.pl     |  791 ++++++
 src/lib/libcrypto/sha/asm/sha512-ppc.pl        |  114 +-
 src/lib/libcrypto/sha/asm/sha512-s390x.pl      |   63 +-
 src/lib/libcrypto/sha/asm/sha512-sparcv9.pl    |    6 +-
 src/lib/libcrypto/sha/asm/sha512-x86_64.pl     |   86 +-
 src/lib/libcrypto/sha/sha.h                    |   14 +
 src/lib/libcrypto/sha/sha1dgst.c               |    1 +
 src/lib/libcrypto/sha/sha256.c                 |    4 +-
 src/lib/libcrypto/sha/sha512.c                 |   54 +-
 src/lib/libcrypto/sha/sha_locl.h               |    6 +-
 src/lib/libcrypto/sparcv9cap.c                 |    4 +-
 src/lib/libcrypto/stack/safestack.h            |  138 +-
 src/lib/libcrypto/ts/ts.h                      |    3 -
 src/lib/libcrypto/ts/ts_rsp_verify.c           |    9 +-
 src/lib/libcrypto/ui/ui.h                      |    2 +-
 src/lib/libcrypto/ui/ui_openssl.c              |    2 +-
 src/lib/libcrypto/whrlpool/whrlpool.h          |    3 +
 src/lib/libcrypto/whrlpool/wp_block.c          |    4 +-
 src/lib/libcrypto/whrlpool/wp_dgst.c           |    3 +-
 src/lib/libcrypto/x509/x509.h                  |   11 +
 src/lib/libcrypto/x509/x509_cmp.c              |   27 +-
 src/lib/libcrypto/x509/x509_lu.c               |    2 +-
 src/lib/libcrypto/x509/x509_vfy.c              |    5 -
 src/lib/libcrypto/x509/x509type.c              |   32 +-
 src/lib/libcrypto/x509/x_all.c                 |   19 +
 src/lib/libcrypto/x509v3/v3_skey.c             |    3 +-
 src/lib/libcrypto/x86_64cpuid.pl               |   87 +-
 src/lib/libcrypto/x86cpuid.pl                  |   78 +-
 src/lib/libssl/bio_ssl.c                       |    2 +
 src/lib/libssl/d1_both.c                       |  178 +-
 src/lib/libssl/d1_clnt.c                       |  194 +-
 src/lib/libssl/d1_enc.c                        |    2 +-
 src/lib/libssl/d1_lib.c                        |   54 +-
 src/lib/libssl/d1_pkt.c                        |  167 +-
 src/lib/libssl/d1_srtp.c                       |  493 ++++
 src/lib/libssl/d1_srvr.c                       |  186 +-
 src/lib/libssl/dtls1.h                         |   18 +-
 src/lib/libssl/s23_clnt.c                      |  111 +-
 src/lib/libssl/s23_srvr.c                      |   52 +-
 src/lib/libssl/s3_both.c                       |   36 +-
 src/lib/libssl/s3_clnt.c                       |  397 ++-
 src/lib/libssl/s3_lib.c                        | 1222 ++++++++--
 src/lib/libssl/s3_pkt.c                        |   77 +-
 src/lib/libssl/s3_srvr.c                       |  546 ++++-
 src/lib/libssl/srtp.h                          |  145 ++
 src/lib/libssl/ssl.h                           |  313 ++-
 src/lib/libssl/ssl2.h                          |    4 +
 src/lib/libssl/ssl3.h                          |   32 +-
 src/lib/libssl/ssl_algs.c                      |    9 +
 src/lib/libssl/ssl_asn1.c                      |   50 +
 src/lib/libssl/ssl_cert.c                      |   21 +-
 src/lib/libssl/ssl_ciph.c                      |  133 +-
 src/lib/libssl/ssl_err.c                       |   36 +
 src/lib/libssl/ssl_lib.c                       |  247 +-
 src/lib/libssl/ssl_locl.h                      |   75 +-
 src/lib/libssl/ssl_sess.c                      |  160 +-
 src/lib/libssl/ssl_txt.c                       |    8 +
 src/lib/libssl/t1_clnt.c                       |   21 +-
 src/lib/libssl/t1_enc.c                        |  309 ++-
 src/lib/libssl/t1_lib.c                        |  941 +++++++-
 src/lib/libssl/t1_meth.c                       |   22 +-
 src/lib/libssl/t1_srvr.c                       |   21 +-
 src/lib/libssl/test/CAss.cnf                   |    2 +-
 src/lib/libssl/test/P1ss.cnf                   |    2 +-
 src/lib/libssl/test/P2ss.cnf                   |    2 +-
 src/lib/libssl/test/Uss.cnf                    |    4 +-
 src/lib/libssl/test/pkits-test.pl              |    9 +
 src/lib/libssl/test/test.cnf                   |    2 +-
 src/lib/libssl/test/testssl                    |   10 +
 src/lib/libssl/tls1.h                          |  209 +-
 359 files changed, 63248 insertions(+), 4583 deletions(-)
 create mode 100644 src/lib/libcrypto/aes/asm/aes-mips.pl
 create mode 100644 src/lib/libcrypto/aes/asm/aes-parisc.pl
 create mode 100644 src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl
 create mode 100644 src/lib/libcrypto/aes/asm/aesni-x86.pl
 create mode 100644 src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
 create mode 100644 src/lib/libcrypto/aes/asm/vpaes-x86.pl
 create mode 100644 src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
 create mode 100644 src/lib/libcrypto/arm_arch.h
 create mode 100644 src/lib/libcrypto/armcap.c
 create mode 100644 src/lib/libcrypto/armv4cpuid.S
 create mode 100644 src/lib/libcrypto/bn/asm/armv4-gf2m.pl
 create mode 100644 src/lib/libcrypto/bn/asm/ia64-mont.pl
 create mode 100644 src/lib/libcrypto/bn/asm/mips-mont.pl
 create mode 100644 src/lib/libcrypto/bn/asm/mips.pl
 create mode 100644 src/lib/libcrypto/bn/asm/modexp512-x86_64.pl
 create mode 100644 src/lib/libcrypto/bn/asm/parisc-mont.pl
 create mode 100644 src/lib/libcrypto/bn/asm/s390x-gf2m.pl
 create mode 100644 src/lib/libcrypto/bn/asm/x86-gf2m.pl
 create mode 100644 src/lib/libcrypto/bn/asm/x86_64-gf2m.pl
 create mode 100755 src/lib/libcrypto/bn/asm/x86_64-mont5.pl
 create mode 100644 src/lib/libcrypto/cmac/cm_ameth.c
 create mode 100644 src/lib/libcrypto/cmac/cm_pmeth.c
 create mode 100644 src/lib/libcrypto/cmac/cmac.c
 create mode 100644 src/lib/libcrypto/cmac/cmac.h
 create mode 100644 src/lib/libcrypto/cms/cms_pwri.c
 create mode 100644 src/lib/libcrypto/ec/ec2_oct.c
 create mode 100644 src/lib/libcrypto/ec/ec_oct.c
 create mode 100644 src/lib/libcrypto/ec/ecp_nistp224.c
 create mode 100644 src/lib/libcrypto/ec/ecp_nistp256.c
 create mode 100644 src/lib/libcrypto/ec/ecp_nistp521.c
 create mode 100644 src/lib/libcrypto/ec/ecp_nistputil.c
 create mode 100644 src/lib/libcrypto/ec/ecp_oct.c
 create mode 100644 src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c
 create mode 100644 src/lib/libcrypto/evp/e_rc4_hmac_md5.c
 create mode 100644 src/lib/libcrypto/idea/i_cbc.c
 create mode 100644 src/lib/libcrypto/idea/i_cfb64.c
 create mode 100644 src/lib/libcrypto/idea/i_ecb.c
 create mode 100644 src/lib/libcrypto/idea/i_ofb64.c
 create mode 100644 src/lib/libcrypto/idea/i_skey.c
 create mode 100644 src/lib/libcrypto/idea/idea_lcl.h
 create mode 100644 src/lib/libcrypto/modes/asm/ghash-alpha.pl
 create mode 100644 src/lib/libcrypto/modes/asm/ghash-armv4.pl
 create mode 100755 src/lib/libcrypto/modes/asm/ghash-ia64.pl
 create mode 100644 src/lib/libcrypto/modes/asm/ghash-parisc.pl
 create mode 100644 src/lib/libcrypto/modes/asm/ghash-s390x.pl
 create mode 100644 src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
 create mode 100644 src/lib/libcrypto/modes/asm/ghash-x86.pl
 create mode 100644 src/lib/libcrypto/modes/asm/ghash-x86_64.pl
 create mode 100644 src/lib/libcrypto/modes/ccm128.c
 create mode 100644 src/lib/libcrypto/modes/gcm128.c
 create mode 100644 src/lib/libcrypto/modes/modes_lcl.h
 create mode 100644 src/lib/libcrypto/modes/xts128.c
 create mode 100644 src/lib/libcrypto/pariscid.pl
 create mode 100644 src/lib/libcrypto/ppccap.c
 create mode 100644 src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl
 create mode 100644 src/lib/libcrypto/rc4/asm/rc4-parisc.pl
 create mode 100644 src/lib/libcrypto/rsa/rsa_crpt.c
 create mode 100644 src/lib/libcrypto/sha/asm/sha1-alpha.pl
 create mode 100644 src/lib/libcrypto/sha/asm/sha1-mips.pl
 create mode 100644 src/lib/libcrypto/sha/asm/sha1-parisc.pl
 create mode 100644 src/lib/libcrypto/sha/asm/sha512-mips.pl
 create mode 100755 src/lib/libcrypto/sha/asm/sha512-parisc.pl
 create mode 100644 src/lib/libssl/d1_srtp.c
 create mode 100644 src/lib/libssl/srtp.h

(limited to 'src/lib')

diff --git a/src/lib/libcrypto/aes/aes.h b/src/lib/libcrypto/aes/aes.h
index d2c99730fe..031abf01b5 100644
--- a/src/lib/libcrypto/aes/aes.h
+++ b/src/lib/libcrypto/aes/aes.h
@@ -90,6 +90,11 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 	AES_KEY *key);
 
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+	AES_KEY *key);
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+	AES_KEY *key);
+
 void AES_encrypt(const unsigned char *in, unsigned char *out,
 	const AES_KEY *key);
 void AES_decrypt(const unsigned char *in, unsigned char *out,
diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c
index a7ec54f4da..8f5210ac70 100644
--- a/src/lib/libcrypto/aes/aes_core.c
+++ b/src/lib/libcrypto/aes/aes_core.c
@@ -625,7 +625,7 @@ static const u32 rcon[] = {
 /**
  * Expand the cipher key into the encryption key schedule.
  */
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 			AES_KEY *key) {
 
 	u32 *rk;
@@ -726,7 +726,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 /**
  * Expand the cipher key into the decryption key schedule.
  */
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 			 AES_KEY *key) {
 
         u32 *rk;
@@ -734,7 +734,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 	u32 temp;
 
 	/* first, start with an encryption schedule */
-	status = AES_set_encrypt_key(userKey, bits, key);
+	status = private_AES_set_encrypt_key(userKey, bits, key);
 	if (status < 0)
 		return status;
 
@@ -1201,7 +1201,7 @@ static const u32 rcon[] = {
 /**
  * Expand the cipher key into the encryption key schedule.
  */
-int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 			AES_KEY *key) {
 	u32 *rk;
    	int i = 0;
@@ -1301,7 +1301,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 /**
  * Expand the cipher key into the decryption key schedule.
  */
-int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 			 AES_KEY *key) {
 
         u32 *rk;
@@ -1309,7 +1309,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 	u32 temp;
 
 	/* first, start with an encryption schedule */
-	status = AES_set_encrypt_key(userKey, bits, key);
+	status = private_AES_set_encrypt_key(userKey, bits, key);
 	if (status < 0)
 		return status;
 
diff --git a/src/lib/libcrypto/aes/aes_misc.c b/src/lib/libcrypto/aes/aes_misc.c
index 4fead1b4c7..f083488ecb 100644
--- a/src/lib/libcrypto/aes/aes_misc.c
+++ b/src/lib/libcrypto/aes/aes_misc.c
@@ -50,6 +50,7 @@
  */
 
 #include <openssl/opensslv.h>
+#include <openssl/crypto.h>
 #include <openssl/aes.h>
 #include "aes_locl.h"
 
@@ -62,3 +63,23 @@ const char *AES_options(void) {
         return "aes(partial)";
 #endif
 }
+
+/* FIPS wrapper functions to block low level AES calls in FIPS mode */
+
+int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+			AES_KEY *key)
+	{
+#ifdef OPENSSL_FIPS
+	fips_cipher_abort(AES);
+#endif
+	return private_AES_set_encrypt_key(userKey, bits, key);
+	}
+
+int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+			AES_KEY *key)
+	{
+#ifdef OPENSSL_FIPS
+	fips_cipher_abort(AES);
+#endif
+	return private_AES_set_decrypt_key(userKey, bits, key);
+	}
diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl
index aab40e6f1c..687ed811be 100644
--- a/src/lib/libcrypto/aes/asm/aes-586.pl
+++ b/src/lib/libcrypto/aes/asm/aes-586.pl
@@ -39,7 +39,7 @@
 # but exhibits up to 10% improvement on other cores.
 #
 # Second version is "monolithic" replacement for aes_core.c, which in
-# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
+# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key.
 # This made it possible to implement little-endian variant of the
 # algorithm without modifying the base C code. Motivating factor for
 # the undertaken effort was that it appeared that in tight IA-32
@@ -2854,12 +2854,12 @@ sub enckey()
     &set_label("exit");
 &function_end("_x86_AES_set_encrypt_key");
 
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
-&function_begin_B("AES_set_encrypt_key");
+&function_begin_B("private_AES_set_encrypt_key");
 	&call	("_x86_AES_set_encrypt_key");
 	&ret	();
-&function_end_B("AES_set_encrypt_key");
+&function_end_B("private_AES_set_encrypt_key");
 
 sub deckey()
 { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2916,9 +2916,9 @@ sub deckey()
 	&mov	(&DWP(4*$i,$key),$tp1);
 }
 
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
-&function_begin_B("AES_set_decrypt_key");
+&function_begin_B("private_AES_set_decrypt_key");
 	&call	("_x86_AES_set_encrypt_key");
 	&cmp	("eax",0);
 	&je	(&label("proceed"));
@@ -2974,7 +2974,7 @@ sub deckey()
 	&jb	(&label("permute"));
 
 	&xor	("eax","eax");			# return success
-&function_end("AES_set_decrypt_key");
+&function_end("private_AES_set_decrypt_key");
 &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
 
 &asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl
index c51ee1fbf6..86b86c4a0f 100644
--- a/src/lib/libcrypto/aes/asm/aes-armv4.pl
+++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl
@@ -27,6 +27,11 @@
 # Rescheduling for dual-issue pipeline resulted in 12% improvement on
 # Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
 
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~21.5 cycles per byte.
+
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
@@ -46,6 +51,7 @@ $key="r11";
 $rounds="r12";
 
 $code=<<___;
+#include "arm_arch.h"
 .text
 .code	32
 
@@ -166,7 +172,7 @@ AES_encrypt:
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
 	sub	$tbl,r3,#AES_encrypt-AES_Te	@ Te
-
+#if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
 	ldrb	$t2,[$rounds,#1]
@@ -195,10 +201,33 @@ AES_encrypt:
 	orr	$s3,$s3,$t1,lsl#8
 	orr	$s3,$s3,$t2,lsl#16
 	orr	$s3,$s3,$t3,lsl#24
-
+#else
+	ldr	$s0,[$rounds,#0]
+	ldr	$s1,[$rounds,#4]
+	ldr	$s2,[$rounds,#8]
+	ldr	$s3,[$rounds,#12]
+#ifdef __ARMEL__
+	rev	$s0,$s0
+	rev	$s1,$s1
+	rev	$s2,$s2
+	rev	$s3,$s3
+#endif
+#endif
 	bl	_armv4_AES_encrypt
 
 	ldr	$rounds,[sp],#4		@ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+	rev	$s0,$s0
+	rev	$s1,$s1
+	rev	$s2,$s2
+	rev	$s3,$s3
+#endif
+	str	$s0,[$rounds,#0]
+	str	$s1,[$rounds,#4]
+	str	$s2,[$rounds,#8]
+	str	$s3,[$rounds,#12]
+#else
 	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
 	mov	$t2,$s0,lsr#16		@ manner...
 	mov	$t3,$s0,lsr#8
@@ -227,11 +256,15 @@ AES_encrypt:
 	strb	$t2,[$rounds,#13]
 	strb	$t3,[$rounds,#14]
 	strb	$s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia   sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size	AES_encrypt,.-AES_encrypt
 
 .type   _armv4_AES_encrypt,%function
@@ -271,11 +304,11 @@ _armv4_AES_encrypt:
 	and	$i2,lr,$s2,lsr#16	@ i1
 	eor	$t3,$t3,$i3,ror#8
 	and	$i3,lr,$s2
-	eor	$s1,$s1,$t1,ror#24
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te2[s2>>8]
+	eor	$s1,$s1,$t1,ror#24
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
 	mov	$s2,$s2,lsr#24
 
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te1[s2>>16]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te3[s2>>0]
 	eor	$s0,$s0,$i1,ror#16
 	ldr	$s2,[$tbl,$s2,lsl#2]	@ Te0[s2>>24]
@@ -284,16 +317,16 @@ _armv4_AES_encrypt:
 	and	$i2,lr,$s3,lsr#8	@ i1
 	eor	$t3,$t3,$i3,ror#16
 	and	$i3,lr,$s3,lsr#16	@ i2
-	eor	$s2,$s2,$t2,ror#16
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Te3[s3>>0]
+	eor	$s2,$s2,$t2,ror#16
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
 	mov	$s3,$s3,lsr#24
 
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Te2[s3>>8]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Te1[s3>>16]
 	eor	$s0,$s0,$i1,ror#24
-	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
-	eor	$s1,$s1,$i2,ror#16
 	ldr	$i1,[$key],#16
+	eor	$s1,$s1,$i2,ror#16
+	ldr	$s3,[$tbl,$s3,lsl#2]	@ Te0[s3>>24]
 	eor	$s2,$s2,$i3,ror#8
 	ldr	$t1,[$key,#-12]
 	eor	$s3,$s3,$t3,ror#8
@@ -333,11 +366,11 @@ _armv4_AES_encrypt:
 	and	$i2,lr,$s2,lsr#16	@ i1
 	eor	$t3,$i3,$t3,lsl#8
 	and	$i3,lr,$s2
-	eor	$s1,$t1,$s1,lsl#24
 	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s2>>8]
+	eor	$s1,$t1,$s1,lsl#24
+	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
 	mov	$s2,$s2,lsr#24
 
-	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s2>>16]
 	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s2>>0]
 	eor	$s0,$i1,$s0,lsl#8
 	ldrb	$s2,[$tbl,$s2,lsl#2]	@ Te4[s2>>24]
@@ -346,15 +379,15 @@ _armv4_AES_encrypt:
 	and	$i2,lr,$s3,lsr#8	@ i1
 	eor	$t3,$i3,$t3,lsl#8
 	and	$i3,lr,$s3,lsr#16	@ i2
-	eor	$s2,$t2,$s2,lsl#24
 	ldrb	$i1,[$tbl,$i1,lsl#2]	@ Te4[s3>>0]
+	eor	$s2,$t2,$s2,lsl#24
+	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
 	mov	$s3,$s3,lsr#24
 
-	ldrb	$i2,[$tbl,$i2,lsl#2]	@ Te4[s3>>8]
 	ldrb	$i3,[$tbl,$i3,lsl#2]	@ Te4[s3>>16]
 	eor	$s0,$i1,$s0,lsl#8
-	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
 	ldr	$i1,[$key,#0]
+	ldrb	$s3,[$tbl,$s3,lsl#2]	@ Te4[s3>>24]
 	eor	$s1,$s1,$i2,lsl#8
 	ldr	$t1,[$key,#4]
 	eor	$s2,$s2,$i3,lsl#16
@@ -371,10 +404,11 @@ _armv4_AES_encrypt:
 	ldr	pc,[sp],#4		@ pop and return
 .size	_armv4_AES_encrypt,.-_armv4_AES_encrypt
 
-.global AES_set_encrypt_key
-.type   AES_set_encrypt_key,%function
+.global private_AES_set_encrypt_key
+.type   private_AES_set_encrypt_key,%function
 .align	5
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_armv4_AES_set_encrypt_key:
 	sub	r3,pc,#8		@ AES_set_encrypt_key
 	teq	r0,#0
 	moveq	r0,#-1
@@ -392,12 +426,13 @@ AES_set_encrypt_key:
 	bne	.Labrt
 
 .Lok:	stmdb   sp!,{r4-r12,lr}
-	sub	$tbl,r3,#AES_set_encrypt_key-AES_Te-1024	@ Te4
+	sub	$tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024	@ Te4
 
 	mov	$rounds,r0		@ inp
 	mov	lr,r1			@ bits
 	mov	$key,r2			@ key
 
+#if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
 	ldrb	$t2,[$rounds,#1]
@@ -430,6 +465,22 @@ AES_set_encrypt_key:
 	orr	$s3,$s3,$t3,lsl#24
 	str	$s2,[$key,#-8]
 	str	$s3,[$key,#-4]
+#else
+	ldr	$s0,[$rounds,#0]
+	ldr	$s1,[$rounds,#4]
+	ldr	$s2,[$rounds,#8]
+	ldr	$s3,[$rounds,#12]
+#ifdef __ARMEL__
+	rev	$s0,$s0
+	rev	$s1,$s1
+	rev	$s2,$s2
+	rev	$s3,$s3
+#endif
+	str	$s0,[$key],#16
+	str	$s1,[$key,#-12]
+	str	$s2,[$key,#-8]
+	str	$s3,[$key,#-4]
+#endif
 
 	teq	lr,#128
 	bne	.Lnot128
@@ -466,6 +517,7 @@ AES_set_encrypt_key:
 	b	.Ldone
 
 .Lnot128:
+#if __ARM_ARCH__<7
 	ldrb	$i2,[$rounds,#19]
 	ldrb	$t1,[$rounds,#18]
 	ldrb	$t2,[$rounds,#17]
@@ -482,6 +534,16 @@ AES_set_encrypt_key:
 	str	$i2,[$key],#8
 	orr	$i3,$i3,$t3,lsl#24
 	str	$i3,[$key,#-4]
+#else
+	ldr	$i2,[$rounds,#16]
+	ldr	$i3,[$rounds,#20]
+#ifdef __ARMEL__
+	rev	$i2,$i2
+	rev	$i3,$i3
+#endif
+	str	$i2,[$key],#8
+	str	$i3,[$key,#-4]
+#endif
 
 	teq	lr,#192
 	bne	.Lnot192
@@ -526,6 +588,7 @@ AES_set_encrypt_key:
 	b	.L192_loop
 
 .Lnot192:
+#if __ARM_ARCH__<7
 	ldrb	$i2,[$rounds,#27]
 	ldrb	$t1,[$rounds,#26]
 	ldrb	$t2,[$rounds,#25]
@@ -542,6 +605,16 @@ AES_set_encrypt_key:
 	str	$i2,[$key],#8
 	orr	$i3,$i3,$t3,lsl#24
 	str	$i3,[$key,#-4]
+#else
+	ldr	$i2,[$rounds,#24]
+	ldr	$i3,[$rounds,#28]
+#ifdef __ARMEL__
+	rev	$i2,$i2
+	rev	$i3,$i3
+#endif
+	str	$i2,[$key],#8
+	str	$i3,[$key,#-4]
+#endif
 
 	mov	$rounds,#14
 	str	$rounds,[$key,#240-32]
@@ -606,14 +679,14 @@ AES_set_encrypt_key:
 .Labrt:	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
-.size	AES_set_encrypt_key,.-AES_set_encrypt_key
+.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
 
-.global AES_set_decrypt_key
-.type   AES_set_decrypt_key,%function
+.global private_AES_set_decrypt_key
+.type   private_AES_set_decrypt_key,%function
 .align	5
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
 	str	lr,[sp,#-4]!            @ push lr
-	bl	AES_set_encrypt_key
+	bl	_armv4_AES_set_encrypt_key
 	teq	r0,#0
 	ldrne	lr,[sp],#4              @ pop lr
 	bne	.Labrt
@@ -692,11 +765,15 @@ $code.=<<___;
 	bne	.Lmix
 
 	mov	r0,#0
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia   sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
-.size	AES_set_decrypt_key,.-AES_set_decrypt_key
+#endif
+.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
 
 .type	AES_Td,%object
 .align	5
@@ -811,7 +888,7 @@ AES_decrypt:
 	mov	$rounds,r0		@ inp
 	mov	$key,r2
 	sub	$tbl,r3,#AES_decrypt-AES_Td		@ Td
-
+#if __ARM_ARCH__<7
 	ldrb	$s0,[$rounds,#3]	@ load input data in endian-neutral
 	ldrb	$t1,[$rounds,#2]	@ manner...
 	ldrb	$t2,[$rounds,#1]
@@ -840,10 +917,33 @@ AES_decrypt:
 	orr	$s3,$s3,$t1,lsl#8
 	orr	$s3,$s3,$t2,lsl#16
 	orr	$s3,$s3,$t3,lsl#24
-
+#else
+	ldr	$s0,[$rounds,#0]
+	ldr	$s1,[$rounds,#4]
+	ldr	$s2,[$rounds,#8]
+	ldr	$s3,[$rounds,#12]
+#ifdef __ARMEL__
+	rev	$s0,$s0
+	rev	$s1,$s1
+	rev	$s2,$s2
+	rev	$s3,$s3
+#endif
+#endif
 	bl	_armv4_AES_decrypt
 
 	ldr	$rounds,[sp],#4		@ pop out
+#if __ARM_ARCH__>=7
+#ifdef __ARMEL__
+	rev	$s0,$s0
+	rev	$s1,$s1
+	rev	$s2,$s2
+	rev	$s3,$s3
+#endif
+	str	$s0,[$rounds,#0]
+	str	$s1,[$rounds,#4]
+	str	$s2,[$rounds,#8]
+	str	$s3,[$rounds,#12]
+#else
 	mov	$t1,$s0,lsr#24		@ write output in endian-neutral
 	mov	$t2,$s0,lsr#16		@ manner...
 	mov	$t3,$s0,lsr#8
@@ -872,11 +972,15 @@ AES_decrypt:
 	strb	$t2,[$rounds,#13]
 	strb	$t3,[$rounds,#14]
 	strb	$s3,[$rounds,#15]
-
+#endif
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia   sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size	AES_decrypt,.-AES_decrypt
 
 .type   _armv4_AES_decrypt,%function
@@ -916,11 +1020,11 @@ _armv4_AES_decrypt:
 	and	$i2,lr,$s2		@ i1
 	eor	$t3,$i3,$t3,ror#8
 	and	$i3,lr,$s2,lsr#16
-	eor	$s1,$s1,$t1,ror#8
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td2[s2>>8]
+	eor	$s1,$s1,$t1,ror#8
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
 	mov	$s2,$s2,lsr#24
 
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td3[s2>>0]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td1[s2>>16]
 	eor	$s0,$s0,$i1,ror#16
 	ldr	$s2,[$tbl,$s2,lsl#2]	@ Td0[s2>>24]
@@ -929,22 +1033,22 @@ _armv4_AES_decrypt:
 	and	$i2,lr,$s3,lsr#8	@ i1
 	eor	$t3,$i3,$t3,ror#8
 	and	$i3,lr,$s3		@ i2
-	eor	$s2,$s2,$t2,ror#8
 	ldr	$i1,[$tbl,$i1,lsl#2]	@ Td1[s3>>16]
+	eor	$s2,$s2,$t2,ror#8
+	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
 	mov	$s3,$s3,lsr#24
 
-	ldr	$i2,[$tbl,$i2,lsl#2]	@ Td2[s3>>8]
 	ldr	$i3,[$tbl,$i3,lsl#2]	@ Td3[s3>>0]
 	eor	$s0,$s0,$i1,ror#8
-	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
+	ldr	$i1,[$key],#16
 	eor	$s1,$s1,$i2,ror#16
+	ldr	$s3,[$tbl,$s3,lsl#2]	@ Td0[s3>>24]
 	eor	$s2,$s2,$i3,ror#24
-	ldr	$i1,[$key],#16
-	eor	$s3,$s3,$t3,ror#8
 
 	ldr	$t1,[$key,#-12]
-	ldr	$t2,[$key,#-8]
 	eor	$s0,$s0,$i1
+	ldr	$t2,[$key,#-8]
+	eor	$s3,$s3,$t3,ror#8
 	ldr	$t3,[$key,#-4]
 	and	$i1,lr,$s0,lsr#16
 	eor	$s1,$s1,$t1
@@ -985,11 +1089,11 @@ _armv4_AES_decrypt:
 	and	$i1,lr,$s2,lsr#8	@ i0
 	eor	$t2,$t2,$i2,lsl#8
 	and	$i2,lr,$s2		@ i1
-	eor	$t3,$t3,$i3,lsl#8
 	ldrb	$i1,[$tbl,$i1]		@ Td4[s2>>8]
+	eor	$t3,$t3,$i3,lsl#8
+	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
 	and	$i3,lr,$s2,lsr#16
 
-	ldrb	$i2,[$tbl,$i2]		@ Td4[s2>>0]
 	ldrb	$s2,[$tbl,$s2,lsr#24]	@ Td4[s2>>24]
 	eor	$s0,$s0,$i1,lsl#8
 	ldrb	$i3,[$tbl,$i3]		@ Td4[s2>>16]
@@ -997,11 +1101,11 @@ _armv4_AES_decrypt:
 	and	$i1,lr,$s3,lsr#16	@ i0
 	eor	$s2,$t2,$s2,lsl#16
 	and	$i2,lr,$s3,lsr#8	@ i1
-	eor	$t3,$t3,$i3,lsl#16
 	ldrb	$i1,[$tbl,$i1]		@ Td4[s3>>16]
+	eor	$t3,$t3,$i3,lsl#16
+	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
 	and	$i3,lr,$s3		@ i2
 
-	ldrb	$i2,[$tbl,$i2]		@ Td4[s3>>8]
 	ldrb	$i3,[$tbl,$i3]		@ Td4[s3>>0]
 	ldrb	$s3,[$tbl,$s3,lsr#24]	@ Td4[s3>>24]
 	eor	$s0,$s0,$i1,lsl#16
diff --git a/src/lib/libcrypto/aes/asm/aes-mips.pl b/src/lib/libcrypto/aes/asm/aes-mips.pl
new file mode 100644
index 0000000000..2ce6deffc8
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-mips.pl
@@ -0,0 +1,1611 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for MIPS
+
+# October 2010
+#
+# Code uses 1K[+256B] S-box and on single-issue core [such as R5000]
+# spends ~68 cycles per byte processed with 128-bit key. This is ~16%
+# faster than gcc-generated code, which is not very impressive. But
+# recall that compressed S-box requires extra processing, namely
+# additional rotations. Rotations are implemented with lwl/lwr pairs,
+# which is normally used for loading unaligned data. Another cool
+# thing about this module is its endian neutrality, which means that
+# it processes data without ever changing byte order...
+
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+#   old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+	$PTR_ADD="dadd";	# incidentally works even on n32
+	$PTR_SUB="dsub";	# incidentally works even on n32
+	$REG_S="sd";
+	$REG_L="ld";
+	$PTR_SLL="dsll";	# incidentally works even on n32
+	$SZREG=8;
+} else {
+	$PTR_ADD="add";
+	$PTR_SUB="sub";
+	$REG_S="sw";
+	$REG_L="lw";
+	$PTR_SLL="sll";
+	$SZREG=4;
+}
+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+
+for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
+open STDOUT,">$output";
+
+if (!defined($big_endian))
+{    $big_endian=(unpack('L',pack('N',1))==1);   }
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+my ($MSB,$LSB)=(0,3);	# automatically converted to little-endian
+
+$code.=<<___;
+.text
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+#if !defined(__vxworks) || defined(__pic__)
+.option	pic2
+#endif
+.set	noat
+___
+
+{{{
+my $FRAMESIZE=16*$SZREG;
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+
+my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
+my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23));
+my ($key0,$cnt)=($gp,$fp);
+
+# instuction ordering is "stolen" from output from MIPSpro assembler
+# invoked with -mips3 -O3 arguments...
+$code.=<<___;
+.align	5
+.ent	_mips_AES_encrypt
+_mips_AES_encrypt:
+	.frame	$sp,0,$ra
+	.set	reorder
+	lw	$t0,0($key)
+	lw	$t1,4($key)
+	lw	$t2,8($key)
+	lw	$t3,12($key)
+	lw	$cnt,240($key)
+	$PTR_ADD $key0,$key,16
+
+	xor	$s0,$t0
+	xor	$s1,$t1
+	xor	$s2,$t2
+	xor	$s3,$t3
+
+	sub	$cnt,1
+	_xtr	$i0,$s1,16-2
+.Loop_enc:
+	_xtr	$i1,$s2,16-2
+	_xtr	$i2,$s3,16-2
+	_xtr	$i3,$s0,16-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lwl	$t0,3($i0)		# Te1[s1>>16]
+	lwl	$t1,3($i1)		# Te1[s2>>16]
+	lwl	$t2,3($i2)		# Te1[s3>>16]
+	lwl	$t3,3($i3)		# Te1[s0>>16]
+	lwr	$t0,2($i0)		# Te1[s1>>16]
+	lwr	$t1,2($i1)		# Te1[s2>>16]
+	lwr	$t2,2($i2)		# Te1[s3>>16]
+	lwr	$t3,2($i3)		# Te1[s0>>16]
+
+	_xtr	$i0,$s2,8-2
+	_xtr	$i1,$s3,8-2
+	_xtr	$i2,$s0,8-2
+	_xtr	$i3,$s1,8-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lwl	$t4,2($i0)		# Te2[s2>>8]
+	lwl	$t5,2($i1)		# Te2[s3>>8]
+	lwl	$t6,2($i2)		# Te2[s0>>8]
+	lwl	$t7,2($i3)		# Te2[s1>>8]
+	lwr	$t4,1($i0)		# Te2[s2>>8]
+	lwr	$t5,1($i1)		# Te2[s3>>8]
+	lwr	$t6,1($i2)		# Te2[s0>>8]
+	lwr	$t7,1($i3)		# Te2[s1>>8]
+
+	_xtr	$i0,$s3,0-2
+	_xtr	$i1,$s0,0-2
+	_xtr	$i2,$s1,0-2
+	_xtr	$i3,$s2,0-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lwl	$t8,1($i0)		# Te3[s3]
+	lwl	$t9,1($i1)		# Te3[s0]
+	lwl	$t10,1($i2)		# Te3[s1]
+	lwl	$t11,1($i3)		# Te3[s2]
+	lwr	$t8,0($i0)		# Te3[s3]
+	lwr	$t9,0($i1)		# Te3[s0]
+	lwr	$t10,0($i2)		# Te3[s1]
+	lwr	$t11,0($i3)		# Te3[s2]
+
+	_xtr	$i0,$s0,24-2
+	_xtr	$i1,$s1,24-2
+	_xtr	$i2,$s2,24-2
+	_xtr	$i3,$s3,24-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	xor	$t0,$t4
+	xor	$t1,$t5
+	xor	$t2,$t6
+	xor	$t3,$t7
+	lw	$t4,0($i0)		# Te0[s0>>24]
+	lw	$t5,0($i1)		# Te0[s1>>24]
+	lw	$t6,0($i2)		# Te0[s2>>24]
+	lw	$t7,0($i3)		# Te0[s3>>24]
+
+	lw	$s0,0($key0)
+	lw	$s1,4($key0)
+	lw	$s2,8($key0)
+	lw	$s3,12($key0)
+
+	xor	$t0,$t8
+	xor	$t1,$t9
+	xor	$t2,$t10
+	xor	$t3,$t11
+
+	xor	$t0,$t4
+	xor	$t1,$t5
+	xor	$t2,$t6
+	xor	$t3,$t7
+
+	sub	$cnt,1
+	$PTR_ADD $key0,16
+	xor	$s0,$t0
+	xor	$s1,$t1
+	xor	$s2,$t2
+	xor	$s3,$t3
+	.set	noreorder
+	bnez	$cnt,.Loop_enc
+	_xtr	$i0,$s1,16-2
+
+	.set	reorder
+	_xtr	$i1,$s2,16-2
+	_xtr	$i2,$s3,16-2
+	_xtr	$i3,$s0,16-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$t0,2($i0)		# Te4[s1>>16]
+	lbu	$t1,2($i1)		# Te4[s2>>16]
+	lbu	$t2,2($i2)		# Te4[s3>>16]
+	lbu	$t3,2($i3)		# Te4[s0>>16]
+
+	_xtr	$i0,$s2,8-2
+	_xtr	$i1,$s3,8-2
+	_xtr	$i2,$s0,8-2
+	_xtr	$i3,$s1,8-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$t4,2($i0)		# Te4[s2>>8]
+	lbu	$t5,2($i1)		# Te4[s3>>8]
+	lbu	$t6,2($i2)		# Te4[s0>>8]
+	lbu	$t7,2($i3)		# Te4[s1>>8]
+
+	_xtr	$i0,$s0,24-2
+	_xtr	$i1,$s1,24-2
+	_xtr	$i2,$s2,24-2
+	_xtr	$i3,$s3,24-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$t8,2($i0)		# Te4[s0>>24]
+	lbu	$t9,2($i1)		# Te4[s1>>24]
+	lbu	$t10,2($i2)		# Te4[s2>>24]
+	lbu	$t11,2($i3)		# Te4[s3>>24]
+
+	_xtr	$i0,$s3,0-2
+	_xtr	$i1,$s0,0-2
+	_xtr	$i2,$s1,0-2
+	_xtr	$i3,$s2,0-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+
+	_ins	$t0,16
+	_ins	$t1,16
+	_ins	$t2,16
+	_ins	$t3,16
+
+	_ins	$t4,8
+	_ins	$t5,8
+	_ins	$t6,8
+	_ins	$t7,8
+
+	xor	$t0,$t4
+	xor	$t1,$t5
+	xor	$t2,$t6
+	xor	$t3,$t7
+
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$t4,2($i0)		# Te4[s3]
+	lbu	$t5,2($i1)		# Te4[s0]
+	lbu	$t6,2($i2)		# Te4[s1]
+	lbu	$t7,2($i3)		# Te4[s2]
+
+	_ins	$t8,24
+	_ins	$t9,24
+	_ins	$t10,24
+	_ins	$t11,24
+
+	lw	$s0,0($key0)
+	lw	$s1,4($key0)
+	lw	$s2,8($key0)
+	lw	$s3,12($key0)
+
+	xor	$t0,$t8
+	xor	$t1,$t9
+	xor	$t2,$t10
+	xor	$t3,$t11
+
+	_ins	$t4,0
+	_ins	$t5,0
+	_ins	$t6,0
+	_ins	$t7,0
+
+	xor	$t0,$t4
+	xor	$t1,$t5
+	xor	$t2,$t6
+	xor	$t3,$t7
+
+	xor	$s0,$t0
+	xor	$s1,$t1
+	xor	$s2,$t2
+	xor	$s3,$t3
+
+	jr	$ra
+.end	_mips_AES_encrypt
+
+.align	5
+.globl	AES_encrypt
+.ent	AES_encrypt
+AES_encrypt:
+	.frame	$sp,$FRAMESIZE,$ra
+	.mask	$SAVED_REGS_MASK,-$SZREG
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
+	.cpload	$pf
+___
+$code.=<<___;
+	$PTR_SUB $sp,$FRAMESIZE
+	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
+	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
+	$REG_S	$s11,$FRAMESIZE-3*$SZREG($sp)
+	$REG_S	$s10,$FRAMESIZE-4*$SZREG($sp)
+	$REG_S	$s9,$FRAMESIZE-5*$SZREG($sp)
+	$REG_S	$s8,$FRAMESIZE-6*$SZREG($sp)
+	$REG_S	$s7,$FRAMESIZE-7*$SZREG($sp)
+	$REG_S	$s6,$FRAMESIZE-8*$SZREG($sp)
+	$REG_S	$s5,$FRAMESIZE-9*$SZREG($sp)
+	$REG_S	$s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
+	$REG_S	\$15,$FRAMESIZE-11*$SZREG($sp)
+	$REG_S	\$14,$FRAMESIZE-12*$SZREG($sp)
+	$REG_S	\$13,$FRAMESIZE-13*$SZREG($sp)
+	$REG_S	\$12,$FRAMESIZE-14*$SZREG($sp)
+	$REG_S	$gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
+	.cplocal	$Tbl
+	.cpsetup	$pf,$zero,AES_encrypt
+___
+$code.=<<___;
+	.set	reorder
+	la	$Tbl,AES_Te		# PIC-ified 'load address'
+
+	lwl	$s0,0+$MSB($inp)
+	lwl	$s1,4+$MSB($inp)
+	lwl	$s2,8+$MSB($inp)
+	lwl	$s3,12+$MSB($inp)
+	lwr	$s0,0+$LSB($inp)
+	lwr	$s1,4+$LSB($inp)
+	lwr	$s2,8+$LSB($inp)
+	lwr	$s3,12+$LSB($inp)
+
+	bal	_mips_AES_encrypt
+
+	swr	$s0,0+$LSB($out)
+	swr	$s1,4+$LSB($out)
+	swr	$s2,8+$LSB($out)
+	swr	$s3,12+$LSB($out)
+	swl	$s0,0+$MSB($out)
+	swl	$s1,4+$MSB($out)
+	swl	$s2,8+$MSB($out)
+	swl	$s3,12+$MSB($out)
+
+	.set	noreorder
+	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
+	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
+	$REG_L	$s11,$FRAMESIZE-3*$SZREG($sp)
+	$REG_L	$s10,$FRAMESIZE-4*$SZREG($sp)
+	$REG_L	$s9,$FRAMESIZE-5*$SZREG($sp)
+	$REG_L	$s8,$FRAMESIZE-6*$SZREG($sp)
+	$REG_L	$s7,$FRAMESIZE-7*$SZREG($sp)
+	$REG_L	$s6,$FRAMESIZE-8*$SZREG($sp)
+	$REG_L	$s5,$FRAMESIZE-9*$SZREG($sp)
+	$REG_L	$s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	\$15,$FRAMESIZE-11*$SZREG($sp)
+	$REG_L	\$14,$FRAMESIZE-12*$SZREG($sp)
+	$REG_L	\$13,$FRAMESIZE-13*$SZREG($sp)
+	$REG_L	\$12,$FRAMESIZE-14*$SZREG($sp)
+	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+	jr	$ra
+	$PTR_ADD $sp,$FRAMESIZE
+.end	AES_encrypt
+___
+
+$code.=<<___;
+.align	5
+.ent	_mips_AES_decrypt
+_mips_AES_decrypt:
+	.frame	$sp,0,$ra
+	.set	reorder
+	lw	$t0,0($key)
+	lw	$t1,4($key)
+	lw	$t2,8($key)
+	lw	$t3,12($key)
+	lw	$cnt,240($key)
+	$PTR_ADD $key0,$key,16
+
+	xor	$s0,$t0
+	xor	$s1,$t1
+	xor	$s2,$t2
+	xor	$s3,$t3
+
+	sub	$cnt,1
+	_xtr	$i0,$s3,16-2
+.Loop_dec:
+	_xtr	$i1,$s0,16-2
+	_xtr	$i2,$s1,16-2
+	_xtr	$i3,$s2,16-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lwl	$t0,3($i0)		# Td1[s3>>16]
+	lwl	$t1,3($i1)		# Td1[s0>>16]
+	lwl	$t2,3($i2)		# Td1[s1>>16]
+	lwl	$t3,3($i3)		# Td1[s2>>16]
+	lwr	$t0,2($i0)		# Td1[s3>>16]
+	lwr	$t1,2($i1)		# Td1[s0>>16]
+	lwr	$t2,2($i2)		# Td1[s1>>16]
+	lwr	$t3,2($i3)		# Td1[s2>>16]
+
+	_xtr	$i0,$s2,8-2
+	_xtr	$i1,$s3,8-2
+	_xtr	$i2,$s0,8-2
+	_xtr	$i3,$s1,8-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lwl	$t4,2($i0)		# Td2[s2>>8]
+	lwl	$t5,2($i1)		# Td2[s3>>8]
+	lwl	$t6,2($i2)		# Td2[s0>>8]
+	lwl	$t7,2($i3)		# Td2[s1>>8]
+	lwr	$t4,1($i0)		# Td2[s2>>8]
+	lwr	$t5,1($i1)		# Td2[s3>>8]
+	lwr	$t6,1($i2)		# Td2[s0>>8]
+	lwr	$t7,1($i3)		# Td2[s1>>8]
+
+	_xtr	$i0,$s1,0-2
+	_xtr	$i1,$s2,0-2
+	_xtr	$i2,$s3,0-2
+	_xtr	$i3,$s0,0-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lwl	$t8,1($i0)		# Td3[s1]
+	lwl	$t9,1($i1)		# Td3[s2]
+	lwl	$t10,1($i2)		# Td3[s3]
+	lwl	$t11,1($i3)		# Td3[s0]
+	lwr	$t8,0($i0)		# Td3[s1]
+	lwr	$t9,0($i1)		# Td3[s2]
+	lwr	$t10,0($i2)		# Td3[s3]
+	lwr	$t11,0($i3)		# Td3[s0]
+
+	_xtr	$i0,$s0,24-2
+	_xtr	$i1,$s1,24-2
+	_xtr	$i2,$s2,24-2
+	_xtr	$i3,$s3,24-2
+	and	$i0,0x3fc
+	and	$i1,0x3fc
+	and	$i2,0x3fc
+	and	$i3,0x3fc
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+
+	xor	$t0,$t4
+	xor	$t1,$t5
+	xor	$t2,$t6
+	xor	$t3,$t7
+
+
+	lw	$t4,0($i0)		# Td0[s0>>24]
+	lw	$t5,0($i1)		# Td0[s1>>24]
+	lw	$t6,0($i2)		# Td0[s2>>24]
+	lw	$t7,0($i3)		# Td0[s3>>24]
+
+	lw	$s0,0($key0)
+	lw	$s1,4($key0)
+	lw	$s2,8($key0)
+	lw	$s3,12($key0)
+
+	xor	$t0,$t8
+	xor	$t1,$t9
+	xor	$t2,$t10
+	xor	$t3,$t11
+
+	xor	$t0,$t4
+	xor	$t1,$t5
+	xor	$t2,$t6
+	xor	$t3,$t7
+
+	sub	$cnt,1
+	$PTR_ADD $key0,16
+	xor	$s0,$t0
+	xor	$s1,$t1
+	xor	$s2,$t2
+	xor	$s3,$t3
+	.set	noreorder
+	bnez	$cnt,.Loop_dec
+	_xtr	$i0,$s3,16-2
+
+	.set	reorder
+	lw	$t4,1024($Tbl)		# prefetch Td4
+	lw	$t5,1024+32($Tbl)
+	lw	$t6,1024+64($Tbl)
+	lw	$t7,1024+96($Tbl)
+	lw	$t8,1024+128($Tbl)
+	lw	$t9,1024+160($Tbl)
+	lw	$t10,1024+192($Tbl)
+	lw	$t11,1024+224($Tbl)
+
+	_xtr	$i0,$s3,16
+	_xtr	$i1,$s0,16
+	_xtr	$i2,$s1,16
+	_xtr	$i3,$s2,16
+	and	$i0,0xff
+	and	$i1,0xff
+	and	$i2,0xff
+	and	$i3,0xff
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$t0,1024($i0)		# Td4[s3>>16]
+	lbu	$t1,1024($i1)		# Td4[s0>>16]
+	lbu	$t2,1024($i2)		# Td4[s1>>16]
+	lbu	$t3,1024($i3)		# Td4[s2>>16]
+
+	_xtr	$i0,$s2,8
+	_xtr	$i1,$s3,8
+	_xtr	$i2,$s0,8
+	_xtr	$i3,$s1,8
+	and	$i0,0xff
+	and	$i1,0xff
+	and	$i2,0xff
+	and	$i3,0xff
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$t4,1024($i0)		# Td4[s2>>8]
+	lbu	$t5,1024($i1)		# Td4[s3>>8]
+	lbu	$t6,1024($i2)		# Td4[s0>>8]
+	lbu	$t7,1024($i3)		# Td4[s1>>8]
+
+	_xtr	$i0,$s0,24
+	_xtr	$i1,$s1,24
+	_xtr	$i2,$s2,24
+	_xtr	$i3,$s3,24
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$t8,1024($i0)		# Td4[s0>>24]
+	lbu	$t9,1024($i1)		# Td4[s1>>24]
+	lbu	$t10,1024($i2)		# Td4[s2>>24]
+	lbu	$t11,1024($i3)		# Td4[s3>>24]
+
+	_xtr	$i0,$s1,0
+	_xtr	$i1,$s2,0
+	_xtr	$i2,$s3,0
+	_xtr	$i3,$s0,0
+
+	_ins	$t0,16
+	_ins	$t1,16
+	_ins	$t2,16
+	_ins	$t3,16
+
+	_ins	$t4,8
+	_ins	$t5,8
+	_ins	$t6,8
+	_ins	$t7,8
+
+	xor	$t0,$t4
+	xor	$t1,$t5
+	xor	$t2,$t6
+	xor	$t3,$t7
+
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$t4,1024($i0)		# Td4[s1]
+	lbu	$t5,1024($i1)		# Td4[s2]
+	lbu	$t6,1024($i2)		# Td4[s3]
+	lbu	$t7,1024($i3)		# Td4[s0]
+
+	_ins	$t8,24
+	_ins	$t9,24
+	_ins	$t10,24
+	_ins	$t11,24
+
+	lw	$s0,0($key0)
+	lw	$s1,4($key0)
+	lw	$s2,8($key0)
+	lw	$s3,12($key0)
+
+	_ins	$t4,0
+	_ins	$t5,0
+	_ins	$t6,0
+	_ins	$t7,0
+
+
+	xor	$t0,$t8
+	xor	$t1,$t9
+	xor	$t2,$t10
+	xor	$t3,$t11
+
+	xor	$t0,$t4
+	xor	$t1,$t5
+	xor	$t2,$t6
+	xor	$t3,$t7
+
+	xor	$s0,$t0
+	xor	$s1,$t1
+	xor	$s2,$t2
+	xor	$s3,$t3
+
+	jr	$ra
+.end	_mips_AES_decrypt
+
+.align	5
+.globl	AES_decrypt
+.ent	AES_decrypt
+AES_decrypt:
+	.frame	$sp,$FRAMESIZE,$ra
+	.mask	$SAVED_REGS_MASK,-$SZREG
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
+	.cpload	$pf
+___
+$code.=<<___;
+	$PTR_SUB $sp,$FRAMESIZE
+	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
+	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
+	$REG_S	$s11,$FRAMESIZE-3*$SZREG($sp)
+	$REG_S	$s10,$FRAMESIZE-4*$SZREG($sp)
+	$REG_S	$s9,$FRAMESIZE-5*$SZREG($sp)
+	$REG_S	$s8,$FRAMESIZE-6*$SZREG($sp)
+	$REG_S	$s7,$FRAMESIZE-7*$SZREG($sp)
+	$REG_S	$s6,$FRAMESIZE-8*$SZREG($sp)
+	$REG_S	$s5,$FRAMESIZE-9*$SZREG($sp)
+	$REG_S	$s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
+	$REG_S	\$15,$FRAMESIZE-11*$SZREG($sp)
+	$REG_S	\$14,$FRAMESIZE-12*$SZREG($sp)
+	$REG_S	\$13,$FRAMESIZE-13*$SZREG($sp)
+	$REG_S	\$12,$FRAMESIZE-14*$SZREG($sp)
+	$REG_S	$gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
+	.cplocal	$Tbl
+	.cpsetup	$pf,$zero,AES_decrypt
+___
+$code.=<<___;
+	.set	reorder
+	la	$Tbl,AES_Td		# PIC-ified 'load address'
+
+	lwl	$s0,0+$MSB($inp)
+	lwl	$s1,4+$MSB($inp)
+	lwl	$s2,8+$MSB($inp)
+	lwl	$s3,12+$MSB($inp)
+	lwr	$s0,0+$LSB($inp)
+	lwr	$s1,4+$LSB($inp)
+	lwr	$s2,8+$LSB($inp)
+	lwr	$s3,12+$LSB($inp)
+
+	bal	_mips_AES_decrypt
+
+	swr	$s0,0+$LSB($out)
+	swr	$s1,4+$LSB($out)
+	swr	$s2,8+$LSB($out)
+	swr	$s3,12+$LSB($out)
+	swl	$s0,0+$MSB($out)
+	swl	$s1,4+$MSB($out)
+	swl	$s2,8+$MSB($out)
+	swl	$s3,12+$MSB($out)
+
+	.set	noreorder
+	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
+	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
+	$REG_L	$s11,$FRAMESIZE-3*$SZREG($sp)
+	$REG_L	$s10,$FRAMESIZE-4*$SZREG($sp)
+	$REG_L	$s9,$FRAMESIZE-5*$SZREG($sp)
+	$REG_L	$s8,$FRAMESIZE-6*$SZREG($sp)
+	$REG_L	$s7,$FRAMESIZE-7*$SZREG($sp)
+	$REG_L	$s6,$FRAMESIZE-8*$SZREG($sp)
+	$REG_L	$s5,$FRAMESIZE-9*$SZREG($sp)
+	$REG_L	$s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	\$15,$FRAMESIZE-11*$SZREG($sp)
+	$REG_L	\$14,$FRAMESIZE-12*$SZREG($sp)
+	$REG_L	\$13,$FRAMESIZE-13*$SZREG($sp)
+	$REG_L	\$12,$FRAMESIZE-14*$SZREG($sp)
+	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+	jr	$ra
+	$PTR_ADD $sp,$FRAMESIZE
+.end	AES_decrypt
+___
+}}}
+
+{{{
+my $FRAMESIZE=8*$SZREG;
+my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;
+
+my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
+my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
+my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
+my ($rcon,$cnt)=($gp,$fp);
+
+$code.=<<___;
+.align	5
+.ent	_mips_AES_set_encrypt_key
+_mips_AES_set_encrypt_key:
+	.frame	$sp,0,$ra
+	.set	noreorder
+	beqz	$inp,.Lekey_done
+	li	$t0,-1
+	beqz	$key,.Lekey_done
+	$PTR_ADD $rcon,$Tbl,1024+256
+
+	.set	reorder
+	lwl	$rk0,0+$MSB($inp)	# load 128 bits
+	lwl	$rk1,4+$MSB($inp)
+	lwl	$rk2,8+$MSB($inp)
+	lwl	$rk3,12+$MSB($inp)
+	li	$at,128
+	lwr	$rk0,0+$LSB($inp)
+	lwr	$rk1,4+$LSB($inp)
+	lwr	$rk2,8+$LSB($inp)
+	lwr	$rk3,12+$LSB($inp)
+	.set	noreorder
+	beq	$bits,$at,.L128bits
+	li	$cnt,10
+
+	.set	reorder
+	lwl	$rk4,16+$MSB($inp)	# load 192 bits
+	lwl	$rk5,20+$MSB($inp)
+	li	$at,192
+	lwr	$rk4,16+$LSB($inp)
+	lwr	$rk5,20+$LSB($inp)
+	.set	noreorder
+	beq	$bits,$at,.L192bits
+	li	$cnt,8
+
+	.set	reorder
+	lwl	$rk6,24+$MSB($inp)	# load 256 bits
+	lwl	$rk7,28+$MSB($inp)
+	li	$at,256
+	lwr	$rk6,24+$LSB($inp)
+	lwr	$rk7,28+$LSB($inp)
+	.set	noreorder
+	beq	$bits,$at,.L256bits
+	li	$cnt,7
+
+	b	.Lekey_done
+	li	$t0,-2
+
+.align	4
+.L128bits:
+	.set	reorder
+	srl	$i0,$rk3,16
+	srl	$i1,$rk3,8
+	and	$i0,0xff
+	and	$i1,0xff
+	and	$i2,$rk3,0xff
+	srl	$i3,$rk3,24
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$i0,1024($i0)
+	lbu	$i1,1024($i1)
+	lbu	$i2,1024($i2)
+	lbu	$i3,1024($i3)
+
+	sw	$rk0,0($key)
+	sw	$rk1,4($key)
+	sw	$rk2,8($key)
+	sw	$rk3,12($key)
+	sub	$cnt,1
+	$PTR_ADD $key,16
+
+	_bias	$i0,24
+	_bias	$i1,16
+	_bias	$i2,8
+	_bias	$i3,0
+
+	xor	$rk0,$i0
+	lw	$i0,0($rcon)
+	xor	$rk0,$i1
+	xor	$rk0,$i2
+	xor	$rk0,$i3
+	xor	$rk0,$i0
+
+	xor	$rk1,$rk0
+	xor	$rk2,$rk1
+	xor	$rk3,$rk2
+
+	.set	noreorder
+	bnez	$cnt,.L128bits
+	$PTR_ADD $rcon,4
+
+	sw	$rk0,0($key)
+	sw	$rk1,4($key)
+	sw	$rk2,8($key)
+	li	$cnt,10
+	sw	$rk3,12($key)
+	li	$t0,0
+	sw	$cnt,80($key)
+	b	.Lekey_done
+	$PTR_SUB $key,10*16
+
+.align	4
+.L192bits:
+	.set	reorder
+	srl	$i0,$rk5,16
+	srl	$i1,$rk5,8
+	and	$i0,0xff
+	and	$i1,0xff
+	and	$i2,$rk5,0xff
+	srl	$i3,$rk5,24
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$i0,1024($i0)
+	lbu	$i1,1024($i1)
+	lbu	$i2,1024($i2)
+	lbu	$i3,1024($i3)
+
+	sw	$rk0,0($key)
+	sw	$rk1,4($key)
+	sw	$rk2,8($key)
+	sw	$rk3,12($key)
+	sw	$rk4,16($key)
+	sw	$rk5,20($key)
+	sub	$cnt,1
+	$PTR_ADD $key,24
+
+	_bias	$i0,24
+	_bias	$i1,16
+	_bias	$i2,8
+	_bias	$i3,0
+
+	xor	$rk0,$i0
+	lw	$i0,0($rcon)
+	xor	$rk0,$i1
+	xor	$rk0,$i2
+	xor	$rk0,$i3
+	xor	$rk0,$i0
+
+	xor	$rk1,$rk0
+	xor	$rk2,$rk1
+	xor	$rk3,$rk2
+	xor	$rk4,$rk3
+	xor	$rk5,$rk4
+
+	.set	noreorder
+	bnez	$cnt,.L192bits
+	$PTR_ADD $rcon,4
+
+	sw	$rk0,0($key)
+	sw	$rk1,4($key)
+	sw	$rk2,8($key)
+	li	$cnt,12
+	sw	$rk3,12($key)
+	li	$t0,0
+	sw	$cnt,48($key)
+	b	.Lekey_done
+	$PTR_SUB $key,12*16
+
+.align	4
+.L256bits:
+	.set	reorder
+	srl	$i0,$rk7,16
+	srl	$i1,$rk7,8
+	and	$i0,0xff
+	and	$i1,0xff
+	and	$i2,$rk7,0xff
+	srl	$i3,$rk7,24
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$i0,1024($i0)
+	lbu	$i1,1024($i1)
+	lbu	$i2,1024($i2)
+	lbu	$i3,1024($i3)
+
+	sw	$rk0,0($key)
+	sw	$rk1,4($key)
+	sw	$rk2,8($key)
+	sw	$rk3,12($key)
+	sw	$rk4,16($key)
+	sw	$rk5,20($key)
+	sw	$rk6,24($key)
+	sw	$rk7,28($key)
+	sub	$cnt,1
+
+	_bias	$i0,24
+	_bias	$i1,16
+	_bias	$i2,8
+	_bias	$i3,0
+
+	xor	$rk0,$i0
+	lw	$i0,0($rcon)
+	xor	$rk0,$i1
+	xor	$rk0,$i2
+	xor	$rk0,$i3
+	xor	$rk0,$i0
+
+	xor	$rk1,$rk0
+	xor	$rk2,$rk1
+	xor	$rk3,$rk2
+	beqz	$cnt,.L256bits_done
+
+	srl	$i0,$rk3,24
+	srl	$i1,$rk3,16
+	srl	$i2,$rk3,8
+	and	$i3,$rk3,0xff
+	and	$i1,0xff
+	and	$i2,0xff
+	$PTR_ADD $i0,$Tbl
+	$PTR_ADD $i1,$Tbl
+	$PTR_ADD $i2,$Tbl
+	$PTR_ADD $i3,$Tbl
+	lbu	$i0,1024($i0)
+	lbu	$i1,1024($i1)
+	lbu	$i2,1024($i2)
+	lbu	$i3,1024($i3)
+	sll	$i0,24
+	sll	$i1,16
+	sll	$i2,8
+
+	xor	$rk4,$i0
+	xor	$rk4,$i1
+	xor	$rk4,$i2
+	xor	$rk4,$i3
+
+	xor	$rk5,$rk4
+	xor	$rk6,$rk5
+	xor	$rk7,$rk6
+
+	$PTR_ADD $key,32
+	.set	noreorder
+	b	.L256bits
+	$PTR_ADD $rcon,4
+
+.L256bits_done:
+	sw	$rk0,32($key)
+	sw	$rk1,36($key)
+	sw	$rk2,40($key)
+	li	$cnt,14
+	sw	$rk3,44($key)
+	li	$t0,0
+	sw	$cnt,48($key)
+	$PTR_SUB $key,12*16
+
+.Lekey_done:
+	jr	$ra
+	nop
+.end	_mips_AES_set_encrypt_key
+
+.globl	AES_set_encrypt_key
+.ent	AES_set_encrypt_key
+AES_set_encrypt_key:
+	.frame	$sp,$FRAMESIZE,$ra
+	.mask	$SAVED_REGS_MASK,-$SZREG
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
+	.cpload	$pf
+___
+$code.=<<___;
+	$PTR_SUB $sp,$FRAMESIZE
+	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
+	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
+	$REG_S	$s3,$FRAMESIZE-3*$SZREG($sp)
+	$REG_S	$s2,$FRAMESIZE-4*$SZREG($sp)
+	$REG_S	$s1,$FRAMESIZE-5*$SZREG($sp)
+	$REG_S	$s0,$FRAMESIZE-6*$SZREG($sp)
+	$REG_S	$gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
+	.cplocal	$Tbl
+	.cpsetup	$pf,$zero,AES_set_encrypt_key
+___
+$code.=<<___;
+	.set	reorder
+	la	$Tbl,AES_Te		# PIC-ified 'load address'
+
+	bal	_mips_AES_set_encrypt_key
+
+	.set	noreorder
+	move	$a0,$t0
+	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
+	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$s3,$FRAMESIZE-11*$SZREG($sp)
+	$REG_L	$s2,$FRAMESIZE-12*$SZREG($sp)
+	$REG_L	$s1,$FRAMESIZE-13*$SZREG($sp)
+	$REG_L	$s0,$FRAMESIZE-14*$SZREG($sp)
+	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+	jr	$ra
+	$PTR_ADD $sp,$FRAMESIZE
+.end	AES_set_encrypt_key
+___
+
+my ($head,$tail)=($inp,$bits);
+my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
+my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
+$code.=<<___;
+.align	5
+.globl	AES_set_decrypt_key
+.ent	AES_set_decrypt_key
+AES_set_decrypt_key:
+	.frame	$sp,$FRAMESIZE,$ra
+	.mask	$SAVED_REGS_MASK,-$SZREG
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
+	.cpload	$pf
+___
+$code.=<<___;
+	$PTR_SUB $sp,$FRAMESIZE
+	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
+	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
+	$REG_S	$s3,$FRAMESIZE-3*$SZREG($sp)
+	$REG_S	$s2,$FRAMESIZE-4*$SZREG($sp)
+	$REG_S	$s1,$FRAMESIZE-5*$SZREG($sp)
+	$REG_S	$s0,$FRAMESIZE-6*$SZREG($sp)
+	$REG_S	$gp,$FRAMESIZE-7*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
+	.cplocal	$Tbl
+	.cpsetup	$pf,$zero,AES_set_decrypt_key
+___
+$code.=<<___;
+	.set	reorder
+	la	$Tbl,AES_Te		# PIC-ified 'load address'
+
+	bal	_mips_AES_set_encrypt_key
+
+	bltz	$t0,.Ldkey_done
+
+	sll	$at,$cnt,4
+	$PTR_ADD $head,$key,0
+	$PTR_ADD $tail,$key,$at
+.align	4
+.Lswap:
+	lw	$rk0,0($head)
+	lw	$rk1,4($head)
+	lw	$rk2,8($head)
+	lw	$rk3,12($head)
+	lw	$rk4,0($tail)
+	lw	$rk5,4($tail)
+	lw	$rk6,8($tail)
+	lw	$rk7,12($tail)
+	sw	$rk0,0($tail)
+	sw	$rk1,4($tail)
+	sw	$rk2,8($tail)
+	sw	$rk3,12($tail)
+	$PTR_ADD $head,16
+	$PTR_SUB $tail,16
+	sw	$rk4,-16($head)
+	sw	$rk5,-12($head)
+	sw	$rk6,-8($head)
+	sw	$rk7,-4($head)
+	bne	$head,$tail,.Lswap
+
+	lw	$tp1,16($key)		# modulo-scheduled
+	lui	$x80808080,0x8080
+	sub	$cnt,1
+	or	$x80808080,0x8080
+	sll	$cnt,2
+	$PTR_ADD $key,16
+	lui	$x1b1b1b1b,0x1b1b
+	nor	$x7f7f7f7f,$zero,$x80808080
+	or	$x1b1b1b1b,0x1b1b
+.align	4
+.Lmix:
+	and	$m,$tp1,$x80808080
+	and	$tp2,$tp1,$x7f7f7f7f
+	srl	$tp4,$m,7
+	addu	$tp2,$tp2		# tp2<<1
+	subu	$m,$tp4
+	and	$m,$x1b1b1b1b
+	xor	$tp2,$m
+
+	and	$m,$tp2,$x80808080
+	and	$tp4,$tp2,$x7f7f7f7f
+	srl	$tp8,$m,7
+	addu	$tp4,$tp4		# tp4<<1
+	subu	$m,$tp8
+	and	$m,$x1b1b1b1b
+	xor	$tp4,$m
+
+	and	$m,$tp4,$x80808080
+	and	$tp8,$tp4,$x7f7f7f7f
+	srl	$tp9,$m,7
+	addu	$tp8,$tp8		# tp8<<1
+	subu	$m,$tp9
+	and	$m,$x1b1b1b1b
+	xor	$tp8,$m
+
+	xor	$tp9,$tp8,$tp1
+	xor	$tpe,$tp8,$tp4
+	xor	$tpb,$tp9,$tp2
+	xor	$tpd,$tp9,$tp4
+
+	_ror	$tp1,$tpd,16
+	 xor	$tpe,$tp2
+	_ror	$tp2,$tpd,-16
+	xor	$tpe,$tp1
+	_ror	$tp1,$tp9,8
+	xor	$tpe,$tp2
+	_ror	$tp2,$tp9,-24
+	xor	$tpe,$tp1
+	_ror	$tp1,$tpb,24
+	xor	$tpe,$tp2
+	_ror	$tp2,$tpb,-8
+	xor	$tpe,$tp1
+	lw	$tp1,4($key)		# modulo-scheduled
+	xor	$tpe,$tp2
+	sub	$cnt,1
+	sw	$tpe,0($key)
+	$PTR_ADD $key,4
+	bnez	$cnt,.Lmix
+
+	li	$t0,0
+.Ldkey_done:
+	.set	noreorder
+	move	$a0,$t0
+	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
+	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$s3,$FRAMESIZE-11*$SZREG($sp)
+	$REG_L	$s2,$FRAMESIZE-12*$SZREG($sp)
+	$REG_L	$s1,$FRAMESIZE-13*$SZREG($sp)
+	$REG_L	$s0,$FRAMESIZE-14*$SZREG($sp)
+	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+	jr	$ra
+	$PTR_ADD $sp,$FRAMESIZE
+.end	AES_set_decrypt_key
+___
+}}}
+
+######################################################################
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+.rdata
+.align	6
+AES_Te:
+.byte	0xc6,0x63,0x63,0xa5,	0xf8,0x7c,0x7c,0x84	# Te0
+.byte	0xee,0x77,0x77,0x99,	0xf6,0x7b,0x7b,0x8d
+.byte	0xff,0xf2,0xf2,0x0d,	0xd6,0x6b,0x6b,0xbd
+.byte	0xde,0x6f,0x6f,0xb1,	0x91,0xc5,0xc5,0x54
+.byte	0x60,0x30,0x30,0x50,	0x02,0x01,0x01,0x03
+.byte	0xce,0x67,0x67,0xa9,	0x56,0x2b,0x2b,0x7d
+.byte	0xe7,0xfe,0xfe,0x19,	0xb5,0xd7,0xd7,0x62
+.byte	0x4d,0xab,0xab,0xe6,	0xec,0x76,0x76,0x9a
+.byte	0x8f,0xca,0xca,0x45,	0x1f,0x82,0x82,0x9d
+.byte	0x89,0xc9,0xc9,0x40,	0xfa,0x7d,0x7d,0x87
+.byte	0xef,0xfa,0xfa,0x15,	0xb2,0x59,0x59,0xeb
+.byte	0x8e,0x47,0x47,0xc9,	0xfb,0xf0,0xf0,0x0b
+.byte	0x41,0xad,0xad,0xec,	0xb3,0xd4,0xd4,0x67
+.byte	0x5f,0xa2,0xa2,0xfd,	0x45,0xaf,0xaf,0xea
+.byte	0x23,0x9c,0x9c,0xbf,	0x53,0xa4,0xa4,0xf7
+.byte	0xe4,0x72,0x72,0x96,	0x9b,0xc0,0xc0,0x5b
+.byte	0x75,0xb7,0xb7,0xc2,	0xe1,0xfd,0xfd,0x1c
+.byte	0x3d,0x93,0x93,0xae,	0x4c,0x26,0x26,0x6a
+.byte	0x6c,0x36,0x36,0x5a,	0x7e,0x3f,0x3f,0x41
+.byte	0xf5,0xf7,0xf7,0x02,	0x83,0xcc,0xcc,0x4f
+.byte	0x68,0x34,0x34,0x5c,	0x51,0xa5,0xa5,0xf4
+.byte	0xd1,0xe5,0xe5,0x34,	0xf9,0xf1,0xf1,0x08
+.byte	0xe2,0x71,0x71,0x93,	0xab,0xd8,0xd8,0x73
+.byte	0x62,0x31,0x31,0x53,	0x2a,0x15,0x15,0x3f
+.byte	0x08,0x04,0x04,0x0c,	0x95,0xc7,0xc7,0x52
+.byte	0x46,0x23,0x23,0x65,	0x9d,0xc3,0xc3,0x5e
+.byte	0x30,0x18,0x18,0x28,	0x37,0x96,0x96,0xa1
+.byte	0x0a,0x05,0x05,0x0f,	0x2f,0x9a,0x9a,0xb5
+.byte	0x0e,0x07,0x07,0x09,	0x24,0x12,0x12,0x36
+.byte	0x1b,0x80,0x80,0x9b,	0xdf,0xe2,0xe2,0x3d
+.byte	0xcd,0xeb,0xeb,0x26,	0x4e,0x27,0x27,0x69
+.byte	0x7f,0xb2,0xb2,0xcd,	0xea,0x75,0x75,0x9f
+.byte	0x12,0x09,0x09,0x1b,	0x1d,0x83,0x83,0x9e
+.byte	0x58,0x2c,0x2c,0x74,	0x34,0x1a,0x1a,0x2e
+.byte	0x36,0x1b,0x1b,0x2d,	0xdc,0x6e,0x6e,0xb2
+.byte	0xb4,0x5a,0x5a,0xee,	0x5b,0xa0,0xa0,0xfb
+.byte	0xa4,0x52,0x52,0xf6,	0x76,0x3b,0x3b,0x4d
+.byte	0xb7,0xd6,0xd6,0x61,	0x7d,0xb3,0xb3,0xce
+.byte	0x52,0x29,0x29,0x7b,	0xdd,0xe3,0xe3,0x3e
+.byte	0x5e,0x2f,0x2f,0x71,	0x13,0x84,0x84,0x97
+.byte	0xa6,0x53,0x53,0xf5,	0xb9,0xd1,0xd1,0x68
+.byte	0x00,0x00,0x00,0x00,	0xc1,0xed,0xed,0x2c
+.byte	0x40,0x20,0x20,0x60,	0xe3,0xfc,0xfc,0x1f
+.byte	0x79,0xb1,0xb1,0xc8,	0xb6,0x5b,0x5b,0xed
+.byte	0xd4,0x6a,0x6a,0xbe,	0x8d,0xcb,0xcb,0x46
+.byte	0x67,0xbe,0xbe,0xd9,	0x72,0x39,0x39,0x4b
+.byte	0x94,0x4a,0x4a,0xde,	0x98,0x4c,0x4c,0xd4
+.byte	0xb0,0x58,0x58,0xe8,	0x85,0xcf,0xcf,0x4a
+.byte	0xbb,0xd0,0xd0,0x6b,	0xc5,0xef,0xef,0x2a
+.byte	0x4f,0xaa,0xaa,0xe5,	0xed,0xfb,0xfb,0x16
+.byte	0x86,0x43,0x43,0xc5,	0x9a,0x4d,0x4d,0xd7
+.byte	0x66,0x33,0x33,0x55,	0x11,0x85,0x85,0x94
+.byte	0x8a,0x45,0x45,0xcf,	0xe9,0xf9,0xf9,0x10
+.byte	0x04,0x02,0x02,0x06,	0xfe,0x7f,0x7f,0x81
+.byte	0xa0,0x50,0x50,0xf0,	0x78,0x3c,0x3c,0x44
+.byte	0x25,0x9f,0x9f,0xba,	0x4b,0xa8,0xa8,0xe3
+.byte	0xa2,0x51,0x51,0xf3,	0x5d,0xa3,0xa3,0xfe
+.byte	0x80,0x40,0x40,0xc0,	0x05,0x8f,0x8f,0x8a
+.byte	0x3f,0x92,0x92,0xad,	0x21,0x9d,0x9d,0xbc
+.byte	0x70,0x38,0x38,0x48,	0xf1,0xf5,0xf5,0x04
+.byte	0x63,0xbc,0xbc,0xdf,	0x77,0xb6,0xb6,0xc1
+.byte	0xaf,0xda,0xda,0x75,	0x42,0x21,0x21,0x63
+.byte	0x20,0x10,0x10,0x30,	0xe5,0xff,0xff,0x1a
+.byte	0xfd,0xf3,0xf3,0x0e,	0xbf,0xd2,0xd2,0x6d
+.byte	0x81,0xcd,0xcd,0x4c,	0x18,0x0c,0x0c,0x14
+.byte	0x26,0x13,0x13,0x35,	0xc3,0xec,0xec,0x2f
+.byte	0xbe,0x5f,0x5f,0xe1,	0x35,0x97,0x97,0xa2
+.byte	0x88,0x44,0x44,0xcc,	0x2e,0x17,0x17,0x39
+.byte	0x93,0xc4,0xc4,0x57,	0x55,0xa7,0xa7,0xf2
+.byte	0xfc,0x7e,0x7e,0x82,	0x7a,0x3d,0x3d,0x47
+.byte	0xc8,0x64,0x64,0xac,	0xba,0x5d,0x5d,0xe7
+.byte	0x32,0x19,0x19,0x2b,	0xe6,0x73,0x73,0x95
+.byte	0xc0,0x60,0x60,0xa0,	0x19,0x81,0x81,0x98
+.byte	0x9e,0x4f,0x4f,0xd1,	0xa3,0xdc,0xdc,0x7f
+.byte	0x44,0x22,0x22,0x66,	0x54,0x2a,0x2a,0x7e
+.byte	0x3b,0x90,0x90,0xab,	0x0b,0x88,0x88,0x83
+.byte	0x8c,0x46,0x46,0xca,	0xc7,0xee,0xee,0x29
+.byte	0x6b,0xb8,0xb8,0xd3,	0x28,0x14,0x14,0x3c
+.byte	0xa7,0xde,0xde,0x79,	0xbc,0x5e,0x5e,0xe2
+.byte	0x16,0x0b,0x0b,0x1d,	0xad,0xdb,0xdb,0x76
+.byte	0xdb,0xe0,0xe0,0x3b,	0x64,0x32,0x32,0x56
+.byte	0x74,0x3a,0x3a,0x4e,	0x14,0x0a,0x0a,0x1e
+.byte	0x92,0x49,0x49,0xdb,	0x0c,0x06,0x06,0x0a
+.byte	0x48,0x24,0x24,0x6c,	0xb8,0x5c,0x5c,0xe4
+.byte	0x9f,0xc2,0xc2,0x5d,	0xbd,0xd3,0xd3,0x6e
+.byte	0x43,0xac,0xac,0xef,	0xc4,0x62,0x62,0xa6
+.byte	0x39,0x91,0x91,0xa8,	0x31,0x95,0x95,0xa4
+.byte	0xd3,0xe4,0xe4,0x37,	0xf2,0x79,0x79,0x8b
+.byte	0xd5,0xe7,0xe7,0x32,	0x8b,0xc8,0xc8,0x43
+.byte	0x6e,0x37,0x37,0x59,	0xda,0x6d,0x6d,0xb7
+.byte	0x01,0x8d,0x8d,0x8c,	0xb1,0xd5,0xd5,0x64
+.byte	0x9c,0x4e,0x4e,0xd2,	0x49,0xa9,0xa9,0xe0
+.byte	0xd8,0x6c,0x6c,0xb4,	0xac,0x56,0x56,0xfa
+.byte	0xf3,0xf4,0xf4,0x07,	0xcf,0xea,0xea,0x25
+.byte	0xca,0x65,0x65,0xaf,	0xf4,0x7a,0x7a,0x8e
+.byte	0x47,0xae,0xae,0xe9,	0x10,0x08,0x08,0x18
+.byte	0x6f,0xba,0xba,0xd5,	0xf0,0x78,0x78,0x88
+.byte	0x4a,0x25,0x25,0x6f,	0x5c,0x2e,0x2e,0x72
+.byte	0x38,0x1c,0x1c,0x24,	0x57,0xa6,0xa6,0xf1
+.byte	0x73,0xb4,0xb4,0xc7,	0x97,0xc6,0xc6,0x51
+.byte	0xcb,0xe8,0xe8,0x23,	0xa1,0xdd,0xdd,0x7c
+.byte	0xe8,0x74,0x74,0x9c,	0x3e,0x1f,0x1f,0x21
+.byte	0x96,0x4b,0x4b,0xdd,	0x61,0xbd,0xbd,0xdc
+.byte	0x0d,0x8b,0x8b,0x86,	0x0f,0x8a,0x8a,0x85
+.byte	0xe0,0x70,0x70,0x90,	0x7c,0x3e,0x3e,0x42
+.byte	0x71,0xb5,0xb5,0xc4,	0xcc,0x66,0x66,0xaa
+.byte	0x90,0x48,0x48,0xd8,	0x06,0x03,0x03,0x05
+.byte	0xf7,0xf6,0xf6,0x01,	0x1c,0x0e,0x0e,0x12
+.byte	0xc2,0x61,0x61,0xa3,	0x6a,0x35,0x35,0x5f
+.byte	0xae,0x57,0x57,0xf9,	0x69,0xb9,0xb9,0xd0
+.byte	0x17,0x86,0x86,0x91,	0x99,0xc1,0xc1,0x58
+.byte	0x3a,0x1d,0x1d,0x27,	0x27,0x9e,0x9e,0xb9
+.byte	0xd9,0xe1,0xe1,0x38,	0xeb,0xf8,0xf8,0x13
+.byte	0x2b,0x98,0x98,0xb3,	0x22,0x11,0x11,0x33
+.byte	0xd2,0x69,0x69,0xbb,	0xa9,0xd9,0xd9,0x70
+.byte	0x07,0x8e,0x8e,0x89,	0x33,0x94,0x94,0xa7
+.byte	0x2d,0x9b,0x9b,0xb6,	0x3c,0x1e,0x1e,0x22
+.byte	0x15,0x87,0x87,0x92,	0xc9,0xe9,0xe9,0x20
+.byte	0x87,0xce,0xce,0x49,	0xaa,0x55,0x55,0xff
+.byte	0x50,0x28,0x28,0x78,	0xa5,0xdf,0xdf,0x7a
+.byte	0x03,0x8c,0x8c,0x8f,	0x59,0xa1,0xa1,0xf8
+.byte	0x09,0x89,0x89,0x80,	0x1a,0x0d,0x0d,0x17
+.byte	0x65,0xbf,0xbf,0xda,	0xd7,0xe6,0xe6,0x31
+.byte	0x84,0x42,0x42,0xc6,	0xd0,0x68,0x68,0xb8
+.byte	0x82,0x41,0x41,0xc3,	0x29,0x99,0x99,0xb0
+.byte	0x5a,0x2d,0x2d,0x77,	0x1e,0x0f,0x0f,0x11
+.byte	0x7b,0xb0,0xb0,0xcb,	0xa8,0x54,0x54,0xfc
+.byte	0x6d,0xbb,0xbb,0xd6,	0x2c,0x16,0x16,0x3a
+
+.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5	# Te4
+.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+
+.byte	0x01,0x00,0x00,0x00,	0x02,0x00,0x00,0x00	# rcon
+.byte	0x04,0x00,0x00,0x00,	0x08,0x00,0x00,0x00
+.byte	0x10,0x00,0x00,0x00,	0x20,0x00,0x00,0x00
+.byte	0x40,0x00,0x00,0x00,	0x80,0x00,0x00,0x00
+.byte	0x1B,0x00,0x00,0x00,	0x36,0x00,0x00,0x00
+
+.align	6
+AES_Td:
+.byte	0x51,0xf4,0xa7,0x50,	0x7e,0x41,0x65,0x53	# Td0
+.byte	0x1a,0x17,0xa4,0xc3,	0x3a,0x27,0x5e,0x96
+.byte	0x3b,0xab,0x6b,0xcb,	0x1f,0x9d,0x45,0xf1
+.byte	0xac,0xfa,0x58,0xab,	0x4b,0xe3,0x03,0x93
+.byte	0x20,0x30,0xfa,0x55,	0xad,0x76,0x6d,0xf6
+.byte	0x88,0xcc,0x76,0x91,	0xf5,0x02,0x4c,0x25
+.byte	0x4f,0xe5,0xd7,0xfc,	0xc5,0x2a,0xcb,0xd7
+.byte	0x26,0x35,0x44,0x80,	0xb5,0x62,0xa3,0x8f
+.byte	0xde,0xb1,0x5a,0x49,	0x25,0xba,0x1b,0x67
+.byte	0x45,0xea,0x0e,0x98,	0x5d,0xfe,0xc0,0xe1
+.byte	0xc3,0x2f,0x75,0x02,	0x81,0x4c,0xf0,0x12
+.byte	0x8d,0x46,0x97,0xa3,	0x6b,0xd3,0xf9,0xc6
+.byte	0x03,0x8f,0x5f,0xe7,	0x15,0x92,0x9c,0x95
+.byte	0xbf,0x6d,0x7a,0xeb,	0x95,0x52,0x59,0xda
+.byte	0xd4,0xbe,0x83,0x2d,	0x58,0x74,0x21,0xd3
+.byte	0x49,0xe0,0x69,0x29,	0x8e,0xc9,0xc8,0x44
+.byte	0x75,0xc2,0x89,0x6a,	0xf4,0x8e,0x79,0x78
+.byte	0x99,0x58,0x3e,0x6b,	0x27,0xb9,0x71,0xdd
+.byte	0xbe,0xe1,0x4f,0xb6,	0xf0,0x88,0xad,0x17
+.byte	0xc9,0x20,0xac,0x66,	0x7d,0xce,0x3a,0xb4
+.byte	0x63,0xdf,0x4a,0x18,	0xe5,0x1a,0x31,0x82
+.byte	0x97,0x51,0x33,0x60,	0x62,0x53,0x7f,0x45
+.byte	0xb1,0x64,0x77,0xe0,	0xbb,0x6b,0xae,0x84
+.byte	0xfe,0x81,0xa0,0x1c,	0xf9,0x08,0x2b,0x94
+.byte	0x70,0x48,0x68,0x58,	0x8f,0x45,0xfd,0x19
+.byte	0x94,0xde,0x6c,0x87,	0x52,0x7b,0xf8,0xb7
+.byte	0xab,0x73,0xd3,0x23,	0x72,0x4b,0x02,0xe2
+.byte	0xe3,0x1f,0x8f,0x57,	0x66,0x55,0xab,0x2a
+.byte	0xb2,0xeb,0x28,0x07,	0x2f,0xb5,0xc2,0x03
+.byte	0x86,0xc5,0x7b,0x9a,	0xd3,0x37,0x08,0xa5
+.byte	0x30,0x28,0x87,0xf2,	0x23,0xbf,0xa5,0xb2
+.byte	0x02,0x03,0x6a,0xba,	0xed,0x16,0x82,0x5c
+.byte	0x8a,0xcf,0x1c,0x2b,	0xa7,0x79,0xb4,0x92
+.byte	0xf3,0x07,0xf2,0xf0,	0x4e,0x69,0xe2,0xa1
+.byte	0x65,0xda,0xf4,0xcd,	0x06,0x05,0xbe,0xd5
+.byte	0xd1,0x34,0x62,0x1f,	0xc4,0xa6,0xfe,0x8a
+.byte	0x34,0x2e,0x53,0x9d,	0xa2,0xf3,0x55,0xa0
+.byte	0x05,0x8a,0xe1,0x32,	0xa4,0xf6,0xeb,0x75
+.byte	0x0b,0x83,0xec,0x39,	0x40,0x60,0xef,0xaa
+.byte	0x5e,0x71,0x9f,0x06,	0xbd,0x6e,0x10,0x51
+.byte	0x3e,0x21,0x8a,0xf9,	0x96,0xdd,0x06,0x3d
+.byte	0xdd,0x3e,0x05,0xae,	0x4d,0xe6,0xbd,0x46
+.byte	0x91,0x54,0x8d,0xb5,	0x71,0xc4,0x5d,0x05
+.byte	0x04,0x06,0xd4,0x6f,	0x60,0x50,0x15,0xff
+.byte	0x19,0x98,0xfb,0x24,	0xd6,0xbd,0xe9,0x97
+.byte	0x89,0x40,0x43,0xcc,	0x67,0xd9,0x9e,0x77
+.byte	0xb0,0xe8,0x42,0xbd,	0x07,0x89,0x8b,0x88
+.byte	0xe7,0x19,0x5b,0x38,	0x79,0xc8,0xee,0xdb
+.byte	0xa1,0x7c,0x0a,0x47,	0x7c,0x42,0x0f,0xe9
+.byte	0xf8,0x84,0x1e,0xc9,	0x00,0x00,0x00,0x00
+.byte	0x09,0x80,0x86,0x83,	0x32,0x2b,0xed,0x48
+.byte	0x1e,0x11,0x70,0xac,	0x6c,0x5a,0x72,0x4e
+.byte	0xfd,0x0e,0xff,0xfb,	0x0f,0x85,0x38,0x56
+.byte	0x3d,0xae,0xd5,0x1e,	0x36,0x2d,0x39,0x27
+.byte	0x0a,0x0f,0xd9,0x64,	0x68,0x5c,0xa6,0x21
+.byte	0x9b,0x5b,0x54,0xd1,	0x24,0x36,0x2e,0x3a
+.byte	0x0c,0x0a,0x67,0xb1,	0x93,0x57,0xe7,0x0f
+.byte	0xb4,0xee,0x96,0xd2,	0x1b,0x9b,0x91,0x9e
+.byte	0x80,0xc0,0xc5,0x4f,	0x61,0xdc,0x20,0xa2
+.byte	0x5a,0x77,0x4b,0x69,	0x1c,0x12,0x1a,0x16
+.byte	0xe2,0x93,0xba,0x0a,	0xc0,0xa0,0x2a,0xe5
+.byte	0x3c,0x22,0xe0,0x43,	0x12,0x1b,0x17,0x1d
+.byte	0x0e,0x09,0x0d,0x0b,	0xf2,0x8b,0xc7,0xad
+.byte	0x2d,0xb6,0xa8,0xb9,	0x14,0x1e,0xa9,0xc8
+.byte	0x57,0xf1,0x19,0x85,	0xaf,0x75,0x07,0x4c
+.byte	0xee,0x99,0xdd,0xbb,	0xa3,0x7f,0x60,0xfd
+.byte	0xf7,0x01,0x26,0x9f,	0x5c,0x72,0xf5,0xbc
+.byte	0x44,0x66,0x3b,0xc5,	0x5b,0xfb,0x7e,0x34
+.byte	0x8b,0x43,0x29,0x76,	0xcb,0x23,0xc6,0xdc
+.byte	0xb6,0xed,0xfc,0x68,	0xb8,0xe4,0xf1,0x63
+.byte	0xd7,0x31,0xdc,0xca,	0x42,0x63,0x85,0x10
+.byte	0x13,0x97,0x22,0x40,	0x84,0xc6,0x11,0x20
+.byte	0x85,0x4a,0x24,0x7d,	0xd2,0xbb,0x3d,0xf8
+.byte	0xae,0xf9,0x32,0x11,	0xc7,0x29,0xa1,0x6d
+.byte	0x1d,0x9e,0x2f,0x4b,	0xdc,0xb2,0x30,0xf3
+.byte	0x0d,0x86,0x52,0xec,	0x77,0xc1,0xe3,0xd0
+.byte	0x2b,0xb3,0x16,0x6c,	0xa9,0x70,0xb9,0x99
+.byte	0x11,0x94,0x48,0xfa,	0x47,0xe9,0x64,0x22
+.byte	0xa8,0xfc,0x8c,0xc4,	0xa0,0xf0,0x3f,0x1a
+.byte	0x56,0x7d,0x2c,0xd8,	0x22,0x33,0x90,0xef
+.byte	0x87,0x49,0x4e,0xc7,	0xd9,0x38,0xd1,0xc1
+.byte	0x8c,0xca,0xa2,0xfe,	0x98,0xd4,0x0b,0x36
+.byte	0xa6,0xf5,0x81,0xcf,	0xa5,0x7a,0xde,0x28
+.byte	0xda,0xb7,0x8e,0x26,	0x3f,0xad,0xbf,0xa4
+.byte	0x2c,0x3a,0x9d,0xe4,	0x50,0x78,0x92,0x0d
+.byte	0x6a,0x5f,0xcc,0x9b,	0x54,0x7e,0x46,0x62
+.byte	0xf6,0x8d,0x13,0xc2,	0x90,0xd8,0xb8,0xe8
+.byte	0x2e,0x39,0xf7,0x5e,	0x82,0xc3,0xaf,0xf5
+.byte	0x9f,0x5d,0x80,0xbe,	0x69,0xd0,0x93,0x7c
+.byte	0x6f,0xd5,0x2d,0xa9,	0xcf,0x25,0x12,0xb3
+.byte	0xc8,0xac,0x99,0x3b,	0x10,0x18,0x7d,0xa7
+.byte	0xe8,0x9c,0x63,0x6e,	0xdb,0x3b,0xbb,0x7b
+.byte	0xcd,0x26,0x78,0x09,	0x6e,0x59,0x18,0xf4
+.byte	0xec,0x9a,0xb7,0x01,	0x83,0x4f,0x9a,0xa8
+.byte	0xe6,0x95,0x6e,0x65,	0xaa,0xff,0xe6,0x7e
+.byte	0x21,0xbc,0xcf,0x08,	0xef,0x15,0xe8,0xe6
+.byte	0xba,0xe7,0x9b,0xd9,	0x4a,0x6f,0x36,0xce
+.byte	0xea,0x9f,0x09,0xd4,	0x29,0xb0,0x7c,0xd6
+.byte	0x31,0xa4,0xb2,0xaf,	0x2a,0x3f,0x23,0x31
+.byte	0xc6,0xa5,0x94,0x30,	0x35,0xa2,0x66,0xc0
+.byte	0x74,0x4e,0xbc,0x37,	0xfc,0x82,0xca,0xa6
+.byte	0xe0,0x90,0xd0,0xb0,	0x33,0xa7,0xd8,0x15
+.byte	0xf1,0x04,0x98,0x4a,	0x41,0xec,0xda,0xf7
+.byte	0x7f,0xcd,0x50,0x0e,	0x17,0x91,0xf6,0x2f
+.byte	0x76,0x4d,0xd6,0x8d,	0x43,0xef,0xb0,0x4d
+.byte	0xcc,0xaa,0x4d,0x54,	0xe4,0x96,0x04,0xdf
+.byte	0x9e,0xd1,0xb5,0xe3,	0x4c,0x6a,0x88,0x1b
+.byte	0xc1,0x2c,0x1f,0xb8,	0x46,0x65,0x51,0x7f
+.byte	0x9d,0x5e,0xea,0x04,	0x01,0x8c,0x35,0x5d
+.byte	0xfa,0x87,0x74,0x73,	0xfb,0x0b,0x41,0x2e
+.byte	0xb3,0x67,0x1d,0x5a,	0x92,0xdb,0xd2,0x52
+.byte	0xe9,0x10,0x56,0x33,	0x6d,0xd6,0x47,0x13
+.byte	0x9a,0xd7,0x61,0x8c,	0x37,0xa1,0x0c,0x7a
+.byte	0x59,0xf8,0x14,0x8e,	0xeb,0x13,0x3c,0x89
+.byte	0xce,0xa9,0x27,0xee,	0xb7,0x61,0xc9,0x35
+.byte	0xe1,0x1c,0xe5,0xed,	0x7a,0x47,0xb1,0x3c
+.byte	0x9c,0xd2,0xdf,0x59,	0x55,0xf2,0x73,0x3f
+.byte	0x18,0x14,0xce,0x79,	0x73,0xc7,0x37,0xbf
+.byte	0x53,0xf7,0xcd,0xea,	0x5f,0xfd,0xaa,0x5b
+.byte	0xdf,0x3d,0x6f,0x14,	0x78,0x44,0xdb,0x86
+.byte	0xca,0xaf,0xf3,0x81,	0xb9,0x68,0xc4,0x3e
+.byte	0x38,0x24,0x34,0x2c,	0xc2,0xa3,0x40,0x5f
+.byte	0x16,0x1d,0xc3,0x72,	0xbc,0xe2,0x25,0x0c
+.byte	0x28,0x3c,0x49,0x8b,	0xff,0x0d,0x95,0x41
+.byte	0x39,0xa8,0x01,0x71,	0x08,0x0c,0xb3,0xde
+.byte	0xd8,0xb4,0xe4,0x9c,	0x64,0x56,0xc1,0x90
+.byte	0x7b,0xcb,0x84,0x61,	0xd5,0x32,0xb6,0x70
+.byte	0x48,0x6c,0x5c,0x74,	0xd0,0xb8,0x57,0x42
+
+.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38	# Td4
+.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+___
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	# made-up _instructions, _xtr, _ins, _ror and _bias, cope
+	# with byte order dependencies...
+	if (/^\s+_/) {
+	    s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/;
+
+	    s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/
+		sprintf("srl\t$1,$2,%d",$big_endian ?	eval($3)
+					:		eval("24-$3"))/e or
+	    s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+		sprintf("sll\t$1,$2,%d",$big_endian ?	eval($3)
+					:		eval("24-$3"))/e or
+	    s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/
+		sprintf("srl\t$1,$2,%d",$big_endian ?	eval($3)
+					:		eval("$3*-1"))/e or
+	    s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/
+		sprintf("sll\t$1,$2,%d",$big_endian ?	eval($3)
+					:		eval("($3-16)&31"))/e;
+
+	    s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/
+		sprintf("sll\t$1,$2,$3")/e				or
+	    s/srl\s+(\$[0-9]+),(\$[0-9]+),0/
+		sprintf("and\t$1,$2,0xff")/e				or
+	    s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/;
+	}
+
+	# convert lwl/lwr and swr/swl to little-endian order
+	if (!$big_endian && /^\s+[sl]w[lr]\s+/) {
+	    s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/
+		sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e	or
+	    s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/
+		sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e;
+	}
+
+	print $_,"\n";
+}
+
+close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-parisc.pl b/src/lib/libcrypto/aes/asm/aes-parisc.pl
new file mode 100644
index 0000000000..c36b6a2270
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aes-parisc.pl
@@ -0,0 +1,1021 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# AES for PA-RISC.
+#
+# June 2009.
+#
+# The module is mechanical transliteration of aes-sparcv9.pl, but with
+# a twist: S-boxes are compressed even further down to 1K+256B. On
+# PA-7100LC performance is ~40% better than gcc 3.2 generated code and
+# is about 33 cycles per byte processed with 128-bit key. Newer CPUs
+# perform at 16 cycles per byte. It's not faster than code generated
+# by vendor compiler, but recall that it has compressed S-boxes, which
+# requires extra processing.
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+} else {
+	$LEVEL		="1.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+}
+
+$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
+				#                 [+ argument transfer]
+$inp="%r26";	# arg0
+$out="%r25";	# arg1
+$key="%r24";	# arg2
+
+($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4");
+($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8");
+
+($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7,
+ $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) =
+("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16",
+"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26");
+
+$tbl="%r28";
+$rounds="%r29";
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+	.ALIGN	64
+AES_encrypt
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
+	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
+	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
+	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
+	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
+	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
+	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+	blr	%r0,$tbl
+	ldi	3,$t0
+L\$enc_pic
+	andcm	$tbl,$t0,$tbl
+	ldo	L\$AES_Te-L\$enc_pic($tbl),$tbl
+
+	and	$inp,$t0,$t0
+	sub	$inp,$t0,$inp
+	ldw	0($inp),$s0
+	ldw	4($inp),$s1
+	ldw	8($inp),$s2
+	comib,=	0,$t0,L\$enc_inp_aligned
+	ldw	12($inp),$s3
+
+	sh3addl	$t0,%r0,$t0
+	subi	32,$t0,$t0
+	mtctl	$t0,%cr11
+	ldw	16($inp),$t1
+	vshd	$s0,$s1,$s0
+	vshd	$s1,$s2,$s1
+	vshd	$s2,$s3,$s2
+	vshd	$s3,$t1,$s3
+
+L\$enc_inp_aligned
+	bl	_parisc_AES_encrypt,%r31
+	nop
+
+	extru,<> $out,31,2,%r0
+	b	L\$enc_out_aligned
+	nop
+
+	_srm	$s0,24,$acc0
+	_srm	$s0,16,$acc1
+	stb	$acc0,0($out)
+	_srm	$s0,8,$acc2
+	stb	$acc1,1($out)
+	_srm	$s1,24,$acc4
+	stb	$acc2,2($out)
+	_srm	$s1,16,$acc5
+	stb	$s0,3($out)
+	_srm	$s1,8,$acc6
+	stb	$acc4,4($out)
+	_srm	$s2,24,$acc0
+	stb	$acc5,5($out)
+	_srm	$s2,16,$acc1
+	stb	$acc6,6($out)
+	_srm	$s2,8,$acc2
+	stb	$s1,7($out)
+	_srm	$s3,24,$acc4
+	stb	$acc0,8($out)
+	_srm	$s3,16,$acc5
+	stb	$acc1,9($out)
+	_srm	$s3,8,$acc6
+	stb	$acc2,10($out)
+	stb	$s2,11($out)
+	stb	$acc4,12($out)
+	stb	$acc5,13($out)
+	stb	$acc6,14($out)
+	b	L\$enc_done
+	stb	$s3,15($out)
+
+L\$enc_out_aligned
+	stw	$s0,0($out)
+	stw	$s1,4($out)
+	stw	$s2,8($out)
+	stw	$s3,12($out)
+
+L\$enc_done
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
+	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
+	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
+	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
+	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
+	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
+	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+
+	.ALIGN	16
+_parisc_AES_encrypt
+	.PROC
+	.CALLINFO	MILLICODE
+	.ENTRY
+	ldw	240($key),$rounds
+	ldw	0($key),$t0
+	ldw	4($key),$t1
+	ldw	8($key),$t2
+	_srm	$rounds,1,$rounds
+	xor	$t0,$s0,$s0
+	ldw	12($key),$t3
+	_srm	$s0,24,$acc0
+	xor	$t1,$s1,$s1
+	ldw	16($key),$t0
+	_srm	$s1,16,$acc1
+	xor	$t2,$s2,$s2
+	ldw	20($key),$t1
+	xor	$t3,$s3,$s3
+	ldw	24($key),$t2
+	ldw	28($key),$t3
+L\$enc_loop
+	_srm	$s2,8,$acc2
+	ldwx,s	$acc0($tbl),$acc0
+	_srm	$s3,0,$acc3
+	ldwx,s	$acc1($tbl),$acc1
+	_srm	$s1,24,$acc4
+	ldwx,s	$acc2($tbl),$acc2
+	_srm	$s2,16,$acc5
+	ldwx,s	$acc3($tbl),$acc3
+	_srm	$s3,8,$acc6
+	ldwx,s	$acc4($tbl),$acc4
+	_srm	$s0,0,$acc7
+	ldwx,s	$acc5($tbl),$acc5
+	_srm	$s2,24,$acc8
+	ldwx,s	$acc6($tbl),$acc6
+	_srm	$s3,16,$acc9
+	ldwx,s	$acc7($tbl),$acc7
+	_srm	$s0,8,$acc10
+	ldwx,s	$acc8($tbl),$acc8
+	_srm	$s1,0,$acc11
+	ldwx,s	$acc9($tbl),$acc9
+	_srm	$s3,24,$acc12
+	ldwx,s	$acc10($tbl),$acc10
+	_srm	$s0,16,$acc13
+	ldwx,s	$acc11($tbl),$acc11
+	_srm	$s1,8,$acc14
+	ldwx,s	$acc12($tbl),$acc12
+	_srm	$s2,0,$acc15
+	ldwx,s	$acc13($tbl),$acc13
+	ldwx,s	$acc14($tbl),$acc14
+	ldwx,s	$acc15($tbl),$acc15
+	addib,= -1,$rounds,L\$enc_last
+	ldo	32($key),$key
+
+		_ror	$acc1,8,$acc1
+		xor	$acc0,$t0,$t0
+	ldw	0($key),$s0
+		_ror	$acc2,16,$acc2
+		xor	$acc1,$t0,$t0
+	ldw	4($key),$s1
+		_ror	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ldw	8($key),$s2
+		_ror	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ldw	12($key),$s3
+		_ror	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+		_ror	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		_ror	$acc9,8,$acc9
+		xor	$acc6,$t1,$t1
+		_ror	$acc10,16,$acc10
+		xor	$acc7,$t1,$t1
+		_ror	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		_ror	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		_ror	$acc14,16,$acc14
+		xor	$acc10,$t2,$t2
+		_ror	$acc15,24,$acc15
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	_srm	$t0,24,$acc0
+		xor	$acc14,$t3,$t3
+	_srm	$t1,16,$acc1
+		xor	$acc15,$t3,$t3
+
+	_srm	$t2,8,$acc2
+	ldwx,s	$acc0($tbl),$acc0
+	_srm	$t3,0,$acc3
+	ldwx,s	$acc1($tbl),$acc1
+	_srm	$t1,24,$acc4
+	ldwx,s	$acc2($tbl),$acc2
+	_srm	$t2,16,$acc5
+	ldwx,s	$acc3($tbl),$acc3
+	_srm	$t3,8,$acc6
+	ldwx,s	$acc4($tbl),$acc4
+	_srm	$t0,0,$acc7
+	ldwx,s	$acc5($tbl),$acc5
+	_srm	$t2,24,$acc8
+	ldwx,s	$acc6($tbl),$acc6
+	_srm	$t3,16,$acc9
+	ldwx,s	$acc7($tbl),$acc7
+	_srm	$t0,8,$acc10
+	ldwx,s	$acc8($tbl),$acc8
+	_srm	$t1,0,$acc11
+	ldwx,s	$acc9($tbl),$acc9
+	_srm	$t3,24,$acc12
+	ldwx,s	$acc10($tbl),$acc10
+	_srm	$t0,16,$acc13
+	ldwx,s	$acc11($tbl),$acc11
+	_srm	$t1,8,$acc14
+	ldwx,s	$acc12($tbl),$acc12
+	_srm	$t2,0,$acc15
+	ldwx,s	$acc13($tbl),$acc13
+		_ror	$acc1,8,$acc1
+	ldwx,s	$acc14($tbl),$acc14
+
+		_ror	$acc2,16,$acc2
+		xor	$acc0,$s0,$s0
+	ldwx,s	$acc15($tbl),$acc15
+		_ror	$acc3,24,$acc3
+		xor	$acc1,$s0,$s0
+	ldw	16($key),$t0
+		_ror	$acc5,8,$acc5
+		xor	$acc2,$s0,$s0
+	ldw	20($key),$t1
+		_ror	$acc6,16,$acc6
+		xor	$acc3,$s0,$s0
+	ldw	24($key),$t2
+		_ror	$acc7,24,$acc7
+		xor	$acc4,$s1,$s1
+	ldw	28($key),$t3
+		_ror	$acc9,8,$acc9
+		xor	$acc5,$s1,$s1
+	ldw	1024+0($tbl),%r0		; prefetch te4
+		_ror	$acc10,16,$acc10
+		xor	$acc6,$s1,$s1
+	ldw	1024+32($tbl),%r0		; prefetch te4
+		_ror	$acc11,24,$acc11
+		xor	$acc7,$s1,$s1
+	ldw	1024+64($tbl),%r0		; prefetch te4
+		_ror	$acc13,8,$acc13
+		xor	$acc8,$s2,$s2
+	ldw	1024+96($tbl),%r0		; prefetch te4
+		_ror	$acc14,16,$acc14
+		xor	$acc9,$s2,$s2
+	ldw	1024+128($tbl),%r0		; prefetch te4
+		_ror	$acc15,24,$acc15
+		xor	$acc10,$s2,$s2
+	ldw	1024+160($tbl),%r0		; prefetch te4
+	_srm	$s0,24,$acc0
+		xor	$acc11,$s2,$s2
+	ldw	1024+192($tbl),%r0		; prefetch te4
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$s3,$s3
+	ldw	1024+224($tbl),%r0		; prefetch te4
+	_srm	$s1,16,$acc1
+		xor	$acc14,$s3,$s3
+	b	L\$enc_loop
+		xor	$acc15,$s3,$s3
+
+	.ALIGN	16
+L\$enc_last
+	ldo	1024($tbl),$rounds
+		_ror	$acc1,8,$acc1
+		xor	$acc0,$t0,$t0
+	ldw	0($key),$s0
+		_ror	$acc2,16,$acc2
+		xor	$acc1,$t0,$t0
+	ldw	4($key),$s1
+		_ror	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ldw	8($key),$s2
+		_ror	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ldw	12($key),$s3
+		_ror	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+		_ror	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		_ror	$acc9,8,$acc9
+		xor	$acc6,$t1,$t1
+		_ror	$acc10,16,$acc10
+		xor	$acc7,$t1,$t1
+		_ror	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		_ror	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		_ror	$acc14,16,$acc14
+		xor	$acc10,$t2,$t2
+		_ror	$acc15,24,$acc15
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	_srm	$t0,24,$acc0
+		xor	$acc14,$t3,$t3
+	_srm	$t1,16,$acc1
+		xor	$acc15,$t3,$t3
+
+	_srm	$t2,8,$acc2
+	ldbx	$acc0($rounds),$acc0
+	_srm	$t1,24,$acc4
+	ldbx	$acc1($rounds),$acc1
+	_srm	$t2,16,$acc5
+	_srm	$t3,0,$acc3
+	ldbx	$acc2($rounds),$acc2
+	ldbx	$acc3($rounds),$acc3
+	_srm	$t3,8,$acc6
+	ldbx	$acc4($rounds),$acc4
+	_srm	$t2,24,$acc8
+	ldbx	$acc5($rounds),$acc5
+	_srm	$t3,16,$acc9
+	_srm	$t0,0,$acc7
+	ldbx	$acc6($rounds),$acc6
+	ldbx	$acc7($rounds),$acc7
+	_srm	$t0,8,$acc10
+	ldbx	$acc8($rounds),$acc8
+	_srm	$t3,24,$acc12
+	ldbx	$acc9($rounds),$acc9
+	_srm	$t0,16,$acc13
+	_srm	$t1,0,$acc11
+	ldbx	$acc10($rounds),$acc10
+	_srm	$t1,8,$acc14
+	ldbx	$acc11($rounds),$acc11
+	ldbx	$acc12($rounds),$acc12
+	ldbx	$acc13($rounds),$acc13
+	_srm	$t2,0,$acc15
+	ldbx	$acc14($rounds),$acc14
+
+		dep	$acc0,7,8,$acc3
+	ldbx	$acc15($rounds),$acc15
+		dep	$acc4,7,8,$acc7
+		dep	$acc1,15,8,$acc3
+		dep	$acc5,15,8,$acc7
+		dep	$acc2,23,8,$acc3
+		dep	$acc6,23,8,$acc7
+		xor	$acc3,$s0,$s0
+		xor	$acc7,$s1,$s1
+		dep	$acc8,7,8,$acc11
+		dep	$acc12,7,8,$acc15
+		dep	$acc9,15,8,$acc11
+		dep	$acc13,15,8,$acc15
+		dep	$acc10,23,8,$acc11
+		dep	$acc14,23,8,$acc15
+		xor	$acc11,$s2,$s2
+
+	bv	(%r31)
+	.EXIT
+		xor	$acc15,$s3,$s3
+	.PROCEND
+
+	.ALIGN	64
+L\$AES_Te
+	.WORD	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
+	.WORD	0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554
+	.WORD	0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d
+	.WORD	0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a
+	.WORD	0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87
+	.WORD	0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b
+	.WORD	0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea
+	.WORD	0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b
+	.WORD	0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a
+	.WORD	0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f
+	.WORD	0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108
+	.WORD	0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f
+	.WORD	0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e
+	.WORD	0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5
+	.WORD	0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d
+	.WORD	0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f
+	.WORD	0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e
+	.WORD	0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb
+	.WORD	0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce
+	.WORD	0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497
+	.WORD	0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c
+	.WORD	0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed
+	.WORD	0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b
+	.WORD	0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a
+	.WORD	0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16
+	.WORD	0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594
+	.WORD	0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81
+	.WORD	0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3
+	.WORD	0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a
+	.WORD	0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504
+	.WORD	0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163
+	.WORD	0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d
+	.WORD	0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f
+	.WORD	0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739
+	.WORD	0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47
+	.WORD	0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395
+	.WORD	0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f
+	.WORD	0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883
+	.WORD	0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c
+	.WORD	0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76
+	.WORD	0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e
+	.WORD	0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4
+	.WORD	0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6
+	.WORD	0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b
+	.WORD	0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7
+	.WORD	0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0
+	.WORD	0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25
+	.WORD	0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818
+	.WORD	0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72
+	.WORD	0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651
+	.WORD	0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21
+	.WORD	0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85
+	.WORD	0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa
+	.WORD	0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12
+	.WORD	0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0
+	.WORD	0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9
+	.WORD	0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133
+	.WORD	0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7
+	.WORD	0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920
+	.WORD	0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a
+	.WORD	0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17
+	.WORD	0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8
+	.WORD	0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11
+	.WORD	0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a
+	.BYTE	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+	.BYTE	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+	.BYTE	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+	.BYTE	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+	.BYTE	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+	.BYTE	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+	.BYTE	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+	.BYTE	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+	.BYTE	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+	.BYTE	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+	.BYTE	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+	.BYTE	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+	.BYTE	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+	.BYTE	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+	.BYTE	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+	.BYTE	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+	.BYTE	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+	.BYTE	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+	.BYTE	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+	.BYTE	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+	.BYTE	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+	.BYTE	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+	.BYTE	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+	.BYTE	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+	.BYTE	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+	.BYTE	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+	.BYTE	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+	.BYTE	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+	.BYTE	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+	.BYTE	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+	.BYTE	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+	.BYTE	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+___
+
+$code.=<<___;
+	.EXPORT	AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+	.ALIGN	16
+AES_decrypt
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
+	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
+	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
+	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
+	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
+	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
+	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+	blr	%r0,$tbl
+	ldi	3,$t0
+L\$dec_pic
+	andcm	$tbl,$t0,$tbl
+	ldo	L\$AES_Td-L\$dec_pic($tbl),$tbl
+
+	and	$inp,$t0,$t0
+	sub	$inp,$t0,$inp
+	ldw	0($inp),$s0
+	ldw	4($inp),$s1
+	ldw	8($inp),$s2
+	comib,=	0,$t0,L\$dec_inp_aligned
+	ldw	12($inp),$s3
+
+	sh3addl	$t0,%r0,$t0
+	subi	32,$t0,$t0
+	mtctl	$t0,%cr11
+	ldw	16($inp),$t1
+	vshd	$s0,$s1,$s0
+	vshd	$s1,$s2,$s1
+	vshd	$s2,$s3,$s2
+	vshd	$s3,$t1,$s3
+
+L\$dec_inp_aligned
+	bl	_parisc_AES_decrypt,%r31
+	nop
+
+	extru,<> $out,31,2,%r0
+	b	L\$dec_out_aligned
+	nop
+
+	_srm	$s0,24,$acc0
+	_srm	$s0,16,$acc1
+	stb	$acc0,0($out)
+	_srm	$s0,8,$acc2
+	stb	$acc1,1($out)
+	_srm	$s1,24,$acc4
+	stb	$acc2,2($out)
+	_srm	$s1,16,$acc5
+	stb	$s0,3($out)
+	_srm	$s1,8,$acc6
+	stb	$acc4,4($out)
+	_srm	$s2,24,$acc0
+	stb	$acc5,5($out)
+	_srm	$s2,16,$acc1
+	stb	$acc6,6($out)
+	_srm	$s2,8,$acc2
+	stb	$s1,7($out)
+	_srm	$s3,24,$acc4
+	stb	$acc0,8($out)
+	_srm	$s3,16,$acc5
+	stb	$acc1,9($out)
+	_srm	$s3,8,$acc6
+	stb	$acc2,10($out)
+	stb	$s2,11($out)
+	stb	$acc4,12($out)
+	stb	$acc5,13($out)
+	stb	$acc6,14($out)
+	b	L\$dec_done
+	stb	$s3,15($out)
+
+L\$dec_out_aligned
+	stw	$s0,0($out)
+	stw	$s1,4($out)
+	stw	$s2,8($out)
+	stw	$s3,12($out)
+
+L\$dec_done
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
+	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
+	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
+	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
+	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
+	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
+	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+
+	.ALIGN	16
+_parisc_AES_decrypt
+	.PROC
+	.CALLINFO	MILLICODE
+	.ENTRY
+	ldw	240($key),$rounds
+	ldw	0($key),$t0
+	ldw	4($key),$t1
+	ldw	8($key),$t2
+	ldw	12($key),$t3
+	_srm	$rounds,1,$rounds
+	xor	$t0,$s0,$s0
+	ldw	16($key),$t0
+	xor	$t1,$s1,$s1
+	ldw	20($key),$t1
+	_srm	$s0,24,$acc0
+	xor	$t2,$s2,$s2
+	ldw	24($key),$t2
+	xor	$t3,$s3,$s3
+	ldw	28($key),$t3
+	_srm	$s3,16,$acc1
+L\$dec_loop
+	_srm	$s2,8,$acc2
+	ldwx,s	$acc0($tbl),$acc0
+	_srm	$s1,0,$acc3
+	ldwx,s	$acc1($tbl),$acc1
+	_srm	$s1,24,$acc4
+	ldwx,s	$acc2($tbl),$acc2
+	_srm	$s0,16,$acc5
+	ldwx,s	$acc3($tbl),$acc3
+	_srm	$s3,8,$acc6
+	ldwx,s	$acc4($tbl),$acc4
+	_srm	$s2,0,$acc7
+	ldwx,s	$acc5($tbl),$acc5
+	_srm	$s2,24,$acc8
+	ldwx,s	$acc6($tbl),$acc6
+	_srm	$s1,16,$acc9
+	ldwx,s	$acc7($tbl),$acc7
+	_srm	$s0,8,$acc10
+	ldwx,s	$acc8($tbl),$acc8
+	_srm	$s3,0,$acc11
+	ldwx,s	$acc9($tbl),$acc9
+	_srm	$s3,24,$acc12
+	ldwx,s	$acc10($tbl),$acc10
+	_srm	$s2,16,$acc13
+	ldwx,s	$acc11($tbl),$acc11
+	_srm	$s1,8,$acc14
+	ldwx,s	$acc12($tbl),$acc12
+	_srm	$s0,0,$acc15
+	ldwx,s	$acc13($tbl),$acc13
+	ldwx,s	$acc14($tbl),$acc14
+	ldwx,s	$acc15($tbl),$acc15
+	addib,= -1,$rounds,L\$dec_last
+	ldo	32($key),$key
+
+		_ror	$acc1,8,$acc1
+		xor	$acc0,$t0,$t0
+	ldw	0($key),$s0
+		_ror	$acc2,16,$acc2
+		xor	$acc1,$t0,$t0
+	ldw	4($key),$s1
+		_ror	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ldw	8($key),$s2
+		_ror	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ldw	12($key),$s3
+		_ror	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+		_ror	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		_ror	$acc9,8,$acc9
+		xor	$acc6,$t1,$t1
+		_ror	$acc10,16,$acc10
+		xor	$acc7,$t1,$t1
+		_ror	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		_ror	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		_ror	$acc14,16,$acc14
+		xor	$acc10,$t2,$t2
+		_ror	$acc15,24,$acc15
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	_srm	$t0,24,$acc0
+		xor	$acc14,$t3,$t3
+		xor	$acc15,$t3,$t3
+	_srm	$t3,16,$acc1
+
+	_srm	$t2,8,$acc2
+	ldwx,s	$acc0($tbl),$acc0
+	_srm	$t1,0,$acc3
+	ldwx,s	$acc1($tbl),$acc1
+	_srm	$t1,24,$acc4
+	ldwx,s	$acc2($tbl),$acc2
+	_srm	$t0,16,$acc5
+	ldwx,s	$acc3($tbl),$acc3
+	_srm	$t3,8,$acc6
+	ldwx,s	$acc4($tbl),$acc4
+	_srm	$t2,0,$acc7
+	ldwx,s	$acc5($tbl),$acc5
+	_srm	$t2,24,$acc8
+	ldwx,s	$acc6($tbl),$acc6
+	_srm	$t1,16,$acc9
+	ldwx,s	$acc7($tbl),$acc7
+	_srm	$t0,8,$acc10
+	ldwx,s	$acc8($tbl),$acc8
+	_srm	$t3,0,$acc11
+	ldwx,s	$acc9($tbl),$acc9
+	_srm	$t3,24,$acc12
+	ldwx,s	$acc10($tbl),$acc10
+	_srm	$t2,16,$acc13
+	ldwx,s	$acc11($tbl),$acc11
+	_srm	$t1,8,$acc14
+	ldwx,s	$acc12($tbl),$acc12
+	_srm	$t0,0,$acc15
+	ldwx,s	$acc13($tbl),$acc13
+		_ror	$acc1,8,$acc1
+	ldwx,s	$acc14($tbl),$acc14
+
+		_ror	$acc2,16,$acc2
+		xor	$acc0,$s0,$s0
+	ldwx,s	$acc15($tbl),$acc15
+		_ror	$acc3,24,$acc3
+		xor	$acc1,$s0,$s0
+	ldw	16($key),$t0
+		_ror	$acc5,8,$acc5
+		xor	$acc2,$s0,$s0
+	ldw	20($key),$t1
+		_ror	$acc6,16,$acc6
+		xor	$acc3,$s0,$s0
+	ldw	24($key),$t2
+		_ror	$acc7,24,$acc7
+		xor	$acc4,$s1,$s1
+	ldw	28($key),$t3
+		_ror	$acc9,8,$acc9
+		xor	$acc5,$s1,$s1
+	ldw	1024+0($tbl),%r0		; prefetch td4
+		_ror	$acc10,16,$acc10
+		xor	$acc6,$s1,$s1
+	ldw	1024+32($tbl),%r0		; prefetch td4
+		_ror	$acc11,24,$acc11
+		xor	$acc7,$s1,$s1
+	ldw	1024+64($tbl),%r0		; prefetch td4
+		_ror	$acc13,8,$acc13
+		xor	$acc8,$s2,$s2
+	ldw	1024+96($tbl),%r0		; prefetch td4
+		_ror	$acc14,16,$acc14
+		xor	$acc9,$s2,$s2
+	ldw	1024+128($tbl),%r0		; prefetch td4
+		_ror	$acc15,24,$acc15
+		xor	$acc10,$s2,$s2
+	ldw	1024+160($tbl),%r0		; prefetch td4
+	_srm	$s0,24,$acc0
+		xor	$acc11,$s2,$s2
+	ldw	1024+192($tbl),%r0		; prefetch td4
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$s3,$s3
+	ldw	1024+224($tbl),%r0		; prefetch td4
+		xor	$acc14,$s3,$s3
+		xor	$acc15,$s3,$s3
+	b	L\$dec_loop
+	_srm	$s3,16,$acc1
+
+	.ALIGN	16
+L\$dec_last
+	ldo	1024($tbl),$rounds
+		_ror	$acc1,8,$acc1
+		xor	$acc0,$t0,$t0
+	ldw	0($key),$s0
+		_ror	$acc2,16,$acc2
+		xor	$acc1,$t0,$t0
+	ldw	4($key),$s1
+		_ror	$acc3,24,$acc3
+		xor	$acc2,$t0,$t0
+	ldw	8($key),$s2
+		_ror	$acc5,8,$acc5
+		xor	$acc3,$t0,$t0
+	ldw	12($key),$s3
+		_ror	$acc6,16,$acc6
+		xor	$acc4,$t1,$t1
+		_ror	$acc7,24,$acc7
+		xor	$acc5,$t1,$t1
+		_ror	$acc9,8,$acc9
+		xor	$acc6,$t1,$t1
+		_ror	$acc10,16,$acc10
+		xor	$acc7,$t1,$t1
+		_ror	$acc11,24,$acc11
+		xor	$acc8,$t2,$t2
+		_ror	$acc13,8,$acc13
+		xor	$acc9,$t2,$t2
+		_ror	$acc14,16,$acc14
+		xor	$acc10,$t2,$t2
+		_ror	$acc15,24,$acc15
+		xor	$acc11,$t2,$t2
+		xor	$acc12,$acc14,$acc14
+		xor	$acc13,$t3,$t3
+	_srm	$t0,24,$acc0
+		xor	$acc14,$t3,$t3
+		xor	$acc15,$t3,$t3
+	_srm	$t3,16,$acc1
+
+	_srm	$t2,8,$acc2
+	ldbx	$acc0($rounds),$acc0
+	_srm	$t1,24,$acc4
+	ldbx	$acc1($rounds),$acc1
+	_srm	$t0,16,$acc5
+	_srm	$t1,0,$acc3
+	ldbx	$acc2($rounds),$acc2
+	ldbx	$acc3($rounds),$acc3
+	_srm	$t3,8,$acc6
+	ldbx	$acc4($rounds),$acc4
+	_srm	$t2,24,$acc8
+	ldbx	$acc5($rounds),$acc5
+	_srm	$t1,16,$acc9
+	_srm	$t2,0,$acc7
+	ldbx	$acc6($rounds),$acc6
+	ldbx	$acc7($rounds),$acc7
+	_srm	$t0,8,$acc10
+	ldbx	$acc8($rounds),$acc8
+	_srm	$t3,24,$acc12
+	ldbx	$acc9($rounds),$acc9
+	_srm	$t2,16,$acc13
+	_srm	$t3,0,$acc11
+	ldbx	$acc10($rounds),$acc10
+	_srm	$t1,8,$acc14
+	ldbx	$acc11($rounds),$acc11
+	ldbx	$acc12($rounds),$acc12
+	ldbx	$acc13($rounds),$acc13
+	_srm	$t0,0,$acc15
+	ldbx	$acc14($rounds),$acc14
+
+		dep	$acc0,7,8,$acc3
+	ldbx	$acc15($rounds),$acc15
+		dep	$acc4,7,8,$acc7
+		dep	$acc1,15,8,$acc3
+		dep	$acc5,15,8,$acc7
+		dep	$acc2,23,8,$acc3
+		dep	$acc6,23,8,$acc7
+		xor	$acc3,$s0,$s0
+		xor	$acc7,$s1,$s1
+		dep	$acc8,7,8,$acc11
+		dep	$acc12,7,8,$acc15
+		dep	$acc9,15,8,$acc11
+		dep	$acc13,15,8,$acc15
+		dep	$acc10,23,8,$acc11
+		dep	$acc14,23,8,$acc15
+		xor	$acc11,$s2,$s2
+
+	bv	(%r31)
+	.EXIT
+		xor	$acc15,$s3,$s3
+	.PROCEND
+
+	.ALIGN	64
+L\$AES_Td
+	.WORD	0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
+	.WORD	0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393
+	.WORD	0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25
+	.WORD	0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f
+	.WORD	0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1
+	.WORD	0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6
+	.WORD	0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da
+	.WORD	0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844
+	.WORD	0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd
+	.WORD	0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4
+	.WORD	0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45
+	.WORD	0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94
+	.WORD	0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7
+	.WORD	0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a
+	.WORD	0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5
+	.WORD	0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c
+	.WORD	0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1
+	.WORD	0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a
+	.WORD	0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75
+	.WORD	0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051
+	.WORD	0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46
+	.WORD	0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff
+	.WORD	0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77
+	.WORD	0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb
+	.WORD	0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000
+	.WORD	0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e
+	.WORD	0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927
+	.WORD	0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a
+	.WORD	0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e
+	.WORD	0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16
+	.WORD	0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d
+	.WORD	0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8
+	.WORD	0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd
+	.WORD	0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34
+	.WORD	0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163
+	.WORD	0xd731dcca, 0x42638510, 0x13972240, 0x84c61120
+	.WORD	0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d
+	.WORD	0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0
+	.WORD	0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422
+	.WORD	0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef
+	.WORD	0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36
+	.WORD	0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4
+	.WORD	0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662
+	.WORD	0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5
+	.WORD	0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3
+	.WORD	0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b
+	.WORD	0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8
+	.WORD	0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6
+	.WORD	0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6
+	.WORD	0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0
+	.WORD	0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815
+	.WORD	0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f
+	.WORD	0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df
+	.WORD	0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f
+	.WORD	0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e
+	.WORD	0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713
+	.WORD	0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89
+	.WORD	0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c
+	.WORD	0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf
+	.WORD	0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86
+	.WORD	0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f
+	.WORD	0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541
+	.WORD	0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190
+	.WORD	0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742
+	.BYTE	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+	.BYTE	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+	.BYTE	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+	.BYTE	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+	.BYTE	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+	.BYTE	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+	.BYTE	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+	.BYTE	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+	.BYTE	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+	.BYTE	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+	.BYTE	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+	.BYTE	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+	.BYTE	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+	.BYTE	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+	.BYTE	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+	.BYTE	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+	.BYTE	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+	.BYTE	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+	.BYTE	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+	.BYTE	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+	.BYTE	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+	.BYTE	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+	.BYTE	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+	.BYTE	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+	.BYTE	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+	.BYTE	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+	.BYTE	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+	.BYTE	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+	.BYTE	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+	.BYTE	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+	.BYTE	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+	.BYTE	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+	.STRINGZ "AES for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	# translate made up instructons: _ror, _srm
+	s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/				or
+
+	s/_srm(\s+%r[0-9]+),([0-9]+),/
+		$SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2)
+		:            sprintf("extrd,u%s,%d,8,",$1,63-$2)/e;
+
+	s/,\*/,/ if ($SIZE_T==4);
+	print $_,"\n";
+}
+close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl
index f82c5e1814..7c52cbe5f9 100644
--- a/src/lib/libcrypto/aes/asm/aes-ppc.pl
+++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl
@@ -7,7 +7,7 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 
-# Needs more work: key setup, page boundaries, CBC routine...
+# Needs more work: key setup, CBC routine...
 #
 # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
 # 128-bit key, which is ~40% better than 64-bit code generated by gcc
@@ -18,7 +18,7 @@
 
 # February 2010
 #
-# Rescheduling instructions to favour Power6 pipeline gives 10%
+# Rescheduling instructions to favour Power6 pipeline gave 10%
 # performance improvement on the platfrom in question (and marginal
 # improvement even on others). It should be noted that Power6 fails
 # to process byte in 18 cycles, only in 23, because it fails to issue
@@ -33,11 +33,13 @@ $flavour = shift;
 
 if ($flavour =~ /64/) {
 	$SIZE_T	=8;
+	$LRSAVE	=2*$SIZE_T;
 	$STU	="stdu";
 	$POP	="ld";
 	$PUSH	="std";
 } elsif ($flavour =~ /32/) {
 	$SIZE_T	=4;
+	$LRSAVE	=$SIZE_T;
 	$STU	="stwu";
 	$POP	="lwz";
 	$PUSH	="stw";
@@ -116,15 +118,19 @@ LAES_Te:
 	addi	$Tbl0,$Tbl0,`128-8`
 	mtlr	r0
 	blr
-	.space	`32-24`
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+	.space	`64-9*4`
 LAES_Td:
 	mflr	r0
 	bcl	20,31,\$+4
 	mflr	$Tbl0	;    vvvvvvvv "distance" between . and 1st data entry
-	addi	$Tbl0,$Tbl0,`128-8-32+2048+256`
+	addi	$Tbl0,$Tbl0,`128-64-8+2048+256`
 	mtlr	r0
 	blr
-	.space	`128-32-24`
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+	.space	`128-64-9*4`
 ___
 &_data_word(
 	0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@@ -328,10 +334,9 @@ $code.=<<___;
 .globl	.AES_encrypt
 .align	7
 .AES_encrypt:
-	mflr	r0
 	$STU	$sp,-$FRAME($sp)
+	mflr	r0
 
-	$PUSH	r0,`$FRAME-$SIZE_T*21`($sp)
 	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
 	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
 	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
@@ -352,7 +357,14 @@ $code.=<<___;
 	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
+	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
+
+	andi.	$t0,$inp,3
+	andi.	$t1,$out,3
+	or.	$t0,$t0,$t1
+	bne	Lenc_unaligned
 
+Lenc_unaligned_ok:
 	lwz	$s0,0($inp)
 	lwz	$s1,4($inp)
 	lwz	$s2,8($inp)
@@ -363,8 +375,80 @@ $code.=<<___;
 	stw	$s1,4($out)
 	stw	$s2,8($out)
 	stw	$s3,12($out)
+	b	Lenc_done
+
+Lenc_unaligned:
+	subfic	$t0,$inp,4096
+	subfic	$t1,$out,4096
+	andi.	$t0,$t0,4096-16
+	beq	Lenc_xpage
+	andi.	$t1,$t1,4096-16
+	bne	Lenc_unaligned_ok
+
+Lenc_xpage:
+	lbz	$acc00,0($inp)
+	lbz	$acc01,1($inp)
+	lbz	$acc02,2($inp)
+	lbz	$s0,3($inp)
+	lbz	$acc04,4($inp)
+	lbz	$acc05,5($inp)
+	lbz	$acc06,6($inp)
+	lbz	$s1,7($inp)
+	lbz	$acc08,8($inp)
+	lbz	$acc09,9($inp)
+	lbz	$acc10,10($inp)
+	insrwi	$s0,$acc00,8,0
+	lbz	$s2,11($inp)
+	insrwi	$s1,$acc04,8,0
+	lbz	$acc12,12($inp)
+	insrwi	$s0,$acc01,8,8
+	lbz	$acc13,13($inp)
+	insrwi	$s1,$acc05,8,8
+	lbz	$acc14,14($inp)
+	insrwi	$s0,$acc02,8,16
+	lbz	$s3,15($inp)
+	insrwi	$s1,$acc06,8,16
+	insrwi	$s2,$acc08,8,0
+	insrwi	$s3,$acc12,8,0
+	insrwi	$s2,$acc09,8,8
+	insrwi	$s3,$acc13,8,8
+	insrwi	$s2,$acc10,8,16
+	insrwi	$s3,$acc14,8,16
+
+	bl	LAES_Te
+	bl	Lppc_AES_encrypt_compact
+
+	extrwi	$acc00,$s0,8,0
+	extrwi	$acc01,$s0,8,8
+	stb	$acc00,0($out)
+	extrwi	$acc02,$s0,8,16
+	stb	$acc01,1($out)
+	stb	$acc02,2($out)
+	extrwi	$acc04,$s1,8,0
+	stb	$s0,3($out)
+	extrwi	$acc05,$s1,8,8
+	stb	$acc04,4($out)
+	extrwi	$acc06,$s1,8,16
+	stb	$acc05,5($out)
+	stb	$acc06,6($out)
+	extrwi	$acc08,$s2,8,0
+	stb	$s1,7($out)
+	extrwi	$acc09,$s2,8,8
+	stb	$acc08,8($out)
+	extrwi	$acc10,$s2,8,16
+	stb	$acc09,9($out)
+	stb	$acc10,10($out)
+	extrwi	$acc12,$s3,8,0
+	stb	$s2,11($out)
+	extrwi	$acc13,$s3,8,8
+	stb	$acc12,12($out)
+	extrwi	$acc14,$s3,8,16
+	stb	$acc13,13($out)
+	stb	$acc14,14($out)
+	stb	$s3,15($out)
 
-	$POP	r0,`$FRAME-$SIZE_T*21`($sp)
+Lenc_done:
+	$POP	r0,`$FRAME+$LRSAVE`($sp)
 	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
 	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
 	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
@@ -388,18 +472,21 @@ $code.=<<___;
 	mtlr	r0
 	addi	$sp,$sp,$FRAME
 	blr
+	.long	0
+	.byte	0,12,4,1,0x80,18,3,0
+	.long	0
 
 .align	5
 Lppc_AES_encrypt:
 	lwz	$acc00,240($key)
-	lwz	$t0,0($key)
-	lwz	$t1,4($key)
-	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	addi	$Tbl1,$Tbl0,3
+	lwz	$t0,0($key)
 	addi	$Tbl2,$Tbl0,2
+	lwz	$t1,4($key)
 	addi	$Tbl3,$Tbl0,1
+	lwz	$t2,8($key)
 	addi	$acc00,$acc00,-1
+	lwz	$t3,12($key)
 	addi	$key,$key,16
 	xor	$s0,$s0,$t0
 	xor	$s1,$s1,$t1
@@ -413,44 +500,44 @@ Lenc_loop:
 	rlwinm	$acc02,$s2,`32-24+3`,21,28
 	rlwinm	$acc03,$s3,`32-24+3`,21,28
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	rlwinm	$acc04,$s1,`32-16+3`,21,28
+	lwz	$t1,4($key)
 	rlwinm	$acc05,$s2,`32-16+3`,21,28
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	rlwinm	$acc06,$s3,`32-16+3`,21,28
+	lwz	$t3,12($key)
 	rlwinm	$acc07,$s0,`32-16+3`,21,28
 	lwzx	$acc00,$Tbl0,$acc00
-	lwzx	$acc01,$Tbl0,$acc01
 	rlwinm	$acc08,$s2,`32-8+3`,21,28
+	lwzx	$acc01,$Tbl0,$acc01
 	rlwinm	$acc09,$s3,`32-8+3`,21,28
 	lwzx	$acc02,$Tbl0,$acc02
-	lwzx	$acc03,$Tbl0,$acc03
 	rlwinm	$acc10,$s0,`32-8+3`,21,28
+	lwzx	$acc03,$Tbl0,$acc03
 	rlwinm	$acc11,$s1,`32-8+3`,21,28
 	lwzx	$acc04,$Tbl1,$acc04
-	lwzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc12,$s3,`0+3`,21,28
+	lwzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc13,$s0,`0+3`,21,28
 	lwzx	$acc06,$Tbl1,$acc06
-	lwzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc14,$s1,`0+3`,21,28
+	lwzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc15,$s2,`0+3`,21,28
 	lwzx	$acc08,$Tbl2,$acc08
-	lwzx	$acc09,$Tbl2,$acc09
 	xor	$t0,$t0,$acc00
+	lwzx	$acc09,$Tbl2,$acc09
 	xor	$t1,$t1,$acc01
 	lwzx	$acc10,$Tbl2,$acc10
-	lwzx	$acc11,$Tbl2,$acc11
 	xor	$t2,$t2,$acc02
+	lwzx	$acc11,$Tbl2,$acc11
 	xor	$t3,$t3,$acc03
 	lwzx	$acc12,$Tbl3,$acc12
-	lwzx	$acc13,$Tbl3,$acc13
 	xor	$t0,$t0,$acc04
+	lwzx	$acc13,$Tbl3,$acc13
 	xor	$t1,$t1,$acc05
 	lwzx	$acc14,$Tbl3,$acc14
-	lwzx	$acc15,$Tbl3,$acc15
 	xor	$t2,$t2,$acc06
+	lwzx	$acc15,$Tbl3,$acc15
 	xor	$t3,$t3,$acc07
 	xor	$t0,$t0,$acc08
 	xor	$t1,$t1,$acc09
@@ -466,60 +553,60 @@ Lenc_loop:
 	addi	$Tbl2,$Tbl0,2048
 	nop
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	rlwinm	$acc00,$s0,`32-24`,24,31
+	lwz	$t1,4($key)
 	rlwinm	$acc01,$s1,`32-24`,24,31
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	rlwinm	$acc02,$s2,`32-24`,24,31
+	lwz	$t3,12($key)
 	rlwinm	$acc03,$s3,`32-24`,24,31
 	lwz	$acc08,`2048+0`($Tbl0)	! prefetch Te4
-	lwz	$acc09,`2048+32`($Tbl0)
 	rlwinm	$acc04,$s1,`32-16`,24,31
+	lwz	$acc09,`2048+32`($Tbl0)
 	rlwinm	$acc05,$s2,`32-16`,24,31
 	lwz	$acc10,`2048+64`($Tbl0)
-	lwz	$acc11,`2048+96`($Tbl0)
 	rlwinm	$acc06,$s3,`32-16`,24,31
+	lwz	$acc11,`2048+96`($Tbl0)
 	rlwinm	$acc07,$s0,`32-16`,24,31
 	lwz	$acc12,`2048+128`($Tbl0)
-	lwz	$acc13,`2048+160`($Tbl0)
 	rlwinm	$acc08,$s2,`32-8`,24,31
+	lwz	$acc13,`2048+160`($Tbl0)
 	rlwinm	$acc09,$s3,`32-8`,24,31
 	lwz	$acc14,`2048+192`($Tbl0)
-	lwz	$acc15,`2048+224`($Tbl0)
 	rlwinm	$acc10,$s0,`32-8`,24,31
+	lwz	$acc15,`2048+224`($Tbl0)
 	rlwinm	$acc11,$s1,`32-8`,24,31
 	lbzx	$acc00,$Tbl2,$acc00
-	lbzx	$acc01,$Tbl2,$acc01
 	rlwinm	$acc12,$s3,`0`,24,31
+	lbzx	$acc01,$Tbl2,$acc01
 	rlwinm	$acc13,$s0,`0`,24,31
 	lbzx	$acc02,$Tbl2,$acc02
-	lbzx	$acc03,$Tbl2,$acc03
 	rlwinm	$acc14,$s1,`0`,24,31
+	lbzx	$acc03,$Tbl2,$acc03
 	rlwinm	$acc15,$s2,`0`,24,31
 	lbzx	$acc04,$Tbl2,$acc04
-	lbzx	$acc05,$Tbl2,$acc05
 	rlwinm	$s0,$acc00,24,0,7
+	lbzx	$acc05,$Tbl2,$acc05
 	rlwinm	$s1,$acc01,24,0,7
 	lbzx	$acc06,$Tbl2,$acc06
-	lbzx	$acc07,$Tbl2,$acc07
 	rlwinm	$s2,$acc02,24,0,7
+	lbzx	$acc07,$Tbl2,$acc07
 	rlwinm	$s3,$acc03,24,0,7
 	lbzx	$acc08,$Tbl2,$acc08
-	lbzx	$acc09,$Tbl2,$acc09
 	rlwimi	$s0,$acc04,16,8,15
+	lbzx	$acc09,$Tbl2,$acc09
 	rlwimi	$s1,$acc05,16,8,15
 	lbzx	$acc10,$Tbl2,$acc10
-	lbzx	$acc11,$Tbl2,$acc11
 	rlwimi	$s2,$acc06,16,8,15
+	lbzx	$acc11,$Tbl2,$acc11
 	rlwimi	$s3,$acc07,16,8,15
 	lbzx	$acc12,$Tbl2,$acc12
-	lbzx	$acc13,$Tbl2,$acc13
 	rlwimi	$s0,$acc08,8,16,23
+	lbzx	$acc13,$Tbl2,$acc13
 	rlwimi	$s1,$acc09,8,16,23
 	lbzx	$acc14,$Tbl2,$acc14
-	lbzx	$acc15,$Tbl2,$acc15
 	rlwimi	$s2,$acc10,8,16,23
+	lbzx	$acc15,$Tbl2,$acc15
 	rlwimi	$s3,$acc11,8,16,23
 	or	$s0,$s0,$acc12
 	or	$s1,$s1,$acc13
@@ -530,29 +617,31 @@ Lenc_loop:
 	xor	$s2,$s2,$t2
 	xor	$s3,$s3,$t3
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .align	4
 Lppc_AES_encrypt_compact:
 	lwz	$acc00,240($key)
-	lwz	$t0,0($key)
-	lwz	$t1,4($key)
-	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	addi	$Tbl1,$Tbl0,2048
+	lwz	$t0,0($key)
 	lis	$mask80,0x8080
+	lwz	$t1,4($key)
 	lis	$mask1b,0x1b1b
-	addi	$key,$key,16
+	lwz	$t2,8($key)
 	ori	$mask80,$mask80,0x8080
+	lwz	$t3,12($key)
 	ori	$mask1b,$mask1b,0x1b1b
+	addi	$key,$key,16
 	mtctr	$acc00
 .align	4
 Lenc_compact_loop:
 	xor	$s0,$s0,$t0
 	xor	$s1,$s1,$t1
-	xor	$s2,$s2,$t2
-	xor	$s3,$s3,$t3
 	rlwinm	$acc00,$s0,`32-24`,24,31
+	xor	$s2,$s2,$t2
 	rlwinm	$acc01,$s1,`32-24`,24,31
+	xor	$s3,$s3,$t3
 	rlwinm	$acc02,$s2,`32-24`,24,31
 	rlwinm	$acc03,$s3,`32-24`,24,31
 	rlwinm	$acc04,$s1,`32-16`,24,31
@@ -560,48 +649,48 @@ Lenc_compact_loop:
 	rlwinm	$acc06,$s3,`32-16`,24,31
 	rlwinm	$acc07,$s0,`32-16`,24,31
 	lbzx	$acc00,$Tbl1,$acc00
-	lbzx	$acc01,$Tbl1,$acc01
 	rlwinm	$acc08,$s2,`32-8`,24,31
+	lbzx	$acc01,$Tbl1,$acc01
 	rlwinm	$acc09,$s3,`32-8`,24,31
 	lbzx	$acc02,$Tbl1,$acc02
-	lbzx	$acc03,$Tbl1,$acc03
 	rlwinm	$acc10,$s0,`32-8`,24,31
+	lbzx	$acc03,$Tbl1,$acc03
 	rlwinm	$acc11,$s1,`32-8`,24,31
 	lbzx	$acc04,$Tbl1,$acc04
-	lbzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc12,$s3,`0`,24,31
+	lbzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc13,$s0,`0`,24,31
 	lbzx	$acc06,$Tbl1,$acc06
-	lbzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc14,$s1,`0`,24,31
+	lbzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc15,$s2,`0`,24,31
 	lbzx	$acc08,$Tbl1,$acc08
-	lbzx	$acc09,$Tbl1,$acc09
 	rlwinm	$s0,$acc00,24,0,7
+	lbzx	$acc09,$Tbl1,$acc09
 	rlwinm	$s1,$acc01,24,0,7
 	lbzx	$acc10,$Tbl1,$acc10
-	lbzx	$acc11,$Tbl1,$acc11
 	rlwinm	$s2,$acc02,24,0,7
+	lbzx	$acc11,$Tbl1,$acc11
 	rlwinm	$s3,$acc03,24,0,7
 	lbzx	$acc12,$Tbl1,$acc12
-	lbzx	$acc13,$Tbl1,$acc13
 	rlwimi	$s0,$acc04,16,8,15
+	lbzx	$acc13,$Tbl1,$acc13
 	rlwimi	$s1,$acc05,16,8,15
 	lbzx	$acc14,$Tbl1,$acc14
-	lbzx	$acc15,$Tbl1,$acc15
 	rlwimi	$s2,$acc06,16,8,15
+	lbzx	$acc15,$Tbl1,$acc15
 	rlwimi	$s3,$acc07,16,8,15
 	rlwimi	$s0,$acc08,8,16,23
 	rlwimi	$s1,$acc09,8,16,23
 	rlwimi	$s2,$acc10,8,16,23
 	rlwimi	$s3,$acc11,8,16,23
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	or	$s0,$s0,$acc12
+	lwz	$t1,4($key)
 	or	$s1,$s1,$acc13
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	or	$s2,$s2,$acc14
+	lwz	$t3,12($key)
 	or	$s3,$s3,$acc15
 
 	addi	$key,$key,16
@@ -612,12 +701,12 @@ Lenc_compact_loop:
 	and	$acc02,$s2,$mask80
 	and	$acc03,$s3,$mask80
 	srwi	$acc04,$acc00,7		# r1>>7
-	srwi	$acc05,$acc01,7
-	srwi	$acc06,$acc02,7
-	srwi	$acc07,$acc03,7
 	andc	$acc08,$s0,$mask80	# r0&0x7f7f7f7f
+	srwi	$acc05,$acc01,7
 	andc	$acc09,$s1,$mask80
+	srwi	$acc06,$acc02,7
 	andc	$acc10,$s2,$mask80
+	srwi	$acc07,$acc03,7
 	andc	$acc11,$s3,$mask80
 	sub	$acc00,$acc00,$acc04	# r1-(r1>>7)
 	sub	$acc01,$acc01,$acc05
@@ -633,32 +722,32 @@ Lenc_compact_loop:
 	and	$acc03,$acc03,$mask1b
 	xor	$acc00,$acc00,$acc08	# r2
 	xor	$acc01,$acc01,$acc09
+	 rotlwi	$acc12,$s0,16		# ROTATE(r0,16)
 	xor	$acc02,$acc02,$acc10
+	 rotlwi	$acc13,$s1,16
 	xor	$acc03,$acc03,$acc11
+	 rotlwi	$acc14,$s2,16
 
-	rotlwi	$acc12,$s0,16		# ROTATE(r0,16)
-	rotlwi	$acc13,$s1,16
-	rotlwi	$acc14,$s2,16
-	rotlwi	$acc15,$s3,16
 	xor	$s0,$s0,$acc00		# r0^r2
+	rotlwi	$acc15,$s3,16
 	xor	$s1,$s1,$acc01
-	xor	$s2,$s2,$acc02
-	xor	$s3,$s3,$acc03
 	rotrwi	$s0,$s0,24		# ROTATE(r2^r0,24)
+	xor	$s2,$s2,$acc02
 	rotrwi	$s1,$s1,24
+	xor	$s3,$s3,$acc03
 	rotrwi	$s2,$s2,24
-	rotrwi	$s3,$s3,24
 	xor	$s0,$s0,$acc00		# ROTATE(r2^r0,24)^r2
+	rotrwi	$s3,$s3,24
 	xor	$s1,$s1,$acc01
 	xor	$s2,$s2,$acc02
 	xor	$s3,$s3,$acc03
 	rotlwi	$acc08,$acc12,8		# ROTATE(r0,24)
-	rotlwi	$acc09,$acc13,8
-	rotlwi	$acc10,$acc14,8
-	rotlwi	$acc11,$acc15,8
 	xor	$s0,$s0,$acc12		#
+	rotlwi	$acc09,$acc13,8
 	xor	$s1,$s1,$acc13
+	rotlwi	$acc10,$acc14,8
 	xor	$s2,$s2,$acc14
+	rotlwi	$acc11,$acc15,8
 	xor	$s3,$s3,$acc15
 	xor	$s0,$s0,$acc08		#
 	xor	$s1,$s1,$acc09
@@ -673,14 +762,15 @@ Lenc_compact_done:
 	xor	$s2,$s2,$t2
 	xor	$s3,$s3,$t3
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .globl	.AES_decrypt
 .align	7
 .AES_decrypt:
-	mflr	r0
 	$STU	$sp,-$FRAME($sp)
+	mflr	r0
 
-	$PUSH	r0,`$FRAME-$SIZE_T*21`($sp)
 	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
 	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
 	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
@@ -701,7 +791,14 @@ Lenc_compact_done:
 	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
+	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
 
+	andi.	$t0,$inp,3
+	andi.	$t1,$out,3
+	or.	$t0,$t0,$t1
+	bne	Ldec_unaligned
+
+Ldec_unaligned_ok:
 	lwz	$s0,0($inp)
 	lwz	$s1,4($inp)
 	lwz	$s2,8($inp)
@@ -712,8 +809,80 @@ Lenc_compact_done:
 	stw	$s1,4($out)
 	stw	$s2,8($out)
 	stw	$s3,12($out)
+	b	Ldec_done
+
+Ldec_unaligned:
+	subfic	$t0,$inp,4096
+	subfic	$t1,$out,4096
+	andi.	$t0,$t0,4096-16
+	beq	Ldec_xpage
+	andi.	$t1,$t1,4096-16
+	bne	Ldec_unaligned_ok
+
+Ldec_xpage:
+	lbz	$acc00,0($inp)
+	lbz	$acc01,1($inp)
+	lbz	$acc02,2($inp)
+	lbz	$s0,3($inp)
+	lbz	$acc04,4($inp)
+	lbz	$acc05,5($inp)
+	lbz	$acc06,6($inp)
+	lbz	$s1,7($inp)
+	lbz	$acc08,8($inp)
+	lbz	$acc09,9($inp)
+	lbz	$acc10,10($inp)
+	insrwi	$s0,$acc00,8,0
+	lbz	$s2,11($inp)
+	insrwi	$s1,$acc04,8,0
+	lbz	$acc12,12($inp)
+	insrwi	$s0,$acc01,8,8
+	lbz	$acc13,13($inp)
+	insrwi	$s1,$acc05,8,8
+	lbz	$acc14,14($inp)
+	insrwi	$s0,$acc02,8,16
+	lbz	$s3,15($inp)
+	insrwi	$s1,$acc06,8,16
+	insrwi	$s2,$acc08,8,0
+	insrwi	$s3,$acc12,8,0
+	insrwi	$s2,$acc09,8,8
+	insrwi	$s3,$acc13,8,8
+	insrwi	$s2,$acc10,8,16
+	insrwi	$s3,$acc14,8,16
+
+	bl	LAES_Td
+	bl	Lppc_AES_decrypt_compact
 
-	$POP	r0,`$FRAME-$SIZE_T*21`($sp)
+	extrwi	$acc00,$s0,8,0
+	extrwi	$acc01,$s0,8,8
+	stb	$acc00,0($out)
+	extrwi	$acc02,$s0,8,16
+	stb	$acc01,1($out)
+	stb	$acc02,2($out)
+	extrwi	$acc04,$s1,8,0
+	stb	$s0,3($out)
+	extrwi	$acc05,$s1,8,8
+	stb	$acc04,4($out)
+	extrwi	$acc06,$s1,8,16
+	stb	$acc05,5($out)
+	stb	$acc06,6($out)
+	extrwi	$acc08,$s2,8,0
+	stb	$s1,7($out)
+	extrwi	$acc09,$s2,8,8
+	stb	$acc08,8($out)
+	extrwi	$acc10,$s2,8,16
+	stb	$acc09,9($out)
+	stb	$acc10,10($out)
+	extrwi	$acc12,$s3,8,0
+	stb	$s2,11($out)
+	extrwi	$acc13,$s3,8,8
+	stb	$acc12,12($out)
+	extrwi	$acc14,$s3,8,16
+	stb	$acc13,13($out)
+	stb	$acc14,14($out)
+	stb	$s3,15($out)
+
+Ldec_done:
+	$POP	r0,`$FRAME+$LRSAVE`($sp)
 	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
 	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
 	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
@@ -737,18 +906,21 @@ Lenc_compact_done:
 	mtlr	r0
 	addi	$sp,$sp,$FRAME
 	blr
+	.long	0
+	.byte	0,12,4,1,0x80,18,3,0
+	.long	0
 
 .align	5
 Lppc_AES_decrypt:
 	lwz	$acc00,240($key)
-	lwz	$t0,0($key)
-	lwz	$t1,4($key)
-	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	addi	$Tbl1,$Tbl0,3
+	lwz	$t0,0($key)
 	addi	$Tbl2,$Tbl0,2
+	lwz	$t1,4($key)
 	addi	$Tbl3,$Tbl0,1
+	lwz	$t2,8($key)
 	addi	$acc00,$acc00,-1
+	lwz	$t3,12($key)
 	addi	$key,$key,16
 	xor	$s0,$s0,$t0
 	xor	$s1,$s1,$t1
@@ -762,44 +934,44 @@ Ldec_loop:
 	rlwinm	$acc02,$s2,`32-24+3`,21,28
 	rlwinm	$acc03,$s3,`32-24+3`,21,28
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	rlwinm	$acc04,$s3,`32-16+3`,21,28
+	lwz	$t1,4($key)
 	rlwinm	$acc05,$s0,`32-16+3`,21,28
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	rlwinm	$acc06,$s1,`32-16+3`,21,28
+	lwz	$t3,12($key)
 	rlwinm	$acc07,$s2,`32-16+3`,21,28
 	lwzx	$acc00,$Tbl0,$acc00
-	lwzx	$acc01,$Tbl0,$acc01
 	rlwinm	$acc08,$s2,`32-8+3`,21,28
+	lwzx	$acc01,$Tbl0,$acc01
 	rlwinm	$acc09,$s3,`32-8+3`,21,28
 	lwzx	$acc02,$Tbl0,$acc02
-	lwzx	$acc03,$Tbl0,$acc03
 	rlwinm	$acc10,$s0,`32-8+3`,21,28
+	lwzx	$acc03,$Tbl0,$acc03
 	rlwinm	$acc11,$s1,`32-8+3`,21,28
 	lwzx	$acc04,$Tbl1,$acc04
-	lwzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc12,$s1,`0+3`,21,28
+	lwzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc13,$s2,`0+3`,21,28
 	lwzx	$acc06,$Tbl1,$acc06
-	lwzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc14,$s3,`0+3`,21,28
+	lwzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc15,$s0,`0+3`,21,28
 	lwzx	$acc08,$Tbl2,$acc08
-	lwzx	$acc09,$Tbl2,$acc09
 	xor	$t0,$t0,$acc00
+	lwzx	$acc09,$Tbl2,$acc09
 	xor	$t1,$t1,$acc01
 	lwzx	$acc10,$Tbl2,$acc10
-	lwzx	$acc11,$Tbl2,$acc11
 	xor	$t2,$t2,$acc02
+	lwzx	$acc11,$Tbl2,$acc11
 	xor	$t3,$t3,$acc03
 	lwzx	$acc12,$Tbl3,$acc12
-	lwzx	$acc13,$Tbl3,$acc13
 	xor	$t0,$t0,$acc04
+	lwzx	$acc13,$Tbl3,$acc13
 	xor	$t1,$t1,$acc05
 	lwzx	$acc14,$Tbl3,$acc14
-	lwzx	$acc15,$Tbl3,$acc15
 	xor	$t2,$t2,$acc06
+	lwzx	$acc15,$Tbl3,$acc15
 	xor	$t3,$t3,$acc07
 	xor	$t0,$t0,$acc08
 	xor	$t1,$t1,$acc09
@@ -815,56 +987,56 @@ Ldec_loop:
 	addi	$Tbl2,$Tbl0,2048
 	nop
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	rlwinm	$acc00,$s0,`32-24`,24,31
+	lwz	$t1,4($key)
 	rlwinm	$acc01,$s1,`32-24`,24,31
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	rlwinm	$acc02,$s2,`32-24`,24,31
+	lwz	$t3,12($key)
 	rlwinm	$acc03,$s3,`32-24`,24,31
 	lwz	$acc08,`2048+0`($Tbl0)	! prefetch Td4
-	lwz	$acc09,`2048+32`($Tbl0)
 	rlwinm	$acc04,$s3,`32-16`,24,31
+	lwz	$acc09,`2048+32`($Tbl0)
 	rlwinm	$acc05,$s0,`32-16`,24,31
 	lwz	$acc10,`2048+64`($Tbl0)
-	lwz	$acc11,`2048+96`($Tbl0)
 	lbzx	$acc00,$Tbl2,$acc00
+	lwz	$acc11,`2048+96`($Tbl0)
 	lbzx	$acc01,$Tbl2,$acc01
 	lwz	$acc12,`2048+128`($Tbl0)
-	lwz	$acc13,`2048+160`($Tbl0)
 	rlwinm	$acc06,$s1,`32-16`,24,31
+	lwz	$acc13,`2048+160`($Tbl0)
 	rlwinm	$acc07,$s2,`32-16`,24,31
 	lwz	$acc14,`2048+192`($Tbl0)
-	lwz	$acc15,`2048+224`($Tbl0)
 	rlwinm	$acc08,$s2,`32-8`,24,31
+	lwz	$acc15,`2048+224`($Tbl0)
 	rlwinm	$acc09,$s3,`32-8`,24,31
 	lbzx	$acc02,$Tbl2,$acc02
-	lbzx	$acc03,$Tbl2,$acc03
 	rlwinm	$acc10,$s0,`32-8`,24,31
+	lbzx	$acc03,$Tbl2,$acc03
 	rlwinm	$acc11,$s1,`32-8`,24,31
 	lbzx	$acc04,$Tbl2,$acc04
-	lbzx	$acc05,$Tbl2,$acc05
 	rlwinm	$acc12,$s1,`0`,24,31
+	lbzx	$acc05,$Tbl2,$acc05
 	rlwinm	$acc13,$s2,`0`,24,31
 	lbzx	$acc06,$Tbl2,$acc06
-	lbzx	$acc07,$Tbl2,$acc07
 	rlwinm	$acc14,$s3,`0`,24,31
+	lbzx	$acc07,$Tbl2,$acc07
 	rlwinm	$acc15,$s0,`0`,24,31
 	lbzx	$acc08,$Tbl2,$acc08
-	lbzx	$acc09,$Tbl2,$acc09
 	rlwinm	$s0,$acc00,24,0,7
+	lbzx	$acc09,$Tbl2,$acc09
 	rlwinm	$s1,$acc01,24,0,7
 	lbzx	$acc10,$Tbl2,$acc10
-	lbzx	$acc11,$Tbl2,$acc11
 	rlwinm	$s2,$acc02,24,0,7
+	lbzx	$acc11,$Tbl2,$acc11
 	rlwinm	$s3,$acc03,24,0,7
 	lbzx	$acc12,$Tbl2,$acc12
-	lbzx	$acc13,$Tbl2,$acc13
 	rlwimi	$s0,$acc04,16,8,15
+	lbzx	$acc13,$Tbl2,$acc13
 	rlwimi	$s1,$acc05,16,8,15
 	lbzx	$acc14,$Tbl2,$acc14
-	lbzx	$acc15,$Tbl2,$acc15
 	rlwimi	$s2,$acc06,16,8,15
+	lbzx	$acc15,$Tbl2,$acc15
 	rlwimi	$s3,$acc07,16,8,15
 	rlwimi	$s0,$acc08,8,16,23
 	rlwimi	$s1,$acc09,8,16,23
@@ -879,20 +1051,22 @@ Ldec_loop:
 	xor	$s2,$s2,$t2
 	xor	$s3,$s3,$t3
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .align	4
 Lppc_AES_decrypt_compact:
 	lwz	$acc00,240($key)
-	lwz	$t0,0($key)
-	lwz	$t1,4($key)
-	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	addi	$Tbl1,$Tbl0,2048
+	lwz	$t0,0($key)
 	lis	$mask80,0x8080
+	lwz	$t1,4($key)
 	lis	$mask1b,0x1b1b
-	addi	$key,$key,16
+	lwz	$t2,8($key)
 	ori	$mask80,$mask80,0x8080
+	lwz	$t3,12($key)
 	ori	$mask1b,$mask1b,0x1b1b
+	addi	$key,$key,16
 ___
 $code.=<<___ if ($SIZE_T==8);
 	insrdi	$mask80,$mask80,32,0
@@ -904,10 +1078,10 @@ $code.=<<___;
 Ldec_compact_loop:
 	xor	$s0,$s0,$t0
 	xor	$s1,$s1,$t1
-	xor	$s2,$s2,$t2
-	xor	$s3,$s3,$t3
 	rlwinm	$acc00,$s0,`32-24`,24,31
+	xor	$s2,$s2,$t2
 	rlwinm	$acc01,$s1,`32-24`,24,31
+	xor	$s3,$s3,$t3
 	rlwinm	$acc02,$s2,`32-24`,24,31
 	rlwinm	$acc03,$s3,`32-24`,24,31
 	rlwinm	$acc04,$s3,`32-16`,24,31
@@ -915,48 +1089,48 @@ Ldec_compact_loop:
 	rlwinm	$acc06,$s1,`32-16`,24,31
 	rlwinm	$acc07,$s2,`32-16`,24,31
 	lbzx	$acc00,$Tbl1,$acc00
-	lbzx	$acc01,$Tbl1,$acc01
 	rlwinm	$acc08,$s2,`32-8`,24,31
+	lbzx	$acc01,$Tbl1,$acc01
 	rlwinm	$acc09,$s3,`32-8`,24,31
 	lbzx	$acc02,$Tbl1,$acc02
-	lbzx	$acc03,$Tbl1,$acc03
 	rlwinm	$acc10,$s0,`32-8`,24,31
+	lbzx	$acc03,$Tbl1,$acc03
 	rlwinm	$acc11,$s1,`32-8`,24,31
 	lbzx	$acc04,$Tbl1,$acc04
-	lbzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc12,$s1,`0`,24,31
+	lbzx	$acc05,$Tbl1,$acc05
 	rlwinm	$acc13,$s2,`0`,24,31
 	lbzx	$acc06,$Tbl1,$acc06
-	lbzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc14,$s3,`0`,24,31
+	lbzx	$acc07,$Tbl1,$acc07
 	rlwinm	$acc15,$s0,`0`,24,31
 	lbzx	$acc08,$Tbl1,$acc08
-	lbzx	$acc09,$Tbl1,$acc09
 	rlwinm	$s0,$acc00,24,0,7
+	lbzx	$acc09,$Tbl1,$acc09
 	rlwinm	$s1,$acc01,24,0,7
 	lbzx	$acc10,$Tbl1,$acc10
-	lbzx	$acc11,$Tbl1,$acc11
 	rlwinm	$s2,$acc02,24,0,7
+	lbzx	$acc11,$Tbl1,$acc11
 	rlwinm	$s3,$acc03,24,0,7
 	lbzx	$acc12,$Tbl1,$acc12
-	lbzx	$acc13,$Tbl1,$acc13
 	rlwimi	$s0,$acc04,16,8,15
+	lbzx	$acc13,$Tbl1,$acc13
 	rlwimi	$s1,$acc05,16,8,15
 	lbzx	$acc14,$Tbl1,$acc14
-	lbzx	$acc15,$Tbl1,$acc15
 	rlwimi	$s2,$acc06,16,8,15
+	lbzx	$acc15,$Tbl1,$acc15
 	rlwimi	$s3,$acc07,16,8,15
 	rlwimi	$s0,$acc08,8,16,23
 	rlwimi	$s1,$acc09,8,16,23
 	rlwimi	$s2,$acc10,8,16,23
 	rlwimi	$s3,$acc11,8,16,23
 	lwz	$t0,0($key)
-	lwz	$t1,4($key)
 	or	$s0,$s0,$acc12
+	lwz	$t1,4($key)
 	or	$s1,$s1,$acc13
 	lwz	$t2,8($key)
-	lwz	$t3,12($key)
 	or	$s2,$s2,$acc14
+	lwz	$t3,12($key)
 	or	$s3,$s3,$acc15
 
 	addi	$key,$key,16
@@ -1030,12 +1204,12 @@ $code.=<<___ if ($SIZE_T==4);
 	and	$acc02,$s2,$mask80
 	and	$acc03,$s3,$mask80
 	srwi	$acc04,$acc00,7		# r1>>7
-	srwi	$acc05,$acc01,7
-	srwi	$acc06,$acc02,7
-	srwi	$acc07,$acc03,7
 	andc	$acc08,$s0,$mask80	# r0&0x7f7f7f7f
+	srwi	$acc05,$acc01,7
 	andc	$acc09,$s1,$mask80
+	srwi	$acc06,$acc02,7
 	andc	$acc10,$s2,$mask80
+	srwi	$acc07,$acc03,7
 	andc	$acc11,$s3,$mask80
 	sub	$acc00,$acc00,$acc04	# r1-(r1>>7)
 	sub	$acc01,$acc01,$acc05
@@ -1059,12 +1233,12 @@ $code.=<<___ if ($SIZE_T==4);
 	and	$acc06,$acc02,$mask80
 	and	$acc07,$acc03,$mask80
 	srwi	$acc08,$acc04,7		# r1>>7
-	srwi	$acc09,$acc05,7
-	srwi	$acc10,$acc06,7
-	srwi	$acc11,$acc07,7
 	andc	$acc12,$acc00,$mask80	# r2&0x7f7f7f7f
+	srwi	$acc09,$acc05,7
 	andc	$acc13,$acc01,$mask80
+	srwi	$acc10,$acc06,7
 	andc	$acc14,$acc02,$mask80
+	srwi	$acc11,$acc07,7
 	andc	$acc15,$acc03,$mask80
 	sub	$acc04,$acc04,$acc08	# r1-(r1>>7)
 	sub	$acc05,$acc05,$acc09
@@ -1085,13 +1259,13 @@ $code.=<<___ if ($SIZE_T==4);
 
 	and	$acc08,$acc04,$mask80	# r1=r4&0x80808080
 	and	$acc09,$acc05,$mask80
-	and	$acc10,$acc06,$mask80
-	and	$acc11,$acc07,$mask80
 	srwi	$acc12,$acc08,7		# r1>>7
+	and	$acc10,$acc06,$mask80
 	srwi	$acc13,$acc09,7
+	and	$acc11,$acc07,$mask80
 	srwi	$acc14,$acc10,7
-	srwi	$acc15,$acc11,7
 	sub	$acc08,$acc08,$acc12	# r1-(r1>>7)
+	srwi	$acc15,$acc11,7
 	sub	$acc09,$acc09,$acc13
 	sub	$acc10,$acc10,$acc14
 	sub	$acc11,$acc11,$acc15
@@ -1124,10 +1298,10 @@ ___
 $code.=<<___;
 	rotrwi	$s0,$s0,8		# = ROTATE(r0,8)
 	rotrwi	$s1,$s1,8
-	rotrwi	$s2,$s2,8
-	rotrwi	$s3,$s3,8
 	xor	$s0,$s0,$acc00		# ^= r2^r0
+	rotrwi	$s2,$s2,8
 	xor	$s1,$s1,$acc01
+	rotrwi	$s3,$s3,8
 	xor	$s2,$s2,$acc02
 	xor	$s3,$s3,$acc03
 	xor	$acc00,$acc00,$acc08
@@ -1135,32 +1309,32 @@ $code.=<<___;
 	xor	$acc02,$acc02,$acc10
 	xor	$acc03,$acc03,$acc11
 	xor	$s0,$s0,$acc04		# ^= r4^r0
-	xor	$s1,$s1,$acc05
-	xor	$s2,$s2,$acc06
-	xor	$s3,$s3,$acc07
 	rotrwi	$acc00,$acc00,24
+	xor	$s1,$s1,$acc05
 	rotrwi	$acc01,$acc01,24
+	xor	$s2,$s2,$acc06
 	rotrwi	$acc02,$acc02,24
+	xor	$s3,$s3,$acc07
 	rotrwi	$acc03,$acc03,24
 	xor	$acc04,$acc04,$acc08
 	xor	$acc05,$acc05,$acc09
 	xor	$acc06,$acc06,$acc10
 	xor	$acc07,$acc07,$acc11
 	xor	$s0,$s0,$acc08		# ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
-	xor	$s1,$s1,$acc09
-	xor	$s2,$s2,$acc10
-	xor	$s3,$s3,$acc11
 	rotrwi	$acc04,$acc04,16
+	xor	$s1,$s1,$acc09
 	rotrwi	$acc05,$acc05,16
+	xor	$s2,$s2,$acc10
 	rotrwi	$acc06,$acc06,16
+	xor	$s3,$s3,$acc11
 	rotrwi	$acc07,$acc07,16
 	xor	$s0,$s0,$acc00		# ^= ROTATE(r8^r2^r0,24)
-	xor	$s1,$s1,$acc01
-	xor	$s2,$s2,$acc02
-	xor	$s3,$s3,$acc03
 	rotrwi	$acc08,$acc08,8
+	xor	$s1,$s1,$acc01
 	rotrwi	$acc09,$acc09,8
+	xor	$s2,$s2,$acc02
 	rotrwi	$acc10,$acc10,8
+	xor	$s3,$s3,$acc03
 	rotrwi	$acc11,$acc11,8
 	xor	$s0,$s0,$acc04		# ^= ROTATE(r8^r4^r0,16)
 	xor	$s1,$s1,$acc05
@@ -1179,7 +1353,9 @@ Ldec_compact_done:
 	xor	$s2,$s2,$t2
 	xor	$s3,$s3,$t3
 	blr
-.long	0
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+
 .asciz	"AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
 .align	7
 ___
diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl
index 7e01889298..445a1e6762 100644
--- a/src/lib/libcrypto/aes/asm/aes-s390x.pl
+++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl
@@ -44,12 +44,57 @@
 # Unlike previous version hardware support detection takes place only
 # at the moment of key schedule setup, which is denoted in key->rounds.
 # This is done, because deferred key setup can't be made MT-safe, not
-# for key lengthes longer than 128 bits.
+# for keys longer than 128 bits.
 #
 # Add AES_cbc_encrypt, which gives incredible performance improvement,
 # it was measured to be ~6.6x. It's less than previously mentioned 8x,
 # because software implementation was optimized.
 
+# May 2010.
+#
+# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x
+# performance improvement over "generic" counter mode routine relying
+# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers
+# to the fact that exact throughput value depends on current stack
+# frame alignment within 4KB page. In worst case you get ~75% of the
+# maximum, but *on average* it would be as much as ~98%. Meaning that
+# worst case is unlike, it's like hitting ravine on plateau.
+
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2x better than code generated by gcc 4.3.
+
+# December 2010.
+#
+# Add support for z196 "cipher message with counter" instruction.
+# Note however that it's disengaged, because it was measured to
+# perform ~12% worse than vanilla km-based code...
+
+# February 2011.
+#
+# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes
+# instructions, which deliver ~70% improvement at 8KB block size over
+# vanilla km-based code, 37% - at most like 512-bytes block size.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
 $softonly=0;	# allow hardware support
 
 $t0="%r0";	$mask="%r0";
@@ -69,6 +114,8 @@ $rounds="%r13";
 $ra="%r14";
 $sp="%r15";
 
+$stdframe=16*$SIZE_T+4*8;
+
 sub _data_word()
 { my $i;
     while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -210,7 +257,7 @@ $code.=<<___ if (!$softonly);
 .Lesoft:
 ___
 $code.=<<___;
-	stmg	%r3,$ra,24($sp)
+	stm${g}	%r3,$ra,3*$SIZE_T($sp)
 
 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
@@ -220,20 +267,20 @@ $code.=<<___;
 	larl	$tbl,AES_Te
 	bras	$ra,_s390x_AES_encrypt
 
-	lg	$out,24($sp)
+	l${g}	$out,3*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
 	st	$s3,12($out)
 
-	lmg	%r6,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	br	$ra
 .size	AES_encrypt,.-AES_encrypt
 
 .type   _s390x_AES_encrypt,\@function
 .align	16
 _s390x_AES_encrypt:
-	stg	$ra,152($sp)
+	st${g}	$ra,15*$SIZE_T($sp)
 	x	$s0,0($key)
 	x	$s1,4($key)
 	x	$s2,8($key)
@@ -397,7 +444,7 @@ _s390x_AES_encrypt:
 	or	$s2,$i3
 	or	$s3,$t3
 
-	lg	$ra,152($sp)
+	l${g}	$ra,15*$SIZE_T($sp)
 	xr	$s0,$t0
 	xr	$s1,$t2
 	x	$s2,24($key)
@@ -536,7 +583,7 @@ $code.=<<___ if (!$softonly);
 .Ldsoft:
 ___
 $code.=<<___;
-	stmg	%r3,$ra,24($sp)
+	stm${g}	%r3,$ra,3*$SIZE_T($sp)
 
 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
@@ -546,20 +593,20 @@ $code.=<<___;
 	larl	$tbl,AES_Td
 	bras	$ra,_s390x_AES_decrypt
 
-	lg	$out,24($sp)
+	l${g}	$out,3*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
 	st	$s3,12($out)
 
-	lmg	%r6,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	br	$ra
 .size	AES_decrypt,.-AES_decrypt
 
 .type   _s390x_AES_decrypt,\@function
 .align	16
 _s390x_AES_decrypt:
-	stg	$ra,152($sp)
+	st${g}	$ra,15*$SIZE_T($sp)
 	x	$s0,0($key)
 	x	$s1,4($key)
 	x	$s2,8($key)
@@ -703,7 +750,7 @@ _s390x_AES_decrypt:
 	nr	$i1,$mask
 	nr	$i2,$mask
 
-	lg	$ra,152($sp)
+	l${g}	$ra,15*$SIZE_T($sp)
 	or	$s1,$t1
 	l	$t0,16($key)
 	l	$t1,20($key)
@@ -732,14 +779,15 @@ ___
 $code.=<<___;
 # void AES_set_encrypt_key(const unsigned char *in, int bits,
 # 		 AES_KEY *key) {
-.globl	AES_set_encrypt_key
-.type	AES_set_encrypt_key,\@function
+.globl	private_AES_set_encrypt_key
+.type	private_AES_set_encrypt_key,\@function
 .align	16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
+_s390x_AES_set_encrypt_key:
 	lghi	$t0,0
-	clgr	$inp,$t0
+	cl${g}r	$inp,$t0
 	je	.Lminus1
-	clgr	$key,$t0
+	cl${g}r	$key,$t0
 	je	.Lminus1
 
 	lghi	$t0,128
@@ -789,7 +837,8 @@ $code.=<<___ if (!$softonly);
 	je	1f
 	lg	%r1,24($inp)
 	stg	%r1,24($key)
-1:	st	$bits,236($key)	# save bits
+1:	st	$bits,236($key)	# save bits [for debugging purposes]
+	lgr	$t0,%r5
 	st	%r5,240($key)	# save km code
 	lghi	%r2,0
 	br	%r14
@@ -797,7 +846,7 @@ ___
 $code.=<<___;
 .align	16
 .Lekey_internal:
-	stmg	%r6,%r13,48($sp)	# all non-volatile regs
+	stm${g}	%r4,%r13,4*$SIZE_T($sp)	# all non-volatile regs and $key
 
 	larl	$tbl,AES_Te+2048
 
@@ -857,8 +906,9 @@ $code.=<<___;
 	la	$key,16($key)		# key+=4
 	la	$t3,4($t3)		# i++
 	brct	$rounds,.L128_loop
+	lghi	$t0,10
 	lghi	%r2,0
-	lmg	%r6,%r13,48($sp)
+	lm${g}	%r4,%r13,4*$SIZE_T($sp)
 	br	$ra
 
 .align	16
@@ -905,8 +955,9 @@ $code.=<<___;
 	st	$s2,32($key)
 	st	$s3,36($key)
 	brct	$rounds,.L192_continue
+	lghi	$t0,12
 	lghi	%r2,0
-	lmg	%r6,%r13,48($sp)
+	lm${g}	%r4,%r13,4*$SIZE_T($sp)
 	br	$ra
 
 .align	16
@@ -967,8 +1018,9 @@ $code.=<<___;
 	st	$s2,40($key)
 	st	$s3,44($key)
 	brct	$rounds,.L256_continue
+	lghi	$t0,14
 	lghi	%r2,0
-	lmg	%r6,%r13,48($sp)
+	lm${g}	%r4,%r13,4*$SIZE_T($sp)
 	br	$ra
 
 .align	16
@@ -1011,42 +1063,34 @@ $code.=<<___;
 .Lminus1:
 	lghi	%r2,-1
 	br	$ra
-.size	AES_set_encrypt_key,.-AES_set_encrypt_key
+.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
 
 # void AES_set_decrypt_key(const unsigned char *in, int bits,
 # 		 AES_KEY *key) {
-.globl	AES_set_decrypt_key
-.type	AES_set_decrypt_key,\@function
+.globl	private_AES_set_decrypt_key
+.type	private_AES_set_decrypt_key,\@function
 .align	16
-AES_set_decrypt_key:
-	stg	$key,32($sp)		# I rely on AES_set_encrypt_key to
-	stg	$ra,112($sp)		# save non-volatile registers!
-	bras	$ra,AES_set_encrypt_key
-	lg	$key,32($sp)
-	lg	$ra,112($sp)
+private_AES_set_decrypt_key:
+	#st${g}	$key,4*$SIZE_T($sp)	# I rely on AES_set_encrypt_key to
+	st${g}	$ra,14*$SIZE_T($sp)	# save non-volatile registers and $key!
+	bras	$ra,_s390x_AES_set_encrypt_key
+	#l${g}	$key,4*$SIZE_T($sp)
+	l${g}	$ra,14*$SIZE_T($sp)
 	ltgr	%r2,%r2
 	bnzr	$ra
 ___
 $code.=<<___ if (!$softonly);
-	l	$t0,240($key)
+	#l	$t0,240($key)
 	lhi	$t1,16
 	cr	$t0,$t1
 	jl	.Lgo
 	oill	$t0,0x80	# set "decrypt" bit
 	st	$t0,240($key)
 	br	$ra
-
-.align	16
-.Ldkey_internal:
-	stg	$key,32($sp)
-	stg	$ra,40($sp)
-	bras	$ra,.Lekey_internal
-	lg	$key,32($sp)
-	lg	$ra,40($sp)
 ___
 $code.=<<___;
-
-.Lgo:	llgf	$rounds,240($key)
+.align	16
+.Lgo:	lgr	$rounds,$t0	#llgf	$rounds,240($key)
 	la	$i1,0($key)
 	sllg	$i2,$rounds,4
 	la	$i2,0($i2,$key)
@@ -1123,13 +1167,14 @@ $code.=<<___;
 	la	$key,4($key)
 	brct	$rounds,.Lmix
 
-	lmg	%r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
+	lm${g}	%r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
 	lghi	%r2,0
 	br	$ra
-.size	AES_set_decrypt_key,.-AES_set_decrypt_key
+.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
 ___
 
-#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
+########################################################################
+# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
 #                     size_t length, const AES_KEY *key,
 #                     unsigned char *ivec, const int enc)
 {
@@ -1163,7 +1208,7 @@ $code.=<<___ if (!$softonly);
 	l	%r0,240($key)	# load kmc code
 	lghi	$key,15		# res=len%16, len-=res;
 	ngr	$key,$len
-	slgr	$len,$key
+	sl${g}r	$len,$key
 	la	%r1,16($sp)	# parameter block - ivec || key
 	jz	.Lkmc_truncated
 	.long	0xb92f0042	# kmc %r4,%r2
@@ -1181,34 +1226,34 @@ $code.=<<___ if (!$softonly);
 	tmll	%r0,0x80
 	jnz	.Lkmc_truncated_dec
 	lghi	%r1,0
-	stg	%r1,128($sp)
-	stg	%r1,136($sp)
+	stg	%r1,16*$SIZE_T($sp)
+	stg	%r1,16*$SIZE_T+8($sp)
 	bras	%r1,1f
-	mvc	128(1,$sp),0($inp)
+	mvc	16*$SIZE_T(1,$sp),0($inp)
 1:	ex	$key,0(%r1)
 	la	%r1,16($sp)	# restore parameter block
-	la	$inp,128($sp)
+	la	$inp,16*$SIZE_T($sp)
 	lghi	$len,16
 	.long	0xb92f0042	# kmc %r4,%r2
 	j	.Lkmc_done
 .align	16
 .Lkmc_truncated_dec:
-	stg	$out,64($sp)
-	la	$out,128($sp)
+	st${g}	$out,4*$SIZE_T($sp)
+	la	$out,16*$SIZE_T($sp)
 	lghi	$len,16
 	.long	0xb92f0042	# kmc %r4,%r2
-	lg	$out,64($sp)
+	l${g}	$out,4*$SIZE_T($sp)
 	bras	%r1,2f
-	mvc	0(1,$out),128($sp)
+	mvc	0(1,$out),16*$SIZE_T($sp)
 2:	ex	$key,0(%r1)
 	j	.Lkmc_done
 .align	16
 .Lcbc_software:
 ___
 $code.=<<___;
-	stmg	$key,$ra,40($sp)
+	stm${g}	$key,$ra,5*$SIZE_T($sp)
 	lhi	%r0,0
-	cl	%r0,164($sp)
+	cl	%r0,`$stdframe+$SIZE_T-4`($sp)
 	je	.Lcbc_decrypt
 
 	larl	$tbl,AES_Te
@@ -1219,10 +1264,10 @@ $code.=<<___;
 	llgf	$s3,12($ivp)
 
 	lghi	$t0,16
-	slgr	$len,$t0
+	sl${g}r	$len,$t0
 	brc	4,.Lcbc_enc_tail	# if borrow
 .Lcbc_enc_loop:
-	stmg	$inp,$out,16($sp)
+	stm${g}	$inp,$out,2*$SIZE_T($sp)
 	x	$s0,0($inp)
 	x	$s1,4($inp)
 	x	$s2,8($inp)
@@ -1231,7 +1276,7 @@ $code.=<<___;
 
 	bras	$ra,_s390x_AES_encrypt
 
-	lmg	$inp,$key,16($sp)
+	lm${g}	$inp,$key,2*$SIZE_T($sp)
 	st	$s0,0($out)
 	st	$s1,4($out)
 	st	$s2,8($out)
@@ -1240,33 +1285,33 @@ $code.=<<___;
 	la	$inp,16($inp)
 	la	$out,16($out)
 	lghi	$t0,16
-	ltgr	$len,$len
+	lt${g}r	$len,$len
 	jz	.Lcbc_enc_done
-	slgr	$len,$t0
+	sl${g}r	$len,$t0
 	brc	4,.Lcbc_enc_tail	# if borrow
 	j	.Lcbc_enc_loop
 .align	16
 .Lcbc_enc_done:
-	lg	$ivp,48($sp)
+	l${g}	$ivp,6*$SIZE_T($sp)
 	st	$s0,0($ivp)
 	st	$s1,4($ivp)	
 	st	$s2,8($ivp)
 	st	$s3,12($ivp)
 
-	lmg	%r7,$ra,56($sp)
+	lm${g}	%r7,$ra,7*$SIZE_T($sp)
 	br	$ra
 
 .align	16
 .Lcbc_enc_tail:
 	aghi	$len,15
 	lghi	$t0,0
-	stg	$t0,128($sp)
-	stg	$t0,136($sp)
+	stg	$t0,16*$SIZE_T($sp)
+	stg	$t0,16*$SIZE_T+8($sp)
 	bras	$t1,3f
-	mvc	128(1,$sp),0($inp)
+	mvc	16*$SIZE_T(1,$sp),0($inp)
 3:	ex	$len,0($t1)
 	lghi	$len,0
-	la	$inp,128($sp)
+	la	$inp,16*$SIZE_T($sp)
 	j	.Lcbc_enc_loop
 
 .align	16
@@ -1275,10 +1320,10 @@ $code.=<<___;
 
 	lg	$t0,0($ivp)
 	lg	$t1,8($ivp)
-	stmg	$t0,$t1,128($sp)
+	stmg	$t0,$t1,16*$SIZE_T($sp)
 
 .Lcbc_dec_loop:
-	stmg	$inp,$out,16($sp)
+	stm${g}	$inp,$out,2*$SIZE_T($sp)
 	llgf	$s0,0($inp)
 	llgf	$s1,4($inp)
 	llgf	$s2,8($inp)
@@ -1287,7 +1332,7 @@ $code.=<<___;
 
 	bras	$ra,_s390x_AES_decrypt
 
-	lmg	$inp,$key,16($sp)
+	lm${g}	$inp,$key,2*$SIZE_T($sp)
 	sllg	$s0,$s0,32
 	sllg	$s2,$s2,32
 	lr	$s0,$s1
@@ -1295,15 +1340,15 @@ $code.=<<___;
 
 	lg	$t0,0($inp)
 	lg	$t1,8($inp)
-	xg	$s0,128($sp)
-	xg	$s2,136($sp)
+	xg	$s0,16*$SIZE_T($sp)
+	xg	$s2,16*$SIZE_T+8($sp)
 	lghi	$s1,16
-	slgr	$len,$s1
+	sl${g}r	$len,$s1
 	brc	4,.Lcbc_dec_tail	# if borrow
 	brc	2,.Lcbc_dec_done	# if zero
 	stg	$s0,0($out)
 	stg	$s2,8($out)
-	stmg	$t0,$t1,128($sp)
+	stmg	$t0,$t1,16*$SIZE_T($sp)
 
 	la	$inp,16($inp)
 	la	$out,16($out)
@@ -1313,7 +1358,7 @@ $code.=<<___;
 	stg	$s0,0($out)
 	stg	$s2,8($out)
 .Lcbc_dec_exit:
-	lmg	$ivp,$ra,48($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
 	stmg	$t0,$t1,0($ivp)
 
 	br	$ra
@@ -1321,19 +1366,889 @@ $code.=<<___;
 .align	16
 .Lcbc_dec_tail:
 	aghi	$len,15
-	stg	$s0,128($sp)
-	stg	$s2,136($sp)
+	stg	$s0,16*$SIZE_T($sp)
+	stg	$s2,16*$SIZE_T+8($sp)
 	bras	$s1,4f
-	mvc	0(1,$out),128($sp)
+	mvc	0(1,$out),16*$SIZE_T($sp)
 4:	ex	$len,0($s1)
 	j	.Lcbc_dec_exit
 .size	AES_cbc_encrypt,.-AES_cbc_encrypt
-.comm  OPENSSL_s390xcap_P,8,8
+___
+}
+########################################################################
+# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+#                     size_t blocks, const AES_KEY *key,
+#                     const unsigned char *ivec)
+{
+my $inp="%r2";
+my $out="%r4";	# blocks and out are swapped
+my $len="%r3";
+my $key="%r5";	my $iv0="%r5";
+my $ivp="%r6";
+my $fp ="%r7";
+
+$code.=<<___;
+.globl	AES_ctr32_encrypt
+.type	AES_ctr32_encrypt,\@function
+.align	16
+AES_ctr32_encrypt:
+	xgr	%r3,%r4		# flip %r3 and %r4, $out and $len
+	xgr	%r4,%r3
+	xgr	%r3,%r4
+	llgfr	$len,$len	# safe in ctr32 subroutine even in 64-bit case
+___
+$code.=<<___ if (!$softonly);
+	l	%r0,240($key)
+	lhi	%r1,16
+	clr	%r0,%r1
+	jl	.Lctr32_software
+
+	stm${g}	%r6,$s3,6*$SIZE_T($sp)
+
+	slgr	$out,$inp
+	la	%r1,0($key)	# %r1 is permanent copy of $key
+	lg	$iv0,0($ivp)	# load ivec
+	lg	$ivp,8($ivp)
+
+	# prepare and allocate stack frame at the top of 4K page
+	# with 1K reserved for eventual signal handling
+	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
+	lghi	$s1,-4096
+	algr	$s0,$sp
+	lgr	$fp,$sp
+	ngr	$s0,$s1		# align at page boundary
+	slgr	$fp,$s0		# total buffer size
+	lgr	$s2,$sp
+	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
+	slgr	$fp,$s1		# deduct reservation to get usable buffer size
+	# buffer size is at lest 256 and at most 3072+256-16
+
+	la	$sp,1024($s0)	# alloca
+	srlg	$fp,$fp,4	# convert bytes to blocks, minimum 16
+	st${g}	$s2,0($sp)	# back-chain
+	st${g}	$fp,$SIZE_T($sp)
+
+	slgr	$len,$fp
+	brc	1,.Lctr32_hw_switch	# not zero, no borrow
+	algr	$fp,$len	# input is shorter than allocated buffer
+	lghi	$len,0
+	st${g}	$fp,$SIZE_T($sp)
+
+.Lctr32_hw_switch:
+___
+$code.=<<___ if (0);	######### kmctr code was measured to be ~12% slower
+	larl	$s0,OPENSSL_s390xcap_P
+	lg	$s0,8($s0)
+	tmhh	$s0,0x0004	# check for message_security-assist-4
+	jz	.Lctr32_km_loop
+
+	llgfr	$s0,%r0
+	lgr	$s1,%r1
+	lghi	%r0,0
+	la	%r1,16($sp)
+	.long	0xb92d2042	# kmctr %r4,%r2,%r2
+
+	llihh	%r0,0x8000	# check if kmctr supports the function code
+	srlg	%r0,%r0,0($s0)
+	ng	%r0,16($sp)
+	lgr	%r0,$s0
+	lgr	%r1,$s1
+	jz	.Lctr32_km_loop
+
+####### kmctr code
+	algr	$out,$inp	# restore $out
+	lgr	$s1,$len	# $s1 undertakes $len
+	j	.Lctr32_kmctr_loop
+.align	16
+.Lctr32_kmctr_loop:
+	la	$s2,16($sp)
+	lgr	$s3,$fp
+.Lctr32_kmctr_prepare:
+	stg	$iv0,0($s2)
+	stg	$ivp,8($s2)
+	la	$s2,16($s2)
+	ahi	$ivp,1		# 32-bit increment, preserves upper half
+	brct	$s3,.Lctr32_kmctr_prepare
+
+	#la	$inp,0($inp)	# inp
+	sllg	$len,$fp,4	# len
+	#la	$out,0($out)	# out
+	la	$s2,16($sp)	# iv
+	.long	0xb92da042	# kmctr $out,$s2,$inp
+	brc	1,.-4		# pay attention to "partial completion"
+
+	slgr	$s1,$fp
+	brc	1,.Lctr32_kmctr_loop	# not zero, no borrow
+	algr	$fp,$s1
+	lghi	$s1,0
+	brc	4+1,.Lctr32_kmctr_loop	# not zero
+
+	l${g}	$sp,0($sp)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+___
+$code.=<<___;
+.Lctr32_km_loop:
+	la	$s2,16($sp)
+	lgr	$s3,$fp
+.Lctr32_km_prepare:
+	stg	$iv0,0($s2)
+	stg	$ivp,8($s2)
+	la	$s2,16($s2)
+	ahi	$ivp,1		# 32-bit increment, preserves upper half
+	brct	$s3,.Lctr32_km_prepare
+
+	la	$s0,16($sp)	# inp
+	sllg	$s1,$fp,4	# len
+	la	$s2,16($sp)	# out
+	.long	0xb92e00a8	# km %r10,%r8
+	brc	1,.-4		# pay attention to "partial completion"
+
+	la	$s2,16($sp)
+	lgr	$s3,$fp
+	slgr	$s2,$inp
+.Lctr32_km_xor:
+	lg	$s0,0($inp)
+	lg	$s1,8($inp)
+	xg	$s0,0($s2,$inp)
+	xg	$s1,8($s2,$inp)
+	stg	$s0,0($out,$inp)
+	stg	$s1,8($out,$inp)
+	la	$inp,16($inp)
+	brct	$s3,.Lctr32_km_xor
+
+	slgr	$len,$fp
+	brc	1,.Lctr32_km_loop	# not zero, no borrow
+	algr	$fp,$len
+	lghi	$len,0
+	brc	4+1,.Lctr32_km_loop	# not zero
+
+	l${g}	$s0,0($sp)
+	l${g}	$s1,$SIZE_T($sp)
+	la	$s2,16($sp)
+.Lctr32_km_zap:
+	stg	$s0,0($s2)
+	stg	$s0,8($s2)
+	la	$s2,16($s2)
+	brct	$s1,.Lctr32_km_zap
+
+	la	$sp,0($s0)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+.Lctr32_software:
+___
+$code.=<<___;
+	stm${g}	$key,$ra,5*$SIZE_T($sp)
+	sl${g}r	$inp,$out
+	larl	$tbl,AES_Te
+	llgf	$t1,12($ivp)
+
+.Lctr32_loop:
+	stm${g}	$inp,$out,2*$SIZE_T($sp)
+	llgf	$s0,0($ivp)
+	llgf	$s1,4($ivp)
+	llgf	$s2,8($ivp)
+	lgr	$s3,$t1
+	st	$t1,16*$SIZE_T($sp)
+	lgr	%r4,$key
+
+	bras	$ra,_s390x_AES_encrypt
+
+	lm${g}	$inp,$ivp,2*$SIZE_T($sp)
+	llgf	$t1,16*$SIZE_T($sp)
+	x	$s0,0($inp,$out)
+	x	$s1,4($inp,$out)
+	x	$s2,8($inp,$out)
+	x	$s3,12($inp,$out)
+	stm	$s0,$s3,0($out)
+
+	la	$out,16($out)
+	ahi	$t1,1		# 32-bit increment
+	brct	$len,.Lctr32_loop
+
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
+	br	$ra
+.size	AES_ctr32_encrypt,.-AES_ctr32_encrypt
+___
+}
+
+########################################################################
+# void AES_xts_encrypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,
+#	const unsigned char iv[16]);
+#
+{
+my $inp="%r2";
+my $out="%r4";	# len and out are swapped
+my $len="%r3";
+my $key1="%r5";	# $i1
+my $key2="%r6";	# $i2
+my $fp="%r7";	# $i3
+my $tweak=16*$SIZE_T+16;	# or $stdframe-16, bottom of the frame...
+
+$code.=<<___;
+.type	_s390x_xts_km,\@function
+.align	16
+_s390x_xts_km:
+___
+$code.=<<___ if(1);
+	llgfr	$s0,%r0			# put aside the function code
+	lghi	$s1,0x7f
+	nr	$s1,%r0
+	lghi	%r0,0			# query capability vector
+	la	%r1,2*$SIZE_T($sp)
+	.long	0xb92e0042		# km %r4,%r2
+	llihh	%r1,0x8000
+	srlg	%r1,%r1,32($s1)		# check for 32+function code
+	ng	%r1,2*$SIZE_T($sp)
+	lgr	%r0,$s0			# restore the function code
+	la	%r1,0($key1)		# restore $key1
+	jz	.Lxts_km_vanilla
+
+	lmg	$i2,$i3,$tweak($sp)	# put aside the tweak value
+	algr	$out,$inp
+
+	oill	%r0,32			# switch to xts function code
+	aghi	$s1,-18			#
+	sllg	$s1,$s1,3		# (function code - 18)*8, 0 or 16
+	la	%r1,$tweak-16($sp)
+	slgr	%r1,$s1			# parameter block position
+	lmg	$s0,$s3,0($key1)	# load 256 bits of key material,
+	stmg	$s0,$s3,0(%r1)		# and copy it to parameter block.
+					# yes, it contains junk and overlaps
+					# with the tweak in 128-bit case.
+					# it's done to avoid conditional
+					# branch.
+	stmg	$i2,$i3,$tweak($sp)	# "re-seat" the tweak value
+
+	.long	0xb92e0042		# km %r4,%r2
+	brc	1,.-4			# pay attention to "partial completion"
+
+	lrvg	$s0,$tweak+0($sp)	# load the last tweak
+	lrvg	$s1,$tweak+8($sp)
+	stmg	%r0,%r3,$tweak-32(%r1)	# wipe copy of the key
+
+	nill	%r0,0xffdf		# switch back to original function code
+	la	%r1,0($key1)		# restore pointer to $key1
+	slgr	$out,$inp
+
+	llgc	$len,2*$SIZE_T-1($sp)
+	nill	$len,0x0f		# $len%=16
+	br	$ra
+	
+.align	16
+.Lxts_km_vanilla:
+___
+$code.=<<___;
+	# prepare and allocate stack frame at the top of 4K page
+	# with 1K reserved for eventual signal handling
+	lghi	$s0,-1024-256-16# guarantee at least 256-bytes buffer
+	lghi	$s1,-4096
+	algr	$s0,$sp
+	lgr	$fp,$sp
+	ngr	$s0,$s1		# align at page boundary
+	slgr	$fp,$s0		# total buffer size
+	lgr	$s2,$sp
+	lghi	$s1,1024+16	# sl[g]fi is extended-immediate facility
+	slgr	$fp,$s1		# deduct reservation to get usable buffer size
+	# buffer size is at lest 256 and at most 3072+256-16
+
+	la	$sp,1024($s0)	# alloca
+	nill	$fp,0xfff0	# round to 16*n
+	st${g}	$s2,0($sp)	# back-chain
+	nill	$len,0xfff0	# redundant
+	st${g}	$fp,$SIZE_T($sp)
+
+	slgr	$len,$fp
+	brc	1,.Lxts_km_go	# not zero, no borrow
+	algr	$fp,$len	# input is shorter than allocated buffer
+	lghi	$len,0
+	st${g}	$fp,$SIZE_T($sp)
+
+.Lxts_km_go:
+	lrvg	$s0,$tweak+0($s2)	# load the tweak value in little-endian
+	lrvg	$s1,$tweak+8($s2)
+
+	la	$s2,16($sp)		# vector of ascending tweak values
+	slgr	$s2,$inp
+	srlg	$s3,$fp,4
+	j	.Lxts_km_start
+
+.Lxts_km_loop:
+	la	$s2,16($sp)
+	slgr	$s2,$inp
+	srlg	$s3,$fp,4
+.Lxts_km_prepare:
+	lghi	$i1,0x87
+	srag	$i2,$s1,63		# broadcast upper bit
+	ngr	$i1,$i2			# rem
+	srlg	$i2,$s0,63		# carry bit from lower half
+	sllg	$s0,$s0,1
+	sllg	$s1,$s1,1
+	xgr	$s0,$i1
+	ogr	$s1,$i2
+.Lxts_km_start:
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+	stg	$i1,0($s2,$inp)
+	stg	$i2,8($s2,$inp)
+	xg	$i1,0($inp)
+	xg	$i2,8($inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+	la	$inp,16($inp)
+	brct	$s3,.Lxts_km_prepare
+
+	slgr	$inp,$fp		# rewind $inp
+	la	$s2,0($out,$inp)
+	lgr	$s3,$fp
+	.long	0xb92e00aa		# km $s2,$s2
+	brc	1,.-4			# pay attention to "partial completion"
+
+	la	$s2,16($sp)
+	slgr	$s2,$inp
+	srlg	$s3,$fp,4
+.Lxts_km_xor:
+	lg	$i1,0($out,$inp)
+	lg	$i2,8($out,$inp)
+	xg	$i1,0($s2,$inp)
+	xg	$i2,8($s2,$inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+	la	$inp,16($inp)
+	brct	$s3,.Lxts_km_xor
+
+	slgr	$len,$fp
+	brc	1,.Lxts_km_loop		# not zero, no borrow
+	algr	$fp,$len
+	lghi	$len,0
+	brc	4+1,.Lxts_km_loop	# not zero
+
+	l${g}	$i1,0($sp)		# back-chain
+	llgf	$fp,`2*$SIZE_T-4`($sp)	# bytes used
+	la	$i2,16($sp)
+	srlg	$fp,$fp,4
+.Lxts_km_zap:
+	stg	$i1,0($i2)
+	stg	$i1,8($i2)
+	la	$i2,16($i2)
+	brct	$fp,.Lxts_km_zap
+
+	la	$sp,0($i1)
+	llgc	$len,2*$SIZE_T-1($i1)
+	nill	$len,0x0f		# $len%=16
+	bzr	$ra
+
+	# generate one more tweak...
+	lghi	$i1,0x87
+	srag	$i2,$s1,63		# broadcast upper bit
+	ngr	$i1,$i2			# rem
+	srlg	$i2,$s0,63		# carry bit from lower half
+	sllg	$s0,$s0,1
+	sllg	$s1,$s1,1
+	xgr	$s0,$i1
+	ogr	$s1,$i2
+
+	ltr	$len,$len		# clear zero flag
+	br	$ra
+.size	_s390x_xts_km,.-_s390x_xts_km
+
+.globl	AES_xts_encrypt
+.type	AES_xts_encrypt,\@function
+.align	16
+AES_xts_encrypt:
+	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
+	xgr	%r4,%r3
+	xgr	%r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+	llgfr	$len,$len
+___
+$code.=<<___;
+	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
+	srag	$len,$len,4		# formally wrong, because it expands
+					# sign byte, but who can afford asking
+					# to process more than 2^63-1 bytes?
+					# I use it, because it sets condition
+					# code...
+	bcr	8,$ra			# abort if zero (i.e. less than 16)
+___
+$code.=<<___ if (!$softonly);
+	llgf	%r0,240($key2)
+	lhi	%r1,16
+	clr	%r0,%r1
+	jl	.Lxts_enc_software
+
+	stm${g}	%r6,$s3,6*$SIZE_T($sp)
+	st${g}	$ra,14*$SIZE_T($sp)
+
+	sllg	$len,$len,4		# $len&=~15
+	slgr	$out,$inp
+
+	# generate the tweak value
+	l${g}	$s3,$stdframe($sp)	# pointer to iv
+	la	$s2,$tweak($sp)
+	lmg	$s0,$s1,0($s3)
+	lghi	$s3,16
+	stmg	$s0,$s1,0($s2)
+	la	%r1,0($key2)		# $key2 is not needed anymore
+	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
+	brc	1,.-4			# can this happen?
+
+	l	%r0,240($key1)
+	la	%r1,0($key1)		# $key1 is not needed anymore
+	bras	$ra,_s390x_xts_km
+	jz	.Lxts_enc_km_done
+
+	aghi	$inp,-16		# take one step back
+	la	$i3,0($out,$inp)	# put aside real $out
+.Lxts_enc_km_steal:
+	llgc	$i1,16($inp)
+	llgc	$i2,0($out,$inp)
+	stc	$i1,0($out,$inp)
+	stc	$i2,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_enc_km_steal
+
+	la	$s2,0($i3)
+	lghi	$s3,16
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+	xg	$i1,0($s2)
+	xg	$i2,8($s2)
+	stg	$i1,0($s2)
+	stg	$i2,8($s2)
+	.long	0xb92e00aa		# km $s2,$s2
+	brc	1,.-4			# can this happen?
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+	xg	$i1,0($i3)
+	xg	$i2,8($i3)
+	stg	$i1,0($i3)
+	stg	$i2,8($i3)
+
+.Lxts_enc_km_done:
+	l${g}	$ra,14*$SIZE_T($sp)
+	st${g}	$sp,$tweak($sp)		# wipe tweak
+	st${g}	$sp,$tweak($sp)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+.Lxts_enc_software:
+___
+$code.=<<___;
+	stm${g}	%r6,$ra,6*$SIZE_T($sp)
+
+	slgr	$out,$inp
+
+	xgr	$s0,$s0			# clear upper half
+	xgr	$s1,$s1
+	lrv	$s0,$stdframe+4($sp)	# load secno
+	lrv	$s1,$stdframe+0($sp)
+	xgr	$s2,$s2
+	xgr	$s3,$s3
+	stm${g}	%r2,%r5,2*$SIZE_T($sp)
+	la	$key,0($key2)
+	larl	$tbl,AES_Te
+	bras	$ra,_s390x_AES_encrypt	# generate the tweak
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	stm	$s0,$s3,$tweak($sp)	# save the tweak
+	j	.Lxts_enc_enter
+
+.align	16
+.Lxts_enc_loop:
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
+	stg	$s1,$tweak+0($sp)	# save the tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak+8($sp)
+	llgfr	$s3,$s3
+	la	$inp,16($inp)		# $inp+=16
+.Lxts_enc_enter:
+	x	$s0,0($inp)		# ^=*($inp)
+	x	$s1,4($inp)
+	x	$s2,8($inp)
+	x	$s3,12($inp)
+	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_encrypt
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	x	$s0,$tweak+0($sp)	# ^=tweak
+	x	$s1,$tweak+4($sp)
+	x	$s2,$tweak+8($sp)
+	x	$s3,$tweak+12($sp)
+	st	$s0,0($out,$inp)
+	st	$s1,4($out,$inp)
+	st	$s2,8($out,$inp)
+	st	$s3,12($out,$inp)
+	brct${g}	$len,.Lxts_enc_loop
+
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%16
+	jz	.Lxts_enc_done
+
+	la	$i3,0($inp,$out)	# put aside real $out
+.Lxts_enc_steal:
+	llgc	%r0,16($inp)
+	llgc	%r1,0($out,$inp)
+	stc	%r0,0($out,$inp)
+	stc	%r1,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_enc_steal
+	la	$out,0($i3)		# restore real $out
+
+	# generate last tweak...
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
+	stg	$s1,$tweak+0($sp)	# save the tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak+8($sp)
+	llgfr	$s3,$s3
+
+	x	$s0,0($out)		# ^=*(inp)|stolen cipther-text
+	x	$s1,4($out)
+	x	$s2,8($out)
+	x	$s3,12($out)
+	st${g}	$out,4*$SIZE_T($sp)
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_encrypt
+	l${g}	$out,4*$SIZE_T($sp)
+	x	$s0,`$tweak+0`($sp)	# ^=tweak
+	x	$s1,`$tweak+4`($sp)
+	x	$s2,`$tweak+8`($sp)
+	x	$s3,`$tweak+12`($sp)
+	st	$s0,0($out)
+	st	$s1,4($out)
+	st	$s2,8($out)
+	st	$s3,12($out)
+
+.Lxts_enc_done:
+	stg	$sp,$tweak+0($sp)	# wipe tweak
+	stg	$sp,$twesk+8($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
+	br	$ra
+.size	AES_xts_encrypt,.-AES_xts_encrypt
+___
+# void AES_xts_decrypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,u64 secno);
+#
+$code.=<<___;
+.globl	AES_xts_decrypt
+.type	AES_xts_decrypt,\@function
+.align	16
+AES_xts_decrypt:
+	xgr	%r3,%r4			# flip %r3 and %r4, $out and $len
+	xgr	%r4,%r3
+	xgr	%r3,%r4
+___
+$code.=<<___ if ($SIZE_T==4);
+	llgfr	$len,$len
+___
+$code.=<<___;
+	st${g}	$len,1*$SIZE_T($sp)	# save copy of $len
+	aghi	$len,-16
+	bcr	4,$ra			# abort if less than zero. formally
+					# wrong, because $len is unsigned,
+					# but who can afford asking to
+					# process more than 2^63-1 bytes?
+	tmll	$len,0x0f
+	jnz	.Lxts_dec_proceed
+	aghi	$len,16
+.Lxts_dec_proceed:
+___
+$code.=<<___ if (!$softonly);
+	llgf	%r0,240($key2)
+	lhi	%r1,16
+	clr	%r0,%r1
+	jl	.Lxts_dec_software
+
+	stm${g}	%r6,$s3,6*$SIZE_T($sp)
+	st${g}	$ra,14*$SIZE_T($sp)
+
+	nill	$len,0xfff0		# $len&=~15
+	slgr	$out,$inp
+
+	# generate the tweak value
+	l${g}	$s3,$stdframe($sp)	# pointer to iv
+	la	$s2,$tweak($sp)
+	lmg	$s0,$s1,0($s3)
+	lghi	$s3,16
+	stmg	$s0,$s1,0($s2)
+	la	%r1,0($key2)		# $key2 is not needed past this point
+	.long	0xb92e00aa		# km $s2,$s2, generate the tweak
+	brc	1,.-4			# can this happen?
+
+	l	%r0,240($key1)
+	la	%r1,0($key1)		# $key1 is not needed anymore
+
+	ltgr	$len,$len
+	jz	.Lxts_dec_km_short
+	bras	$ra,_s390x_xts_km
+	jz	.Lxts_dec_km_done
+
+	lrvgr	$s2,$s0			# make copy in reverse byte order
+	lrvgr	$s3,$s1
+	j	.Lxts_dec_km_2ndtweak
+
+.Lxts_dec_km_short:
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%=16
+	lrvg	$s0,$tweak+0($sp)	# load the tweak
+	lrvg	$s1,$tweak+8($sp)
+	lrvgr	$s2,$s0			# make copy in reverse byte order
+	lrvgr	$s3,$s1
+
+.Lxts_dec_km_2ndtweak:
+	lghi	$i1,0x87
+	srag	$i2,$s1,63		# broadcast upper bit
+	ngr	$i1,$i2			# rem
+	srlg	$i2,$s0,63		# carry bit from lower half
+	sllg	$s0,$s0,1
+	sllg	$s1,$s1,1
+	xgr	$s0,$i1
+	ogr	$s1,$i2
+	lrvgr	$i1,$s0			# flip byte order
+	lrvgr	$i2,$s1
+
+	xg	$i1,0($inp)
+	xg	$i2,8($inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+	la	$i2,0($out,$inp)
+	lghi	$i3,16
+	.long	0xb92e0066		# km $i2,$i2
+	brc	1,.-4			# can this happen?
+	lrvgr	$i1,$s0
+	lrvgr	$i2,$s1
+	xg	$i1,0($out,$inp)
+	xg	$i2,8($out,$inp)
+	stg	$i1,0($out,$inp)
+	stg	$i2,8($out,$inp)
+
+	la	$i3,0($out,$inp)	# put aside real $out
+.Lxts_dec_km_steal:
+	llgc	$i1,16($inp)
+	llgc	$i2,0($out,$inp)
+	stc	$i1,0($out,$inp)
+	stc	$i2,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_dec_km_steal
+
+	lgr	$s0,$s2
+	lgr	$s1,$s3
+	xg	$s0,0($i3)
+	xg	$s1,8($i3)
+	stg	$s0,0($i3)
+	stg	$s1,8($i3)
+	la	$s0,0($i3)
+	lghi	$s1,16
+	.long	0xb92e0088		# km $s0,$s0
+	brc	1,.-4			# can this happen?
+	xg	$s2,0($i3)
+	xg	$s3,8($i3)
+	stg	$s2,0($i3)
+	stg	$s3,8($i3)
+.Lxts_dec_km_done:
+	l${g}	$ra,14*$SIZE_T($sp)
+	st${g}	$sp,$tweak($sp)		# wipe tweak
+	st${g}	$sp,$tweak($sp)
+	lm${g}	%r6,$s3,6*$SIZE_T($sp)
+	br	$ra
+.align	16
+.Lxts_dec_software:
+___
+$code.=<<___;
+	stm${g}	%r6,$ra,6*$SIZE_T($sp)
+
+	srlg	$len,$len,4
+	slgr	$out,$inp
+
+	xgr	$s0,$s0			# clear upper half
+	xgr	$s1,$s1
+	lrv	$s0,$stdframe+4($sp)	# load secno
+	lrv	$s1,$stdframe+0($sp)
+	xgr	$s2,$s2
+	xgr	$s3,$s3
+	stm${g}	%r2,%r5,2*$SIZE_T($sp)
+	la	$key,0($key2)
+	larl	$tbl,AES_Te
+	bras	$ra,_s390x_AES_encrypt	# generate the tweak
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	larl	$tbl,AES_Td
+	lt${g}r	$len,$len
+	stm	$s0,$s3,$tweak($sp)	# save the tweak
+	jz	.Lxts_dec_short
+	j	.Lxts_dec_enter
+
+.align	16
+.Lxts_dec_loop:
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits 
+	stg	$s1,$tweak+0($sp)	# save the tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak+8($sp)
+	llgfr	$s3,$s3
+.Lxts_dec_enter:
+	x	$s0,0($inp)		# tweak^=*(inp)
+	x	$s1,4($inp)
+	x	$s2,8($inp)
+	x	$s3,12($inp)
+	stm${g}	%r2,%r3,2*$SIZE_T($sp)	# only two registers are changing
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_decrypt
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	x	$s0,$tweak+0($sp)	# ^=tweak
+	x	$s1,$tweak+4($sp)
+	x	$s2,$tweak+8($sp)
+	x	$s3,$tweak+12($sp)
+	st	$s0,0($out,$inp)
+	st	$s1,4($out,$inp)
+	st	$s2,8($out,$inp)
+	st	$s3,12($out,$inp)
+	la	$inp,16($inp)
+	brct${g}	$len,.Lxts_dec_loop
+
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%16
+	jz	.Lxts_dec_done
+
+	# generate pair of tweaks...
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$i2,$s1			# flip byte order
+	lrvgr	$i3,$s3
+	stmg	$i2,$i3,$tweak($sp)	# save the 1st tweak
+	j	.Lxts_dec_2ndtweak
+
+.align	16
+.Lxts_dec_short:
+	llgc	$len,`2*$SIZE_T-1`($sp)
+	nill	$len,0x0f		# $len%16
+	lrvg	$s1,$tweak+0($sp)	# load the tweak in little-endian
+	lrvg	$s3,$tweak+8($sp)
+.Lxts_dec_2ndtweak:
+	lghi	%r1,0x87
+	srag	%r0,$s3,63		# broadcast upper bit
+	ngr	%r1,%r0			# rem
+	srlg	%r0,$s1,63		# carry bit from lower half
+	sllg	$s1,$s1,1
+	sllg	$s3,$s3,1
+	xgr	$s1,%r1
+	ogr	$s3,%r0
+	lrvgr	$s1,$s1			# flip byte order
+	lrvgr	$s3,$s3
+	srlg	$s0,$s1,32		# smash the tweak to 4x32-bits
+	stg	$s1,$tweak-16+0($sp)	# save the 2nd tweak
+	llgfr	$s1,$s1
+	srlg	$s2,$s3,32
+	stg	$s3,$tweak-16+8($sp)
+	llgfr	$s3,$s3
+
+	x	$s0,0($inp)		# tweak_the_2nd^=*(inp)
+	x	$s1,4($inp)
+	x	$s2,8($inp)
+	x	$s3,12($inp)
+	stm${g}	%r2,%r3,2*$SIZE_T($sp)
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_decrypt
+	lm${g}	%r2,%r5,2*$SIZE_T($sp)
+	x	$s0,$tweak-16+0($sp)	# ^=tweak_the_2nd
+	x	$s1,$tweak-16+4($sp)
+	x	$s2,$tweak-16+8($sp)
+	x	$s3,$tweak-16+12($sp)
+	st	$s0,0($out,$inp)
+	st	$s1,4($out,$inp)
+	st	$s2,8($out,$inp)
+	st	$s3,12($out,$inp)
+
+	la	$i3,0($out,$inp)	# put aside real $out
+.Lxts_dec_steal:
+	llgc	%r0,16($inp)
+	llgc	%r1,0($out,$inp)
+	stc	%r0,0($out,$inp)
+	stc	%r1,16($out,$inp)
+	la	$inp,1($inp)
+	brct	$len,.Lxts_dec_steal
+	la	$out,0($i3)		# restore real $out
+
+	lm	$s0,$s3,$tweak($sp)	# load the 1st tweak
+	x	$s0,0($out)		# tweak^=*(inp)|stolen cipher-text
+	x	$s1,4($out)
+	x	$s2,8($out)
+	x	$s3,12($out)
+	st${g}	$out,4*$SIZE_T($sp)
+	la	$key,0($key1)
+	bras	$ra,_s390x_AES_decrypt
+	l${g}	$out,4*$SIZE_T($sp)
+	x	$s0,$tweak+0($sp)	# ^=tweak
+	x	$s1,$tweak+4($sp)
+	x	$s2,$tweak+8($sp)
+	x	$s3,$tweak+12($sp)
+	st	$s0,0($out)
+	st	$s1,4($out)
+	st	$s2,8($out)
+	st	$s3,12($out)
+	stg	$sp,$tweak-16+0($sp)	# wipe 2nd tweak
+	stg	$sp,$tweak-16+8($sp)
+.Lxts_dec_done:
+	stg	$sp,$tweak+0($sp)	# wipe tweak
+	stg	$sp,$twesk+8($sp)
+	lm${g}	%r6,$ra,6*$SIZE_T($sp)
+	br	$ra
+.size	AES_xts_decrypt,.-AES_xts_decrypt
 ___
 }
 $code.=<<___;
 .string	"AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+.comm	OPENSSL_s390xcap_P,16,8
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 print $code;
+close STDOUT;	# force flush
diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
index c57b3a2d6d..403c4d1290 100755
--- a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
+++ b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl
@@ -1176,6 +1176,7 @@ ___
 # As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have
 # undesired effect, so just omit them and sacrifice some portion of
 # percent in performance...
-$code =~ s/fmovs.*$//gem;
+$code =~ s/fmovs.*$//gm;
 
 print $code;
+close STDOUT;	# ensure flush
diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
index a545e892ae..48fa857d5b 100755
--- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl
+++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl
@@ -588,6 +588,9 @@ $code.=<<___;
 .globl	AES_encrypt
 .type	AES_encrypt,\@function,3
 .align	16
+.globl	asm_AES_encrypt
+.hidden	asm_AES_encrypt
+asm_AES_encrypt:
 AES_encrypt:
 	push	%rbx
 	push	%rbp
@@ -1184,6 +1187,9 @@ $code.=<<___;
 .globl	AES_decrypt
 .type	AES_decrypt,\@function,3
 .align	16
+.globl	asm_AES_decrypt
+.hidden	asm_AES_decrypt
+asm_AES_decrypt:
 AES_decrypt:
 	push	%rbx
 	push	%rbp
@@ -1277,13 +1283,13 @@ $code.=<<___;
 ___
 }
 
-# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
 $code.=<<___;
-.globl	AES_set_encrypt_key
-.type	AES_set_encrypt_key,\@function,3
+.globl	private_AES_set_encrypt_key
+.type	private_AES_set_encrypt_key,\@function,3
 .align	16
-AES_set_encrypt_key:
+private_AES_set_encrypt_key:
 	push	%rbx
 	push	%rbp
 	push	%r12			# redundant, but allows to share 
@@ -1304,7 +1310,7 @@ AES_set_encrypt_key:
 	add	\$56,%rsp
 .Lenc_key_epilogue:
 	ret
-.size	AES_set_encrypt_key,.-AES_set_encrypt_key
+.size	private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
 
 .type	_x86_64_AES_set_encrypt_key,\@abi-omnipotent
 .align	16
@@ -1547,13 +1553,13 @@ $code.=<<___;
 ___
 }
 
-# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
+# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
 #                        AES_KEY *key)
 $code.=<<___;
-.globl	AES_set_decrypt_key
-.type	AES_set_decrypt_key,\@function,3
+.globl	private_AES_set_decrypt_key
+.type	private_AES_set_decrypt_key,\@function,3
 .align	16
-AES_set_decrypt_key:
+private_AES_set_decrypt_key:
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -1622,7 +1628,7 @@ $code.=<<___;
 	add	\$56,%rsp
 .Ldec_key_epilogue:
 	ret
-.size	AES_set_decrypt_key,.-AES_set_decrypt_key
+.size	private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
 ___
 
 # void AES_cbc_encrypt (const void char *inp, unsigned char *out,
@@ -1648,6 +1654,9 @@ $code.=<<___;
 .type	AES_cbc_encrypt,\@function,6
 .align	16
 .extern	OPENSSL_ia32cap_P
+.globl	asm_AES_cbc_encrypt
+.hidden	asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
 AES_cbc_encrypt:
 	cmp	\$0,%rdx	# check length
 	je	.Lcbc_epilogue
@@ -2766,13 +2775,13 @@ cbc_se_handler:
 	.rva	.LSEH_end_AES_decrypt
 	.rva	.LSEH_info_AES_decrypt
 
-	.rva	.LSEH_begin_AES_set_encrypt_key
-	.rva	.LSEH_end_AES_set_encrypt_key
-	.rva	.LSEH_info_AES_set_encrypt_key
+	.rva	.LSEH_begin_private_AES_set_encrypt_key
+	.rva	.LSEH_end_private_AES_set_encrypt_key
+	.rva	.LSEH_info_private_AES_set_encrypt_key
 
-	.rva	.LSEH_begin_AES_set_decrypt_key
-	.rva	.LSEH_end_AES_set_decrypt_key
-	.rva	.LSEH_info_AES_set_decrypt_key
+	.rva	.LSEH_begin_private_AES_set_decrypt_key
+	.rva	.LSEH_end_private_AES_set_decrypt_key
+	.rva	.LSEH_info_private_AES_set_decrypt_key
 
 	.rva	.LSEH_begin_AES_cbc_encrypt
 	.rva	.LSEH_end_AES_cbc_encrypt
@@ -2788,11 +2797,11 @@ cbc_se_handler:
 	.byte	9,0,0,0
 	.rva	block_se_handler
 	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
-.LSEH_info_AES_set_encrypt_key:
+.LSEH_info_private_AES_set_encrypt_key:
 	.byte	9,0,0,0
 	.rva	key_se_handler
 	.rva	.Lenc_key_prologue,.Lenc_key_epilogue	# HandlerData[]
-.LSEH_info_AES_set_decrypt_key:
+.LSEH_info_private_AES_set_decrypt_key:
 	.byte	9,0,0,0
 	.rva	key_se_handler
 	.rva	.Ldec_key_prologue,.Ldec_key_epilogue	# HandlerData[]
diff --git a/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl
new file mode 100644
index 0000000000..c6f6b3334a
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl
@@ -0,0 +1,1249 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# June 2011
+#
+# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
+# in http://download.intel.com/design/intarch/papers/323686.pdf, is
+# that since AESNI-CBC encrypt exhibit *very* low instruction-level
+# parallelism, interleaving it with another algorithm would allow to
+# utilize processor resources better and achieve better performance.
+# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
+# AESNI code is weaved into it. Below are performance numbers in
+# cycles per processed byte, less is better, for standalone AESNI-CBC
+# encrypt, sum of the latter and standalone SHA1, and "stitched"
+# subroutine:
+#
+#		AES-128-CBC	+SHA1		stitch      gain
+# Westmere	3.77[+5.6]	9.37		6.65	    +41%
+# Sandy Bridge	5.05[+5.2(6.3)]	10.25(11.35)	6.16(7.08)  +67%(+60%)
+#
+#		AES-192-CBC
+# Westmere	4.51		10.11		6.97	    +45%
+# Sandy Bridge	6.05		11.25(12.35)	6.34(7.27)  +77%(+70%)
+#
+#		AES-256-CBC
+# Westmere	5.25		10.85		7.25	    +50%
+# Sandy Bridge	7.05		12.25(13.35)	7.06(7.70)  +74%(+73%)
+#
+# (*)	There are two code paths: SSSE3 and AVX. See sha1-568.pl for
+#	background information. Above numbers in parentheses are SSSE3
+#	results collected on AVX-capable CPU, i.e. apply on OSes that
+#	don't support AVX.
+#
+# Needless to mention that it makes no sense to implement "stitched"
+# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1
+# fully utilize parallelism, so stitching would not give any gain
+# anyway. Well, there might be some, e.g. because of better cache
+# locality... For reference, here are performance results for
+# standalone AESNI-CBC decrypt:
+#
+#		AES-128-CBC	AES-192-CBC	AES-256-CBC
+# Westmere	1.31		1.55		1.80
+# Sandy Bridge	0.93		1.06		1.22
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+	   $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+	   $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+	   $1>=10);
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# void aesni_cbc_sha1_enc(const void *inp,
+#			void *out,
+#			size_t length,
+#			const AES_KEY *key,
+#			unsigned char *iv,
+#			SHA_CTX *ctx,
+#			const void *in0);
+
+$code.=<<___;
+.text
+.extern	OPENSSL_ia32cap_P
+
+.globl	aesni_cbc_sha1_enc
+.type	aesni_cbc_sha1_enc,\@abi-omnipotent
+.align	16
+aesni_cbc_sha1_enc:
+	# caller should check for SSSE3 and AES-NI bits
+	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
+	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
+___
+$code.=<<___ if ($avx);
+	and	\$`1<<28`,%r11d		# mask AVX bit
+	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
+	or	%r11d,%r10d
+	cmp	\$`1<<28|1<<30`,%r10d
+	je	aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+	jmp	aesni_cbc_sha1_enc_ssse3
+	ret
+.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
+___
+
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
+my @T=("%esi","%edi");
+my $j=0; my $jj=0; my $r=0; my $sn=0;
+my $K_XX_XX="%r11";
+my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
+my @rndkey=("%xmm14","%xmm15");
+
+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type	aesni_cbc_sha1_enc_ssse3,\@function,6
+.align	16
+aesni_cbc_sha1_enc_ssse3:
+	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
+	#shr	\$6,$len			# debugging artefact
+	#jz	.Lepilogue_ssse3		# debugging artefact
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
+	#mov	$in0,$inp			# debugging artefact
+	#lea	64(%rsp),$ctx			# debugging artefact
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,96+0(%rsp)
+	movaps	%xmm7,96+16(%rsp)
+	movaps	%xmm8,96+32(%rsp)
+	movaps	%xmm9,96+48(%rsp)
+	movaps	%xmm10,96+64(%rsp)
+	movaps	%xmm11,96+80(%rsp)
+	movaps	%xmm12,96+96(%rsp)
+	movaps	%xmm13,96+112(%rsp)
+	movaps	%xmm14,96+128(%rsp)
+	movaps	%xmm15,96+144(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+	mov	$in0,%r12			# reassign arguments
+	mov	$out,%r13
+	mov	$len,%r14
+	mov	$key,%r15
+	movdqu	($ivp),$iv			# load IV
+	mov	$ivp,88(%rsp)			# save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+	shl	\$6,$len
+	sub	$in0,$out
+	mov	240($key),$rounds
+	add	$inp,$len		# end of input
+
+	lea	K_XX_XX(%rip),$K_XX_XX
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+
+	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
+	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	movdqu	16($inp),@X[-3&7]
+	movdqu	32($inp),@X[-2&7]
+	movdqu	48($inp),@X[-1&7]
+	pshufb	@X[2],@X[-4&7]		# byte swap
+	add	\$64,$inp
+	pshufb	@X[2],@X[-3&7]
+	pshufb	@X[2],@X[-2&7]
+	pshufb	@X[2],@X[-1&7]
+	paddd	@Tx[1],@X[-4&7]		# add K_00_19
+	paddd	@Tx[1],@X[-3&7]
+	paddd	@Tx[1],@X[-2&7]
+	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
+	psubd	@Tx[1],@X[-4&7]		# restore X[]
+	movdqa	@X[-3&7],16(%rsp)
+	psubd	@Tx[1],@X[-3&7]
+	movdqa	@X[-2&7],32(%rsp)
+	psubd	@Tx[1],@X[-2&7]
+	movups	($key),$rndkey0		# $key[0]
+	movups	16($key),$rndkey[0]	# forward reference
+	jmp	.Loop_ssse3
+___
+
+my $aesenc=sub {
+  use integer;
+  my ($n,$k)=($r/10,$r%10);
+    if ($k==0) {
+      $code.=<<___;
+	movups		`16*$n`($in0),$in		# load input
+	xorps		$rndkey0,$in
+___
+      $code.=<<___ if ($n);
+	movups		$iv,`16*($n-1)`($out,$in0)	# write output
+___
+      $code.=<<___;
+	xorps		$in,$iv
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*$k`($key),$rndkey[1]
+___
+    } elsif ($k==9) {
+      $sn++;
+      $code.=<<___;
+	cmp		\$11,$rounds
+	jb		.Laesenclast$sn
+	movups		`32+16*($k+0)`($key),$rndkey[1]
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*($k+1)`($key),$rndkey[0]
+	aesenc		$rndkey[1],$iv
+	je		.Laesenclast$sn
+	movups		`32+16*($k+2)`($key),$rndkey[1]
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*($k+3)`($key),$rndkey[0]
+	aesenc		$rndkey[1],$iv
+.Laesenclast$sn:
+	aesenclast	$rndkey[0],$iv
+	movups		16($key),$rndkey[1]		# forward reference
+___
+    } else {
+      $code.=<<___;
+	aesenc		$rndkey[0],$iv
+	movups		`32+16*$k`($key),$rndkey[1]
+___
+    }
+    $r++;	unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@X[0],@X[-3&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@Tx[0],@X[-1&7]);
+	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&movdqa	(@Tx[2],@X[0]);
+	&movdqa	(@Tx[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
+	&paddd	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@Tx[1],@Tx[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[2],30);
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslld	(@Tx[1],2);
+	&pxor	(@X[0],@Tx[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
+	 eval(shift(@insns));		# body_20_39
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
+	if ($Xi%5) {
+	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+	} else {			# ... or load next one
+	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+	}
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&movdqa	(@Tx[0],@X[0]);
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pslld	(@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	&psrld	(@Tx[0],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$len);
+	&je	(".Ldone_ssse3");
+
+	unshift(@Tx,pop(@Tx));
+
+	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
+	&movdqu	(@X[-4&7],"0($inp)");		# load input
+	&movdqu	(@X[-3&7],"16($inp)");
+	&movdqu	(@X[-2&7],"32($inp)");
+	&movdqu	(@X[-1&7],"48($inp)");
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pshufb	(@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&paddd	(@X[($Xi-4)&7],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psubd	(@X[($Xi-4)&7],@Tx[1]);
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+  use integer;
+  my ($k,$n);
+  my @r=(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
+	'&xor	($c,$d);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&xor	($c,$d);',	# restore $c
+	'&xor	(@T[0],$d);',
+	'&add	($e,$a);',
+	'&$_ror	($b,$j?7:2);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+	$n = scalar(@r);
+	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
+	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
+	$jj++;
+    return @r;
+}
+
+sub body_20_39 () {
+  use integer;
+  my ($k,$n);
+  my @r=(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
+	'&xor	(@T[0],$d);',	# ($b^$d)
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&xor	(@T[0],$c);',	# ($b^$d^$c)
+	'&add	($e,$a);',
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+	$n = scalar(@r);
+	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
+	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
+	$jj++;
+    return @r;
+}
+
+sub body_40_59 () {
+  use integer;
+  my ($k,$n);
+  my @r=(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&mov	(@T[1],$c);',
+	'&xor	($c,$d);',
+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
+	'&and	(@T[1],$d);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[1]);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&add	($e,@T[0]);',
+	'&xor	($c,$d);',	# restore $c
+	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+	$n = scalar(@r);
+	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
+	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
+	$jj++;
+    return @r;
+}
+$code.=<<___;
+.align	16
+.Loop_ssse3:
+___
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+				$saved_r=$r; @saved_rndkey=@rndkey;
+
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+	movups	$iv,48($out,$in0)		# write output
+	lea	64($in0),$in0
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	jmp	.Loop_ssse3
+
+.align	16
+.Ldone_ssse3:
+___
+				$jj=$j=$saved_j; @V=@saved_V;
+				$r=$saved_r;     @rndkey=@saved_rndkey;
+
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+	movups	$iv,48($out,$in0)		# write output
+	mov	88(%rsp),$ivp			# restore $ivp
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	movups	$iv,($ivp)			# write IV
+___
+$code.=<<___ if ($win64);
+	movaps	96+0(%rsp),%xmm6
+	movaps	96+16(%rsp),%xmm7
+	movaps	96+32(%rsp),%xmm8
+	movaps	96+48(%rsp),%xmm9
+	movaps	96+64(%rsp),%xmm10
+	movaps	96+80(%rsp),%xmm11
+	movaps	96+96(%rsp),%xmm12
+	movaps	96+112(%rsp),%xmm13
+	movaps	96+128(%rsp),%xmm14
+	movaps	96+144(%rsp),%xmm15
+___
+$code.=<<___;
+	lea	`104+($win64?10*16:0)`(%rsp),%rsi
+	mov	0(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lepilogue_ssse3:
+	ret
+.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
+___
+
+$j=$jj=$r=$sn=0;
+
+if ($avx) {
+my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
+
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
+my @T=("%esi","%edi");
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type	aesni_cbc_sha1_enc_avx,\@function,6
+.align	16
+aesni_cbc_sha1_enc_avx:
+	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
+	#shr	\$6,$len			# debugging artefact
+	#jz	.Lepilogue_avx			# debugging artefact
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
+	#mov	$in0,$inp			# debugging artefact
+	#lea	64(%rsp),$ctx			# debugging artefact
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,96+0(%rsp)
+	movaps	%xmm7,96+16(%rsp)
+	movaps	%xmm8,96+32(%rsp)
+	movaps	%xmm9,96+48(%rsp)
+	movaps	%xmm10,96+64(%rsp)
+	movaps	%xmm11,96+80(%rsp)
+	movaps	%xmm12,96+96(%rsp)
+	movaps	%xmm13,96+112(%rsp)
+	movaps	%xmm14,96+128(%rsp)
+	movaps	%xmm15,96+144(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+	vzeroall
+	mov	$in0,%r12			# reassign arguments
+	mov	$out,%r13
+	mov	$len,%r14
+	mov	$key,%r15
+	vmovdqu	($ivp),$iv			# load IV
+	mov	$ivp,88(%rsp)			# save $ivp
+___
+my ($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
+my $rounds="${ivp}d";
+$code.=<<___;
+	shl	\$6,$len
+	sub	$in0,$out
+	mov	240($key),$rounds
+	add	\$112,$key		# size optimization
+	add	$inp,$len		# end of input
+
+	lea	K_XX_XX(%rip),$K_XX_XX
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+
+	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
+	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	vmovdqu	16($inp),@X[-3&7]
+	vmovdqu	32($inp),@X[-2&7]
+	vmovdqu	48($inp),@X[-1&7]
+	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
+	add	\$64,$inp
+	vpshufb	@X[2],@X[-3&7],@X[-3&7]
+	vpshufb	@X[2],@X[-2&7],@X[-2&7]
+	vpshufb	@X[2],@X[-1&7],@X[-1&7]
+	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
+	vpaddd	@Tx[1],@X[-3&7],@X[1]
+	vpaddd	@Tx[1],@X[-2&7],@X[2]
+	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
+	vmovdqa	@X[1],16(%rsp)
+	vmovdqa	@X[2],32(%rsp)
+	vmovups	-112($key),$rndkey0	# $key[0]
+	vmovups	16-112($key),$rndkey[0]	# forward reference
+	jmp	.Loop_avx
+___
+
+my $aesenc=sub {
+  use integer;
+  my ($n,$k)=($r/10,$r%10);
+    if ($k==0) {
+      $code.=<<___;
+	vmovups		`16*$n`($in0),$in		# load input
+	vxorps		$rndkey0,$in,$in
+___
+      $code.=<<___ if ($n);
+	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
+___
+      $code.=<<___;
+	vxorps		$in,$iv,$iv
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*$k-112`($key),$rndkey[1]
+___
+    } elsif ($k==9) {
+      $sn++;
+      $code.=<<___;
+	cmp		\$11,$rounds
+	jb		.Lvaesenclast$sn
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
+	vaesenc		$rndkey[1],$iv,$iv
+	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
+	je		.Lvaesenclast$sn
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
+	vaesenc		$rndkey[1],$iv,$iv
+	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
+.Lvaesenclast$sn:
+	vaesenclast	$rndkey[0],$iv,$iv
+	vmovups		16-112($key),$rndkey[1]		# forward reference
+___
+    } else {
+      $code.=<<___;
+	vaesenc		$rndkey[0],$iv,$iv
+	vmovups		`32+16*$k-112`($key),$rndkey[1]
+___
+    }
+    $r++;	unshift(@rndkey,pop(@rndkey));
+};
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[0],@X[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
+	&vpaddd	(@X[0],@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[1],@Tx[2],30);
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslld	(@Tx[2],@Tx[2],2);
+	&vpxor	(@X[0],@X[0],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
+	if ($Xi%5) {
+	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+	} else {			# ... or load next one
+	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+	}
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpsrld	(@Tx[0],@X[0],30);
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpslld	(@X[0],@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$len);
+	&je	(".Ldone_avx");
+
+	unshift(@Tx,pop(@Tx));
+
+	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
+	&vmovdqu(@X[-4&7],"0($inp)");		# load input
+	&vmovdqu(@X[-3&7],"16($inp)");
+	&vmovdqu(@X[-2&7],"32($inp)");
+	&vmovdqu(@X[-1&7],"48($inp)");
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align	16
+.Loop_avx:
+___
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+				$saved_r=$r; @saved_rndkey=@rndkey;
+
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+	vmovups	$iv,48($out,$in0)		# write output
+	lea	64($in0),$in0
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	jmp	.Loop_avx
+
+.align	16
+.Ldone_avx:
+___
+				$jj=$j=$saved_j; @V=@saved_V;
+				$r=$saved_r;     @rndkey=@saved_rndkey;
+
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+	vmovups	$iv,48($out,$in0)		# write output
+	mov	88(%rsp),$ivp			# restore $ivp
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	vmovups	$iv,($ivp)			# write IV
+	vzeroall
+___
+$code.=<<___ if ($win64);
+	movaps	96+0(%rsp),%xmm6
+	movaps	96+16(%rsp),%xmm7
+	movaps	96+32(%rsp),%xmm8
+	movaps	96+48(%rsp),%xmm9
+	movaps	96+64(%rsp),%xmm10
+	movaps	96+80(%rsp),%xmm11
+	movaps	96+96(%rsp),%xmm12
+	movaps	96+112(%rsp),%xmm13
+	movaps	96+128(%rsp),%xmm14
+	movaps	96+144(%rsp),%xmm15
+___
+$code.=<<___;
+	lea	`104+($win64?10*16:0)`(%rsp),%rsi
+	mov	0(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lepilogue_avx:
+	ret
+.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
+___
+}
+$code.=<<___;
+.align	64
+K_XX_XX:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
+
+.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	ssse3_handler,\@abi-omnipotent
+.align	16
+ssse3_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	96(%rax),%rsi
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	`104+10*16`(%rax),%rax	# adjust stack pointer
+
+	mov	0(%rax),%r15
+	mov	8(%rax),%r14
+	mov	16(%rax),%r13
+	mov	24(%rax),%r12
+	mov	32(%rax),%rbp
+	mov	40(%rax),%rbx
+	lea	48(%rax),%rax
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	ssse3_handler,.-ssse3_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
+	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
+	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
+___
+$code.=<<___ if ($avx);
+	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
+	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
+	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
+___
+$code.=<<___;
+.section	.xdata
+.align	8
+.LSEH_info_aesni_cbc_sha1_enc_ssse3:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_aesni_cbc_sha1_enc_avx:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
+___
+}
+
+####################################################################
+sub rex {
+  local *opcode=shift;
+  my ($dst,$src)=@_;
+  my $rex=0;
+
+    $rex|=0x04			if($dst>=8);
+    $rex|=0x01			if($src>=8);
+    push @opcode,$rex|0x40	if($rex);
+}
+
+sub aesni {
+  my $line=shift;
+  my @opcode=(0x66);
+
+    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+	my %opcodelet = (
+		"aesenc" => 0xdc,	"aesenclast" => 0xdd
+	);
+	return undef if (!defined($opcodelet{$1}));
+	rex(\@opcode,$3,$2);
+	push @opcode,0x0f,0x38,$opcodelet{$1};
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
+	return ".byte\t".join(',',@opcode);
+    }
+    return $line;
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86.pl b/src/lib/libcrypto/aes/asm/aesni-x86.pl
new file mode 100644
index 0000000000..3dc345b585
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/aesni-x86.pl
@@ -0,0 +1,2189 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for Intel AES-NI extension. In
+# OpenSSL context it's used with Intel engine, but can also be used as
+# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
+# details].
+#
+# Performance.
+#
+# To start with see corresponding paragraph in aesni-x86_64.pl...
+# Instead of filling table similar to one found there I've chosen to
+# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
+# The simplified table below represents 32-bit performance relative
+# to 64-bit one in every given point. Ratios vary for different
+# encryption modes, therefore interval values.
+#
+#	16-byte     64-byte     256-byte    1-KB        8-KB
+#	53-67%      67-84%      91-94%      95-98%      97-99.5%
+#
+# Lower ratios for smaller block sizes are perfectly understandable,
+# because function call overhead is higher in 32-bit mode. Largest
+# 8-KB block performance is virtually same: 32-bit code is less than
+# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
+
+# January 2011
+#
+# See aesni-x86_64.pl for details. Unlike x86_64 version this module
+# interleaves at most 6 aes[enc|dec] instructions, because there are
+# not enough registers for 8x interleave [which should be optimal for
+# Sandy Bridge]. Actually, performance results for 6x interleave
+# factor presented in aesni-x86_64.pl (except for CTR) are for this
+# module.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
+
+$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
+			# generates drop-in replacement for
+			# crypto/aes/asm/aes-586.pl:-)
+$inline=1;		# inline _aesni_[en|de]crypt
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+if ($PREFIX eq "aesni")	{ $movekey=*movups; }
+else			{ $movekey=*movups; }
+
+$len="eax";
+$rounds="ecx";
+$key="edx";
+$inp="esi";
+$out="edi";
+$rounds_="ebx";	# backup copy for $rounds
+$key_="ebp";	# backup copy for $key
+
+$rndkey0="xmm0";
+$rndkey1="xmm1";
+$inout0="xmm2";
+$inout1="xmm3";
+$inout2="xmm4";
+$inout3="xmm5";	$in1="xmm5";
+$inout4="xmm6";	$in0="xmm6";
+$inout5="xmm7";	$ivec="xmm7";
+
+# AESNI extenstion
+sub aeskeygenassist
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
+}
+sub aescommon
+{ my($opcodelet,$dst,$src)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
+}
+sub aesimc	{ aescommon(0xdb,@_); }
+sub aesenc	{ aescommon(0xdc,@_); }
+sub aesenclast	{ aescommon(0xdd,@_); }
+sub aesdec	{ aescommon(0xde,@_); }
+sub aesdeclast	{ aescommon(0xdf,@_); }
+
+# Inline version of internal aesni_[en|de]crypt1
+{ my $sn;
+sub aesni_inline_generate1
+{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+  $sn++;
+
+    &$movekey		($rndkey0,&QWP(0,$key));
+    &$movekey		($rndkey1,&QWP(16,$key));
+    &xorps		($ivec,$rndkey0)	if (defined($ivec));
+    &lea		($key,&DWP(32,$key));
+    &xorps		($inout,$ivec)		if (defined($ivec));
+    &xorps		($inout,$rndkey0)	if (!defined($ivec));
+    &set_label("${p}1_loop_$sn");
+	eval"&aes${p}	($inout,$rndkey1)";
+	&dec		($rounds);
+	&$movekey	($rndkey1,&QWP(0,$key));
+	&lea		($key,&DWP(16,$key));
+    &jnz		(&label("${p}1_loop_$sn"));
+    eval"&aes${p}last	($inout,$rndkey1)";
+}}
+
+sub aesni_generate1	# fully unrolled loop
+{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
+
+    &function_begin_B("_aesni_${p}rypt1");
+	&movups		($rndkey0,&QWP(0,$key));
+	&$movekey	($rndkey1,&QWP(0x10,$key));
+	&xorps		($inout,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0x20,$key));
+	&lea		($key,&DWP(0x30,$key));
+	&cmp		($rounds,11);
+	&jb		(&label("${p}128"));
+	&lea		($key,&DWP(0x20,$key));
+	&je		(&label("${p}192"));
+	&lea		($key,&DWP(0x20,$key));
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(-0x40,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(-0x30,$key));
+    &set_label("${p}192");
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(-0x20,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(-0x10,$key));
+    &set_label("${p}128");
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(0,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0x10,$key));
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(0x20,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0x30,$key));
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(0x40,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0x50,$key));
+	eval"&aes${p}	($inout,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(0x60,$key));
+	eval"&aes${p}	($inout,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0x70,$key));
+	eval"&aes${p}	($inout,$rndkey1)";
+    eval"&aes${p}last	($inout,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt1");
+}
+
+# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("enc") if (!$inline);
+&function_begin_B("${PREFIX}_encrypt");
+	&mov	("eax",&wparam(0));
+	&mov	($key,&wparam(2));
+	&movups	($inout0,&QWP(0,"eax"));
+	&mov	($rounds,&DWP(240,$key));
+	&mov	("eax",&wparam(1));
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&movups	(&QWP(0,"eax"),$inout0);
+	&ret	();
+&function_end_B("${PREFIX}_encrypt");
+
+# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
+&aesni_generate1("dec") if(!$inline);
+&function_begin_B("${PREFIX}_decrypt");
+	&mov	("eax",&wparam(0));
+	&mov	($key,&wparam(2));
+	&movups	($inout0,&QWP(0,"eax"));
+	&mov	($rounds,&DWP(240,$key));
+	&mov	("eax",&wparam(1));
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&movups	(&QWP(0,"eax"),$inout0);
+	&ret	();
+&function_end_B("${PREFIX}_decrypt");
+
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why 3x subroutine were originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
+# utilization, i.e. when subroutine's throughput is virtually same as
+# of non-interleaved subroutine [for number of input blocks up to 3].
+# This is why it makes no sense to implement 2x subroutine.
+# aes[enc|dec] latency in next processor generation is 8, but the
+# instructions can be scheduled every cycle. Optimal interleave for
+# new processor is therefore 8x, but it's unfeasible to accommodate it
+# in XMM registers addreassable in 32-bit mode and therefore 6x is
+# used instead...
+
+sub aesni_generate3
+{ my $p=shift;
+
+    &function_begin_B("_aesni_${p}rypt3");
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&shr		($rounds,1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&lea		($key,&DWP(32,$key));
+	&xorps		($inout0,$rndkey0);
+	&pxor		($inout1,$rndkey0);
+	&pxor		($inout2,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0,$key));
+
+    &set_label("${p}3_loop");
+	eval"&aes${p}	($inout0,$rndkey1)";
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&dec		($rounds);
+	eval"&aes${p}	($inout2,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(16,$key));
+	eval"&aes${p}	($inout0,$rndkey0)";
+	eval"&aes${p}	($inout1,$rndkey0)";
+	&lea		($key,&DWP(32,$key));
+	eval"&aes${p}	($inout2,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&jnz		(&label("${p}3_loop"));
+    eval"&aes${p}	($inout0,$rndkey1)";
+    eval"&aes${p}	($inout1,$rndkey1)";
+    eval"&aes${p}	($inout2,$rndkey1)";
+    eval"&aes${p}last	($inout0,$rndkey0)";
+    eval"&aes${p}last	($inout1,$rndkey0)";
+    eval"&aes${p}last	($inout2,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt3");
+}
+
+# 4x interleave is implemented to improve small block performance,
+# most notably [and naturally] 4 block by ~30%. One can argue that one
+# should have implemented 5x as well, but improvement  would be <20%,
+# so it's not worth it...
+sub aesni_generate4
+{ my $p=shift;
+
+    &function_begin_B("_aesni_${p}rypt4");
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&shr		($rounds,1);
+	&lea		($key,&DWP(32,$key));
+	&xorps		($inout0,$rndkey0);
+	&pxor		($inout1,$rndkey0);
+	&pxor		($inout2,$rndkey0);
+	&pxor		($inout3,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0,$key));
+
+    &set_label("${p}4_loop");
+	eval"&aes${p}	($inout0,$rndkey1)";
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&dec		($rounds);
+	eval"&aes${p}	($inout2,$rndkey1)";
+	eval"&aes${p}	($inout3,$rndkey1)";
+	&$movekey	($rndkey1,&QWP(16,$key));
+	eval"&aes${p}	($inout0,$rndkey0)";
+	eval"&aes${p}	($inout1,$rndkey0)";
+	&lea		($key,&DWP(32,$key));
+	eval"&aes${p}	($inout2,$rndkey0)";
+	eval"&aes${p}	($inout3,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0,$key));
+    &jnz		(&label("${p}4_loop"));
+
+    eval"&aes${p}	($inout0,$rndkey1)";
+    eval"&aes${p}	($inout1,$rndkey1)";
+    eval"&aes${p}	($inout2,$rndkey1)";
+    eval"&aes${p}	($inout3,$rndkey1)";
+    eval"&aes${p}last	($inout0,$rndkey0)";
+    eval"&aes${p}last	($inout1,$rndkey0)";
+    eval"&aes${p}last	($inout2,$rndkey0)";
+    eval"&aes${p}last	($inout3,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt4");
+}
+
+sub aesni_generate6
+{ my $p=shift;
+
+    &function_begin_B("_aesni_${p}rypt6");
+    &static_label("_aesni_${p}rypt6_enter");
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&shr		($rounds,1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&lea		($key,&DWP(32,$key));
+	&xorps		($inout0,$rndkey0);
+	&pxor		($inout1,$rndkey0);	# pxor does better here
+	eval"&aes${p}	($inout0,$rndkey1)";
+	&pxor		($inout2,$rndkey0);
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&pxor		($inout3,$rndkey0);
+	&dec		($rounds);
+	eval"&aes${p}	($inout2,$rndkey1)";
+	&pxor		($inout4,$rndkey0);
+	eval"&aes${p}	($inout3,$rndkey1)";
+	&pxor		($inout5,$rndkey0);
+	eval"&aes${p}	($inout4,$rndkey1)";
+	&$movekey	($rndkey0,&QWP(0,$key));
+	eval"&aes${p}	($inout5,$rndkey1)";
+	&jmp		(&label("_aesni_${p}rypt6_enter"));
+
+    &set_label("${p}6_loop",16);
+	eval"&aes${p}	($inout0,$rndkey1)";
+	eval"&aes${p}	($inout1,$rndkey1)";
+	&dec		($rounds);
+	eval"&aes${p}	($inout2,$rndkey1)";
+	eval"&aes${p}	($inout3,$rndkey1)";
+	eval"&aes${p}	($inout4,$rndkey1)";
+	eval"&aes${p}	($inout5,$rndkey1)";
+    &set_label("_aesni_${p}rypt6_enter",16);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	eval"&aes${p}	($inout0,$rndkey0)";
+	eval"&aes${p}	($inout1,$rndkey0)";
+	&lea		($key,&DWP(32,$key));
+	eval"&aes${p}	($inout2,$rndkey0)";
+	eval"&aes${p}	($inout3,$rndkey0)";
+	eval"&aes${p}	($inout4,$rndkey0)";
+	eval"&aes${p}	($inout5,$rndkey0)";
+	&$movekey	($rndkey0,&QWP(0,$key));
+    &jnz		(&label("${p}6_loop"));
+
+    eval"&aes${p}	($inout0,$rndkey1)";
+    eval"&aes${p}	($inout1,$rndkey1)";
+    eval"&aes${p}	($inout2,$rndkey1)";
+    eval"&aes${p}	($inout3,$rndkey1)";
+    eval"&aes${p}	($inout4,$rndkey1)";
+    eval"&aes${p}	($inout5,$rndkey1)";
+    eval"&aes${p}last	($inout0,$rndkey0)";
+    eval"&aes${p}last	($inout1,$rndkey0)";
+    eval"&aes${p}last	($inout2,$rndkey0)";
+    eval"&aes${p}last	($inout3,$rndkey0)";
+    eval"&aes${p}last	($inout4,$rndkey0)";
+    eval"&aes${p}last	($inout5,$rndkey0)";
+    &ret();
+    &function_end_B("_aesni_${p}rypt6");
+}
+&aesni_generate3("enc") if ($PREFIX eq "aesni");
+&aesni_generate3("dec");
+&aesni_generate4("enc") if ($PREFIX eq "aesni");
+&aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+
+if ($PREFIX eq "aesni") {
+######################################################################
+# void aesni_ecb_encrypt (const void *in, void *out,
+#                         size_t length, const AES_KEY *key,
+#                         int enc);
+&function_begin("aesni_ecb_encrypt");
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));
+	&mov	($rounds_,&wparam(4));
+	&and	($len,-16);
+	&jz	(&label("ecb_ret"));
+	&mov	($rounds,&DWP(240,$key));
+	&test	($rounds_,$rounds_);
+	&jz	(&label("ecb_decrypt"));
+
+	&mov	($key_,$key);		# backup $key
+	&mov	($rounds_,$rounds);	# backup $rounds
+	&cmp	($len,0x60);
+	&jb	(&label("ecb_enc_tail"));
+
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+	&sub	($len,0x60);
+	&jmp	(&label("ecb_enc_loop6_enter"));
+
+&set_label("ecb_enc_loop6",16);
+	&movups	(&QWP(0,$out),$inout0);
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+&set_label("ecb_enc_loop6_enter");
+
+	&call	("_aesni_encrypt6");
+
+	&mov	($key,$key_);		# restore $key
+	&mov	($rounds,$rounds_);	# restore $rounds
+	&sub	($len,0x60);
+	&jnc	(&label("ecb_enc_loop6"));
+
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&add	($len,0x60);
+	&jz	(&label("ecb_ret"));
+
+&set_label("ecb_enc_tail");
+	&movups	($inout0,&QWP(0,$inp));
+	&cmp	($len,0x20);
+	&jb	(&label("ecb_enc_one"));
+	&movups	($inout1,&QWP(0x10,$inp));
+	&je	(&label("ecb_enc_two"));
+	&movups	($inout2,&QWP(0x20,$inp));
+	&cmp	($len,0x40);
+	&jb	(&label("ecb_enc_three"));
+	&movups	($inout3,&QWP(0x30,$inp));
+	&je	(&label("ecb_enc_four"));
+	&movups	($inout4,&QWP(0x40,$inp));
+	&xorps	($inout5,$inout5);
+	&call	("_aesni_encrypt6");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_one",16);
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&movups	(&QWP(0,$out),$inout0);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_two",16);
+	&xorps	($inout2,$inout2);
+	&call	("_aesni_encrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_three",16);
+	&call	("_aesni_encrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_enc_four",16);
+	&call	("_aesni_encrypt4");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&jmp	(&label("ecb_ret"));
+######################################################################
+&set_label("ecb_decrypt",16);
+	&mov	($key_,$key);		# backup $key
+	&mov	($rounds_,$rounds);	# backup $rounds
+	&cmp	($len,0x60);
+	&jb	(&label("ecb_dec_tail"));
+
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+	&sub	($len,0x60);
+	&jmp	(&label("ecb_dec_loop6_enter"));
+
+&set_label("ecb_dec_loop6",16);
+	&movups	(&QWP(0,$out),$inout0);
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+&set_label("ecb_dec_loop6_enter");
+
+	&call	("_aesni_decrypt6");
+
+	&mov	($key,$key_);		# restore $key
+	&mov	($rounds,$rounds_);	# restore $rounds
+	&sub	($len,0x60);
+	&jnc	(&label("ecb_dec_loop6"));
+
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+	&add	($len,0x60);
+	&jz	(&label("ecb_ret"));
+
+&set_label("ecb_dec_tail");
+	&movups	($inout0,&QWP(0,$inp));
+	&cmp	($len,0x20);
+	&jb	(&label("ecb_dec_one"));
+	&movups	($inout1,&QWP(0x10,$inp));
+	&je	(&label("ecb_dec_two"));
+	&movups	($inout2,&QWP(0x20,$inp));
+	&cmp	($len,0x40);
+	&jb	(&label("ecb_dec_three"));
+	&movups	($inout3,&QWP(0x30,$inp));
+	&je	(&label("ecb_dec_four"));
+	&movups	($inout4,&QWP(0x40,$inp));
+	&xorps	($inout5,$inout5);
+	&call	("_aesni_decrypt6");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_dec_one",16);
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&movups	(&QWP(0,$out),$inout0);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_dec_two",16);
+	&xorps	($inout2,$inout2);
+	&call	("_aesni_decrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_dec_three",16);
+	&call	("_aesni_decrypt3");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&jmp	(&label("ecb_ret"));
+
+&set_label("ecb_dec_four",16);
+	&call	("_aesni_decrypt4");
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+
+&set_label("ecb_ret");
+&function_end("aesni_ecb_encrypt");
+
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize CMAC value
+# (see engine/eng_aesni.c for details)
+#
+{ my $cmac=$inout1;
+&function_begin("aesni_ccm64_encrypt_blocks");
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));
+	&mov	($rounds_,&wparam(4));
+	&mov	($rounds,&wparam(5));
+	&mov	($key_,"esp");
+	&sub	("esp",60);
+	&and	("esp",-16);			# align stack
+	&mov	(&DWP(48,"esp"),$key_);
+
+	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
+	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
+	&mov	($rounds,&DWP(240,$key));
+
+	# compose byte-swap control mask for pshufb on stack
+	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
+	&mov	(&DWP(4,"esp"),0x08090a0b);
+	&mov	(&DWP(8,"esp"),0x04050607);
+	&mov	(&DWP(12,"esp"),0x00010203);
+
+	# compose counter increment vector on stack
+	&mov	($rounds_,1);
+	&xor	($key_,$key_);
+	&mov	(&DWP(16,"esp"),$rounds_);
+	&mov	(&DWP(20,"esp"),$key_);
+	&mov	(&DWP(24,"esp"),$key_);
+	&mov	(&DWP(28,"esp"),$key_);
+
+	&shr	($rounds,1);
+	&lea	($key_,&DWP(0,$key));
+	&movdqa	($inout3,&QWP(0,"esp"));
+	&movdqa	($inout0,$ivec);
+	&mov	($rounds_,$rounds);
+	&pshufb	($ivec,$inout3);
+
+&set_label("ccm64_enc_outer");
+	&$movekey	($rndkey0,&QWP(0,$key_));
+	&mov		($rounds,$rounds_);
+	&movups		($in0,&QWP(0,$inp));
+
+	&xorps		($inout0,$rndkey0);
+	&$movekey	($rndkey1,&QWP(16,$key_));
+	&xorps		($rndkey0,$in0);
+	&lea		($key,&DWP(32,$key_));
+	&xorps		($cmac,$rndkey0);		# cmac^=inp
+	&$movekey	($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_enc2_loop");
+	&aesenc		($inout0,$rndkey1);
+	&dec		($rounds);
+	&aesenc		($cmac,$rndkey1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&aesenc		($inout0,$rndkey0);
+	&lea		($key,&DWP(32,$key));
+	&aesenc		($cmac,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&jnz		(&label("ccm64_enc2_loop"));
+	&aesenc		($inout0,$rndkey1);
+	&aesenc		($cmac,$rndkey1);
+	&paddq		($ivec,&QWP(16,"esp"));
+	&aesenclast	($inout0,$rndkey0);
+	&aesenclast	($cmac,$rndkey0);
+
+	&dec	($len);
+	&lea	($inp,&DWP(16,$inp));
+	&xorps	($in0,$inout0);			# inp^=E(ivec)
+	&movdqa	($inout0,$ivec);
+	&movups	(&QWP(0,$out),$in0);		# save output
+	&lea	($out,&DWP(16,$out));
+	&pshufb	($inout0,$inout3);
+	&jnz	(&label("ccm64_enc_outer"));
+
+	&mov	("esp",&DWP(48,"esp"));
+	&mov	($out,&wparam(5));
+	&movups	(&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_encrypt_blocks");
+
+&function_begin("aesni_ccm64_decrypt_blocks");
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));
+	&mov	($rounds_,&wparam(4));
+	&mov	($rounds,&wparam(5));
+	&mov	($key_,"esp");
+	&sub	("esp",60);
+	&and	("esp",-16);			# align stack
+	&mov	(&DWP(48,"esp"),$key_);
+
+	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
+	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
+	&mov	($rounds,&DWP(240,$key));
+
+	# compose byte-swap control mask for pshufb on stack
+	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
+	&mov	(&DWP(4,"esp"),0x08090a0b);
+	&mov	(&DWP(8,"esp"),0x04050607);
+	&mov	(&DWP(12,"esp"),0x00010203);
+
+	# compose counter increment vector on stack
+	&mov	($rounds_,1);
+	&xor	($key_,$key_);
+	&mov	(&DWP(16,"esp"),$rounds_);
+	&mov	(&DWP(20,"esp"),$key_);
+	&mov	(&DWP(24,"esp"),$key_);
+	&mov	(&DWP(28,"esp"),$key_);
+
+	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
+	&movdqa	($inout0,$ivec);
+
+	&mov	($key_,$key);
+	&mov	($rounds_,$rounds);
+
+	&pshufb	($ivec,$inout3);
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&movups	($in0,&QWP(0,$inp));		# load inp
+	&paddq	($ivec,&QWP(16,"esp"));
+	&lea	($inp,&QWP(16,$inp));
+	&jmp	(&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_outer",16);
+	&xorps	($in0,$inout0);			# inp ^= E(ivec)
+	&movdqa	($inout0,$ivec);
+	&mov	($rounds,$rounds_);
+	&movups	(&QWP(0,$out),$in0);		# save output
+	&lea	($out,&DWP(16,$out));
+	&pshufb	($inout0,$inout3);
+
+	&sub	($len,1);
+	&jz	(&label("ccm64_dec_break"));
+
+	&$movekey	($rndkey0,&QWP(0,$key_));
+	&shr		($rounds,1);
+	&$movekey	($rndkey1,&QWP(16,$key_));
+	&xorps		($in0,$rndkey0);
+	&lea		($key,&DWP(32,$key_));
+	&xorps		($inout0,$rndkey0);
+	&xorps		($cmac,$in0);		# cmac^=out
+	&$movekey	($rndkey0,&QWP(0,$key));
+
+&set_label("ccm64_dec2_loop");
+	&aesenc		($inout0,$rndkey1);
+	&dec		($rounds);
+	&aesenc		($cmac,$rndkey1);
+	&$movekey	($rndkey1,&QWP(16,$key));
+	&aesenc		($inout0,$rndkey0);
+	&lea		($key,&DWP(32,$key));
+	&aesenc		($cmac,$rndkey0);
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&jnz		(&label("ccm64_dec2_loop"));
+	&movups		($in0,&QWP(0,$inp));	# load inp
+	&paddq		($ivec,&QWP(16,"esp"));
+	&aesenc		($inout0,$rndkey1);
+	&aesenc		($cmac,$rndkey1);
+	&lea		($inp,&QWP(16,$inp));
+	&aesenclast	($inout0,$rndkey0);
+	&aesenclast	($cmac,$rndkey0);
+	&jmp	(&label("ccm64_dec_outer"));
+
+&set_label("ccm64_dec_break",16);
+	&mov	($key,$key_);
+	if ($inline)
+	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
+	else
+	{   &call	("_aesni_encrypt1",$cmac);	}
+
+	&mov	("esp",&DWP(48,"esp"));
+	&mov	($out,&wparam(5));
+	&movups	(&QWP(0,$out),$cmac);
+&function_end("aesni_ccm64_decrypt_blocks");
+}
+
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+# stack layout:
+#	0	pshufb mask
+#	16	vector addend: 0,6,6,6
+# 	32	counter-less ivec
+#	48	1st triplet of counter vector
+#	64	2nd triplet of counter vector
+#	80	saved %esp
+
+&function_begin("aesni_ctr32_encrypt_blocks");
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));
+	&mov	($rounds_,&wparam(4));
+	&mov	($key_,"esp");
+	&sub	("esp",88);
+	&and	("esp",-16);			# align stack
+	&mov	(&DWP(80,"esp"),$key_);
+
+	&cmp	($len,1);
+	&je	(&label("ctr32_one_shortcut"));
+
+	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec
+
+	# compose byte-swap control mask for pshufb on stack
+	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
+	&mov	(&DWP(4,"esp"),0x08090a0b);
+	&mov	(&DWP(8,"esp"),0x04050607);
+	&mov	(&DWP(12,"esp"),0x00010203);
+
+	# compose counter increment vector on stack
+	&mov	($rounds,6);
+	&xor	($key_,$key_);
+	&mov	(&DWP(16,"esp"),$rounds);
+	&mov	(&DWP(20,"esp"),$rounds);
+	&mov	(&DWP(24,"esp"),$rounds);
+	&mov	(&DWP(28,"esp"),$key_);
+
+	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
+	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter
+
+	&mov	($rounds,&DWP(240,$key));	# key->rounds
+
+	# compose 2 vectors of 3x32-bit counters
+	&bswap	($rounds_);
+	&pxor	($rndkey1,$rndkey1);
+	&pxor	($rndkey0,$rndkey0);
+	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
+	&pinsrd	($rndkey1,$rounds_,0);
+	&lea	($key_,&DWP(3,$rounds_));
+	&pinsrd	($rndkey0,$key_,0);
+	&inc	($rounds_);
+	&pinsrd	($rndkey1,$rounds_,1);
+	&inc	($key_);
+	&pinsrd	($rndkey0,$key_,1);
+	&inc	($rounds_);
+	&pinsrd	($rndkey1,$rounds_,2);
+	&inc	($key_);
+	&pinsrd	($rndkey0,$key_,2);
+	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
+	&pshufb	($rndkey1,$inout0);		# byte swap
+	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
+	&pshufb	($rndkey0,$inout0);		# byte swap
+
+	&pshufd	($inout0,$rndkey1,3<<6);	# place counter to upper dword
+	&pshufd	($inout1,$rndkey1,2<<6);
+	&cmp	($len,6);
+	&jb	(&label("ctr32_tail"));
+	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec
+	&shr	($rounds,1);
+	&mov	($key_,$key);			# backup $key
+	&mov	($rounds_,$rounds);		# backup $rounds
+	&sub	($len,6);
+	&jmp	(&label("ctr32_loop6"));
+
+&set_label("ctr32_loop6",16);
+	&pshufd	($inout2,$rndkey1,1<<6);
+	&movdqa	($rndkey1,&QWP(32,"esp"));	# pull counter-less ivec
+	&pshufd	($inout3,$rndkey0,3<<6);
+	&por	($inout0,$rndkey1);		# merge counter-less ivec
+	&pshufd	($inout4,$rndkey0,2<<6);
+	&por	($inout1,$rndkey1);
+	&pshufd	($inout5,$rndkey0,1<<6);
+	&por	($inout2,$rndkey1);
+	&por	($inout3,$rndkey1);
+	&por	($inout4,$rndkey1);
+	&por	($inout5,$rndkey1);
+
+	# inlining _aesni_encrypt6's prologue gives ~4% improvement...
+	&$movekey	($rndkey0,&QWP(0,$key_));
+	&$movekey	($rndkey1,&QWP(16,$key_));
+	&lea		($key,&DWP(32,$key_));
+	&dec		($rounds);
+	&pxor		($inout0,$rndkey0);
+	&pxor		($inout1,$rndkey0);
+	&aesenc		($inout0,$rndkey1);
+	&pxor		($inout2,$rndkey0);
+	&aesenc		($inout1,$rndkey1);
+	&pxor		($inout3,$rndkey0);
+	&aesenc		($inout2,$rndkey1);
+	&pxor		($inout4,$rndkey0);
+	&aesenc		($inout3,$rndkey1);
+	&pxor		($inout5,$rndkey0);
+	&aesenc		($inout4,$rndkey1);
+	&$movekey	($rndkey0,&QWP(0,$key));
+	&aesenc		($inout5,$rndkey1);
+
+	&call		(&label("_aesni_encrypt6_enter"));
+
+	&movups	($rndkey1,&QWP(0,$inp));
+	&movups	($rndkey0,&QWP(0x10,$inp));
+	&xorps	($inout0,$rndkey1);
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout1,$rndkey0);
+	&movups	(&QWP(0,$out),$inout0);
+	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
+	&xorps	($inout2,$rndkey1);
+	&movdqa	($rndkey1,&QWP(48,"esp"));	# load 1st triplet
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+
+	&paddd	($rndkey1,$rndkey0);		# 1st triplet increment
+	&paddd	($rndkey0,&QWP(64,"esp"));	# 2nd triplet increment
+	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask
+
+	&movups	($inout1,&QWP(0x30,$inp));
+	&movups	($inout2,&QWP(0x40,$inp));
+	&xorps	($inout3,$inout1);
+	&movups	($inout1,&QWP(0x50,$inp));
+	&lea	($inp,&DWP(0x60,$inp));
+	&movdqa	(&QWP(48,"esp"),$rndkey1);	# save 1st triplet
+	&pshufb	($rndkey1,$inout0);		# byte swap
+	&xorps	($inout4,$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&xorps	($inout5,$inout1);
+	&movdqa	(&QWP(64,"esp"),$rndkey0);	# save 2nd triplet
+	&pshufb	($rndkey0,$inout0);		# byte swap
+	&movups	(&QWP(0x40,$out),$inout4);
+	&pshufd	($inout0,$rndkey1,3<<6);
+	&movups	(&QWP(0x50,$out),$inout5);
+	&lea	($out,&DWP(0x60,$out));
+
+	&mov	($rounds,$rounds_);
+	&pshufd	($inout1,$rndkey1,2<<6);
+	&sub	($len,6);
+	&jnc	(&label("ctr32_loop6"));
+
+	&add	($len,6);
+	&jz	(&label("ctr32_ret"));
+	&mov	($key,$key_);
+	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
+	&movdqa	($inout5,&QWP(32,"esp"));	# pull count-less ivec
+
+&set_label("ctr32_tail");
+	&por	($inout0,$inout5);
+	&cmp	($len,2);
+	&jb	(&label("ctr32_one"));
+
+	&pshufd	($inout2,$rndkey1,1<<6);
+	&por	($inout1,$inout5);
+	&je	(&label("ctr32_two"));
+
+	&pshufd	($inout3,$rndkey0,3<<6);
+	&por	($inout2,$inout5);
+	&cmp	($len,4);
+	&jb	(&label("ctr32_three"));
+
+	&pshufd	($inout4,$rndkey0,2<<6);
+	&por	($inout3,$inout5);
+	&je	(&label("ctr32_four"));
+
+	&por	($inout4,$inout5);
+	&call	("_aesni_encrypt6");
+	&movups	($rndkey1,&QWP(0,$inp));
+	&movups	($rndkey0,&QWP(0x10,$inp));
+	&xorps	($inout0,$rndkey1);
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout1,$rndkey0);
+	&movups	($rndkey0,&QWP(0x30,$inp));
+	&xorps	($inout2,$rndkey1);
+	&movups	($rndkey1,&QWP(0x40,$inp));
+	&xorps	($inout3,$rndkey0);
+	&movups	(&QWP(0,$out),$inout0);
+	&xorps	($inout4,$rndkey1);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&movups	(&QWP(0x40,$out),$inout4);
+	&jmp	(&label("ctr32_ret"));
+
+&set_label("ctr32_one_shortcut",16);
+	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
+	&mov	($rounds,&DWP(240,$key));
+	
+&set_label("ctr32_one");
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&movups	($in0,&QWP(0,$inp));
+	&xorps	($in0,$inout0);
+	&movups	(&QWP(0,$out),$in0);
+	&jmp	(&label("ctr32_ret"));
+
+&set_label("ctr32_two",16);
+	&call	("_aesni_encrypt3");
+	&movups	($inout3,&QWP(0,$inp));
+	&movups	($inout4,&QWP(0x10,$inp));
+	&xorps	($inout0,$inout3);
+	&xorps	($inout1,$inout4);
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&jmp	(&label("ctr32_ret"));
+
+&set_label("ctr32_three",16);
+	&call	("_aesni_encrypt3");
+	&movups	($inout3,&QWP(0,$inp));
+	&movups	($inout4,&QWP(0x10,$inp));
+	&xorps	($inout0,$inout3);
+	&movups	($inout5,&QWP(0x20,$inp));
+	&xorps	($inout1,$inout4);
+	&movups	(&QWP(0,$out),$inout0);
+	&xorps	($inout2,$inout5);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&jmp	(&label("ctr32_ret"));
+
+&set_label("ctr32_four",16);
+	&call	("_aesni_encrypt4");
+	&movups	($inout4,&QWP(0,$inp));
+	&movups	($inout5,&QWP(0x10,$inp));
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout0,$inout4);
+	&movups	($rndkey0,&QWP(0x30,$inp));
+	&xorps	($inout1,$inout5);
+	&movups	(&QWP(0,$out),$inout0);
+	&xorps	($inout2,$rndkey1);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&xorps	($inout3,$rndkey0);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+
+&set_label("ctr32_ret");
+	&mov	("esp",&DWP(80,"esp"));
+&function_end("aesni_ctr32_encrypt_blocks");
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2
+#	const unsigned char iv[16]);
+#
+{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
+
+&function_begin("aesni_xts_encrypt");
+	&mov	($key,&wparam(4));		# key2
+	&mov	($inp,&wparam(5));		# clear-text tweak
+
+	&mov	($rounds,&DWP(240,$key));	# key2->rounds
+	&movups	($inout0,&QWP(0,$inp));
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));		# key1
+
+	&mov	($key_,"esp");
+	&sub	("esp",16*7+8);
+	&mov	($rounds,&DWP(240,$key));	# key1->rounds
+	&and	("esp",-16);			# align stack
+
+	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
+	&mov	(&DWP(16*6+4,"esp"),0);
+	&mov	(&DWP(16*6+8,"esp"),1);
+	&mov	(&DWP(16*6+12,"esp"),0);
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
+	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
+
+	&movdqa	($tweak,$inout0);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+
+	&and	($len,-16);
+	&mov	($key_,$key);			# backup $key
+	&mov	($rounds_,$rounds);		# backup $rounds
+	&sub	($len,16*6);
+	&jc	(&label("xts_enc_short"));
+
+	&shr	($rounds,1);
+	&mov	($rounds_,$rounds);
+	&jmp	(&label("xts_enc_loop6"));
+
+&set_label("xts_enc_loop6",16);
+	for ($i=0;$i<4;$i++) {
+	    &pshufd	($twres,$twtmp,0x13);
+	    &pxor	($twtmp,$twtmp);
+	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
+	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
+	    &pand	($twres,$twmask);	# isolate carry and residue
+	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
+	    &pxor	($tweak,$twres);
+	}
+	&pshufd	($inout5,$twtmp,0x13);
+	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	 &$movekey	($rndkey0,&QWP(0,$key_));
+	&pand	($inout5,$twmask);		# isolate carry and residue
+	 &movups	($inout0,&QWP(0,$inp));	# load input
+	&pxor	($inout5,$tweak);
+
+	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+	&movdqu	($inout1,&QWP(16*1,$inp));
+	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
+	&movdqu	($inout2,&QWP(16*2,$inp));
+	 &pxor		($inout1,$rndkey0);
+	&movdqu	($inout3,&QWP(16*3,$inp));
+	 &pxor		($inout2,$rndkey0);
+	&movdqu	($inout4,&QWP(16*4,$inp));
+	 &pxor		($inout3,$rndkey0);
+	&movdqu	($rndkey1,&QWP(16*5,$inp));
+	 &pxor		($inout4,$rndkey0);
+	&lea	($inp,&DWP(16*6,$inp));
+	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
+	&pxor	($inout5,$rndkey1);
+
+	 &$movekey	($rndkey1,&QWP(16,$key_));
+	 &lea		($key,&DWP(32,$key_));
+	&pxor	($inout1,&QWP(16*1,"esp"));
+	 &aesenc	($inout0,$rndkey1);
+	&pxor	($inout2,&QWP(16*2,"esp"));
+	 &aesenc	($inout1,$rndkey1);
+	&pxor	($inout3,&QWP(16*3,"esp"));
+	 &dec		($rounds);
+	 &aesenc	($inout2,$rndkey1);
+	&pxor	($inout4,&QWP(16*4,"esp"));
+	 &aesenc	($inout3,$rndkey1);
+	&pxor		($inout5,$rndkey0);
+	 &aesenc	($inout4,$rndkey1);
+	 &$movekey	($rndkey0,&QWP(0,$key));
+	 &aesenc	($inout5,$rndkey1);
+	&call		(&label("_aesni_encrypt6_enter"));
+
+	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
+       &pxor	($twtmp,$twtmp);
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout2,&QWP(16*2,"esp"));
+	&movups	(&QWP(16*1,$out),$inout1);
+	&xorps	($inout3,&QWP(16*3,"esp"));
+	&movups	(&QWP(16*2,$out),$inout2);
+	&xorps	($inout4,&QWP(16*4,"esp"));
+	&movups	(&QWP(16*3,$out),$inout3);
+	&xorps	($inout5,$tweak);
+	&movups	(&QWP(16*4,$out),$inout4);
+       &pshufd	($twres,$twtmp,0x13);
+	&movups	(&QWP(16*5,$out),$inout5);
+	&lea	($out,&DWP(16*6,$out));
+       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
+
+	&pxor	($twtmp,$twtmp);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&mov	($rounds,$rounds_);		# restore $rounds
+	&pxor	($tweak,$twres);
+
+	&sub	($len,16*6);
+	&jnc	(&label("xts_enc_loop6"));
+
+	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
+	&mov	($key,$key_);			# restore $key
+	&mov	($rounds_,$rounds);
+
+&set_label("xts_enc_short");
+	&add	($len,16*6);
+	&jz	(&label("xts_enc_done6x"));
+
+	&movdqa	($inout3,$tweak);		# put aside previous tweak
+	&cmp	($len,0x20);
+	&jb	(&label("xts_enc_one"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&je	(&label("xts_enc_two"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($inout4,$tweak);		# put aside previous tweak
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&cmp	($len,0x40);
+	&jb	(&label("xts_enc_three"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($inout5,$tweak);		# put aside previous tweak
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&movdqa	(&QWP(16*0,"esp"),$inout3);
+	&movdqa	(&QWP(16*1,"esp"),$inout4);
+	&je	(&label("xts_enc_four"));
+
+	&movdqa	(&QWP(16*2,"esp"),$inout5);
+	&pshufd	($inout5,$twtmp,0x13);
+	&movdqa	(&QWP(16*3,"esp"),$tweak);
+	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
+	&pand	($inout5,$twmask);		# isolate carry and residue
+	&pxor	($inout5,$tweak);
+
+	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
+	&movdqu	($inout1,&QWP(16*1,$inp));
+	&movdqu	($inout2,&QWP(16*2,$inp));
+	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movdqu	($inout3,&QWP(16*3,$inp));
+	&pxor	($inout1,&QWP(16*1,"esp"));
+	&movdqu	($inout4,&QWP(16*4,$inp));
+	&pxor	($inout2,&QWP(16*2,"esp"));
+	&lea	($inp,&DWP(16*5,$inp));
+	&pxor	($inout3,&QWP(16*3,"esp"));
+	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
+	&pxor	($inout4,$inout5);
+
+	&call	("_aesni_encrypt6");
+
+	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,&QWP(16*2,"esp"));
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout3,&QWP(16*3,"esp"));
+	&movups	(&QWP(16*1,$out),$inout1);
+	&xorps	($inout4,$tweak);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&movups	(&QWP(16*3,$out),$inout3);
+	&movups	(&QWP(16*4,$out),$inout4);
+	&lea	($out,&DWP(16*5,$out));
+	&jmp	(&label("xts_enc_done"));
+
+&set_label("xts_enc_one",16);
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&lea	($inp,&DWP(16*1,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&lea	($out,&DWP(16*1,$out));
+
+	&movdqa	($tweak,$inout3);		# last tweak
+	&jmp	(&label("xts_enc_done"));
+
+&set_label("xts_enc_two",16);
+	&movaps	($inout4,$tweak);		# put aside last tweak
+
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&lea	($inp,&DWP(16*2,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	&xorps	($inout1,$inout4);
+	&xorps	($inout2,$inout2);
+
+	&call	("_aesni_encrypt3");
+
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&xorps	($inout1,$inout4);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&movups	(&QWP(16*1,$out),$inout1);
+	&lea	($out,&DWP(16*2,$out));
+
+	&movdqa	($tweak,$inout4);		# last tweak
+	&jmp	(&label("xts_enc_done"));
+
+&set_label("xts_enc_three",16);
+	&movaps	($inout5,$tweak);		# put aside last tweak
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&movups	($inout2,&QWP(16*2,$inp));
+	&lea	($inp,&DWP(16*3,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	&xorps	($inout1,$inout4);
+	&xorps	($inout2,$inout5);
+
+	&call	("_aesni_encrypt3");
+
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&xorps	($inout1,$inout4);
+	&xorps	($inout2,$inout5);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&movups	(&QWP(16*1,$out),$inout1);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&lea	($out,&DWP(16*3,$out));
+
+	&movdqa	($tweak,$inout5);		# last tweak
+	&jmp	(&label("xts_enc_done"));
+
+&set_label("xts_enc_four",16);
+	&movaps	($inout4,$tweak);		# put aside last tweak
+
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&movups	($inout2,&QWP(16*2,$inp));
+	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movups	($inout3,&QWP(16*3,$inp));
+	&lea	($inp,&DWP(16*4,$inp));
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,$inout5);
+	&xorps	($inout3,$inout4);
+
+	&call	("_aesni_encrypt4");
+
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,$inout5);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout3,$inout4);
+	&movups	(&QWP(16*1,$out),$inout1);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&movups	(&QWP(16*3,$out),$inout3);
+	&lea	($out,&DWP(16*4,$out));
+
+	&movdqa	($tweak,$inout4);		# last tweak
+	&jmp	(&label("xts_enc_done"));
+
+&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
+	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
+	&and	($len,15);
+	&jz	(&label("xts_enc_ret"));
+	&movdqa	($inout3,$tweak);
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
+	&jmp	(&label("xts_enc_steal"));
+
+&set_label("xts_enc_done",16);
+	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
+	&pxor	($twtmp,$twtmp);
+	&and	($len,15);
+	&jz	(&label("xts_enc_ret"));
+
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
+	&pshufd	($inout3,$twtmp,0x13);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
+	&pxor	($inout3,$tweak);
+
+&set_label("xts_enc_steal");
+	&movz	($rounds,&BP(0,$inp));
+	&movz	($key,&BP(-16,$out));
+	&lea	($inp,&DWP(1,$inp));
+	&mov	(&BP(-16,$out),&LB($rounds));
+	&mov	(&BP(0,$out),&LB($key));
+	&lea	($out,&DWP(1,$out));
+	&sub	($len,1);
+	&jnz	(&label("xts_enc_steal"));
+
+	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
+	&mov	($key,$key_);			# restore $key
+	&mov	($rounds,$rounds_);		# restore $rounds
+
+	&movups	($inout0,&QWP(-16,$out));	# load input
+	&xorps	($inout0,$inout3);		# input^=tweak
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&movups	(&QWP(-16,$out),$inout0);	# write output
+
+&set_label("xts_enc_ret");
+	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
+&function_end("aesni_xts_encrypt");
+
+&function_begin("aesni_xts_decrypt");
+	&mov	($key,&wparam(4));		# key2
+	&mov	($inp,&wparam(5));		# clear-text tweak
+
+	&mov	($rounds,&DWP(240,$key));	# key2->rounds
+	&movups	($inout0,&QWP(0,$inp));
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));		# key1
+
+	&mov	($key_,"esp");
+	&sub	("esp",16*7+8);
+	&and	("esp",-16);			# align stack
+
+	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
+	&test	($len,15);
+	&setnz	(&LB($rounds_));
+	&shl	($rounds_,4);
+	&sub	($len,$rounds_);
+
+	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
+	&mov	(&DWP(16*6+4,"esp"),0);
+	&mov	(&DWP(16*6+8,"esp"),1);
+	&mov	(&DWP(16*6+12,"esp"),0);
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
+	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp
+
+	&mov	($rounds,&DWP(240,$key));	# key1->rounds
+	&mov	($key_,$key);			# backup $key
+	&mov	($rounds_,$rounds);		# backup $rounds
+
+	&movdqa	($tweak,$inout0);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+
+	&and	($len,-16);
+	&sub	($len,16*6);
+	&jc	(&label("xts_dec_short"));
+
+	&shr	($rounds,1);
+	&mov	($rounds_,$rounds);
+	&jmp	(&label("xts_dec_loop6"));
+
+&set_label("xts_dec_loop6",16);
+	for ($i=0;$i<4;$i++) {
+	    &pshufd	($twres,$twtmp,0x13);
+	    &pxor	($twtmp,$twtmp);
+	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
+	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
+	    &pand	($twres,$twmask);	# isolate carry and residue
+	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
+	    &pxor	($tweak,$twres);
+	}
+	&pshufd	($inout5,$twtmp,0x13);
+	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	 &$movekey	($rndkey0,&QWP(0,$key_));
+	&pand	($inout5,$twmask);		# isolate carry and residue
+	 &movups	($inout0,&QWP(0,$inp));	# load input
+	&pxor	($inout5,$tweak);
+
+	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
+	&movdqu	($inout1,&QWP(16*1,$inp));
+	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
+	&movdqu	($inout2,&QWP(16*2,$inp));
+	 &pxor		($inout1,$rndkey0);
+	&movdqu	($inout3,&QWP(16*3,$inp));
+	 &pxor		($inout2,$rndkey0);
+	&movdqu	($inout4,&QWP(16*4,$inp));
+	 &pxor		($inout3,$rndkey0);
+	&movdqu	($rndkey1,&QWP(16*5,$inp));
+	 &pxor		($inout4,$rndkey0);
+	&lea	($inp,&DWP(16*6,$inp));
+	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
+	&pxor	($inout5,$rndkey1);
+
+	 &$movekey	($rndkey1,&QWP(16,$key_));
+	 &lea		($key,&DWP(32,$key_));
+	&pxor	($inout1,&QWP(16*1,"esp"));
+	 &aesdec	($inout0,$rndkey1);
+	&pxor	($inout2,&QWP(16*2,"esp"));
+	 &aesdec	($inout1,$rndkey1);
+	&pxor	($inout3,&QWP(16*3,"esp"));
+	 &dec		($rounds);
+	 &aesdec	($inout2,$rndkey1);
+	&pxor	($inout4,&QWP(16*4,"esp"));
+	 &aesdec	($inout3,$rndkey1);
+	&pxor		($inout5,$rndkey0);
+	 &aesdec	($inout4,$rndkey1);
+	 &$movekey	($rndkey0,&QWP(0,$key));
+	 &aesdec	($inout5,$rndkey1);
+	&call		(&label("_aesni_decrypt6_enter"));
+
+	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
+       &pxor	($twtmp,$twtmp);
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout2,&QWP(16*2,"esp"));
+	&movups	(&QWP(16*1,$out),$inout1);
+	&xorps	($inout3,&QWP(16*3,"esp"));
+	&movups	(&QWP(16*2,$out),$inout2);
+	&xorps	($inout4,&QWP(16*4,"esp"));
+	&movups	(&QWP(16*3,$out),$inout3);
+	&xorps	($inout5,$tweak);
+	&movups	(&QWP(16*4,$out),$inout4);
+       &pshufd	($twres,$twtmp,0x13);
+	&movups	(&QWP(16*5,$out),$inout5);
+	&lea	($out,&DWP(16*6,$out));
+       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87
+
+	&pxor	($twtmp,$twtmp);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&mov	($rounds,$rounds_);		# restore $rounds
+	&pxor	($tweak,$twres);
+
+	&sub	($len,16*6);
+	&jnc	(&label("xts_dec_loop6"));
+
+	&lea	($rounds,&DWP(1,"",$rounds,2));	# restore $rounds
+	&mov	($key,$key_);			# restore $key
+	&mov	($rounds_,$rounds);
+
+&set_label("xts_dec_short");
+	&add	($len,16*6);
+	&jz	(&label("xts_dec_done6x"));
+
+	&movdqa	($inout3,$tweak);		# put aside previous tweak
+	&cmp	($len,0x20);
+	&jb	(&label("xts_dec_one"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&je	(&label("xts_dec_two"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($inout4,$tweak);		# put aside previous tweak
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&cmp	($len,0x40);
+	&jb	(&label("xts_dec_three"));
+
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($inout5,$tweak);		# put aside previous tweak
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+	&movdqa	(&QWP(16*0,"esp"),$inout3);
+	&movdqa	(&QWP(16*1,"esp"),$inout4);
+	&je	(&label("xts_dec_four"));
+
+	&movdqa	(&QWP(16*2,"esp"),$inout5);
+	&pshufd	($inout5,$twtmp,0x13);
+	&movdqa	(&QWP(16*3,"esp"),$tweak);
+	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
+	&pand	($inout5,$twmask);		# isolate carry and residue
+	&pxor	($inout5,$tweak);
+
+	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
+	&movdqu	($inout1,&QWP(16*1,$inp));
+	&movdqu	($inout2,&QWP(16*2,$inp));
+	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movdqu	($inout3,&QWP(16*3,$inp));
+	&pxor	($inout1,&QWP(16*1,"esp"));
+	&movdqu	($inout4,&QWP(16*4,$inp));
+	&pxor	($inout2,&QWP(16*2,"esp"));
+	&lea	($inp,&DWP(16*5,$inp));
+	&pxor	($inout3,&QWP(16*3,"esp"));
+	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
+	&pxor	($inout4,$inout5);
+
+	&call	("_aesni_decrypt6");
+
+	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,&QWP(16*2,"esp"));
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout3,&QWP(16*3,"esp"));
+	&movups	(&QWP(16*1,$out),$inout1);
+	&xorps	($inout4,$tweak);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&movups	(&QWP(16*3,$out),$inout3);
+	&movups	(&QWP(16*4,$out),$inout4);
+	&lea	($out,&DWP(16*5,$out));
+	&jmp	(&label("xts_dec_done"));
+
+&set_label("xts_dec_one",16);
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&lea	($inp,&DWP(16*1,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&lea	($out,&DWP(16*1,$out));
+
+	&movdqa	($tweak,$inout3);		# last tweak
+	&jmp	(&label("xts_dec_done"));
+
+&set_label("xts_dec_two",16);
+	&movaps	($inout4,$tweak);		# put aside last tweak
+
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&lea	($inp,&DWP(16*2,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	&xorps	($inout1,$inout4);
+
+	&call	("_aesni_decrypt3");
+
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&xorps	($inout1,$inout4);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&movups	(&QWP(16*1,$out),$inout1);
+	&lea	($out,&DWP(16*2,$out));
+
+	&movdqa	($tweak,$inout4);		# last tweak
+	&jmp	(&label("xts_dec_done"));
+
+&set_label("xts_dec_three",16);
+	&movaps	($inout5,$tweak);		# put aside last tweak
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&movups	($inout2,&QWP(16*2,$inp));
+	&lea	($inp,&DWP(16*3,$inp));
+	&xorps	($inout0,$inout3);		# input^=tweak
+	&xorps	($inout1,$inout4);
+	&xorps	($inout2,$inout5);
+
+	&call	("_aesni_decrypt3");
+
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&xorps	($inout1,$inout4);
+	&xorps	($inout2,$inout5);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&movups	(&QWP(16*1,$out),$inout1);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&lea	($out,&DWP(16*3,$out));
+
+	&movdqa	($tweak,$inout5);		# last tweak
+	&jmp	(&label("xts_dec_done"));
+
+&set_label("xts_dec_four",16);
+	&movaps	($inout4,$tweak);		# put aside last tweak
+
+	&movups	($inout0,&QWP(16*0,$inp));	# load input
+	&movups	($inout1,&QWP(16*1,$inp));
+	&movups	($inout2,&QWP(16*2,$inp));
+	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
+	&movups	($inout3,&QWP(16*3,$inp));
+	&lea	($inp,&DWP(16*4,$inp));
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,$inout5);
+	&xorps	($inout3,$inout4);
+
+	&call	("_aesni_decrypt4");
+
+	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
+	&xorps	($inout1,&QWP(16*1,"esp"));
+	&xorps	($inout2,$inout5);
+	&movups	(&QWP(16*0,$out),$inout0);	# write output
+	&xorps	($inout3,$inout4);
+	&movups	(&QWP(16*1,$out),$inout1);
+	&movups	(&QWP(16*2,$out),$inout2);
+	&movups	(&QWP(16*3,$out),$inout3);
+	&lea	($out,&DWP(16*4,$out));
+
+	&movdqa	($tweak,$inout4);		# last tweak
+	&jmp	(&label("xts_dec_done"));
+
+&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
+	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
+	&and	($len,15);
+	&jz	(&label("xts_dec_ret"));
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
+	&jmp	(&label("xts_dec_only_one_more"));
+
+&set_label("xts_dec_done",16);
+	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
+	&pxor	($twtmp,$twtmp);
+	&and	($len,15);
+	&jz	(&label("xts_dec_ret"));
+
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
+	&pshufd	($twres,$twtmp,0x13);
+	&pxor	($twtmp,$twtmp);
+	&movdqa	($twmask,&QWP(16*6,"esp"));
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($twres,$twmask);		# isolate carry and residue
+	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
+	&pxor	($tweak,$twres);
+
+&set_label("xts_dec_only_one_more");
+	&pshufd	($inout3,$twtmp,0x13);
+	&movdqa	($inout4,$tweak);		# put aside previous tweak
+	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
+	&pand	($inout3,$twmask);		# isolate carry and residue
+	&pxor	($inout3,$tweak);
+
+	&mov	($key,$key_);			# restore $key
+	&mov	($rounds,$rounds_);		# restore $rounds
+
+	&movups	($inout0,&QWP(0,$inp));		# load input
+	&xorps	($inout0,$inout3);		# input^=tweak
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&xorps	($inout0,$inout3);		# output^=tweak
+	&movups	(&QWP(0,$out),$inout0);		# write output
+
+&set_label("xts_dec_steal");
+	&movz	($rounds,&BP(16,$inp));
+	&movz	($key,&BP(0,$out));
+	&lea	($inp,&DWP(1,$inp));
+	&mov	(&BP(0,$out),&LB($rounds));
+	&mov	(&BP(16,$out),&LB($key));
+	&lea	($out,&DWP(1,$out));
+	&sub	($len,1);
+	&jnz	(&label("xts_dec_steal"));
+
+	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
+	&mov	($key,$key_);			# restore $key
+	&mov	($rounds,$rounds_);		# restore $rounds
+
+	&movups	($inout0,&QWP(0,$out));		# load input
+	&xorps	($inout0,$inout4);		# input^=tweak
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&xorps	($inout0,$inout4);		# output^=tweak
+	&movups	(&QWP(0,$out),$inout0);		# write output
+
+&set_label("xts_dec_ret");
+	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
+&function_end("aesni_xts_decrypt");
+}
+}
+
+######################################################################
+# void $PREFIX_cbc_encrypt (const void *inp, void *out,
+#                           size_t length, const AES_KEY *key,
+#                           unsigned char *ivp,const int enc);
+&function_begin("${PREFIX}_cbc_encrypt");
+	&mov	($inp,&wparam(0));
+	&mov	($rounds_,"esp");
+	&mov	($out,&wparam(1));
+	&sub	($rounds_,24);
+	&mov	($len,&wparam(2));
+	&and	($rounds_,-16);
+	&mov	($key,&wparam(3));
+	&mov	($key_,&wparam(4));
+	&test	($len,$len);
+	&jz	(&label("cbc_abort"));
+
+	&cmp	(&wparam(5),0);
+	&xchg	($rounds_,"esp");		# alloca
+	&movups	($ivec,&QWP(0,$key_));		# load IV
+	&mov	($rounds,&DWP(240,$key));
+	&mov	($key_,$key);			# backup $key
+	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
+	&mov	($rounds_,$rounds);		# backup $rounds
+	&je	(&label("cbc_decrypt"));
+
+	&movaps	($inout0,$ivec);
+	&cmp	($len,16);
+	&jb	(&label("cbc_enc_tail"));
+	&sub	($len,16);
+	&jmp	(&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+	&movups	($ivec,&QWP(0,$inp));		# input actually
+	&lea	($inp,&DWP(16,$inp));
+	if ($inline)
+	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
+	else
+	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
+	&mov	($rounds,$rounds_);	# restore $rounds
+	&mov	($key,$key_);		# restore $key
+	&movups	(&QWP(0,$out),$inout0);	# store output
+	&lea	($out,&DWP(16,$out));
+	&sub	($len,16);
+	&jnc	(&label("cbc_enc_loop"));
+	&add	($len,16);
+	&jnz	(&label("cbc_enc_tail"));
+	&movaps	($ivec,$inout0);
+	&jmp	(&label("cbc_ret"));
+
+&set_label("cbc_enc_tail");
+	&mov	("ecx",$len);		# zaps $rounds
+	&data_word(0xA4F3F689);		# rep movsb
+	&mov	("ecx",16);		# zero tail
+	&sub	("ecx",$len);
+	&xor	("eax","eax");		# zaps $len
+	&data_word(0xAAF3F689);		# rep stosb
+	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
+	&mov	($rounds,$rounds_);	# restore $rounds
+	&mov	($inp,$out);		# $inp and $out are the same
+	&mov	($key,$key_);		# restore $key
+	&jmp	(&label("cbc_enc_loop"));
+######################################################################
+&set_label("cbc_decrypt",16);
+	&cmp	($len,0x50);
+	&jbe	(&label("cbc_dec_tail"));
+	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
+	&sub	($len,0x50);
+	&jmp	(&label("cbc_dec_loop6_enter"));
+
+&set_label("cbc_dec_loop6",16);
+	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
+	&movups	(&QWP(0,$out),$inout5);
+	&lea	($out,&DWP(0x10,$out));
+&set_label("cbc_dec_loop6_enter");
+	&movdqu	($inout0,&QWP(0,$inp));
+	&movdqu	($inout1,&QWP(0x10,$inp));
+	&movdqu	($inout2,&QWP(0x20,$inp));
+	&movdqu	($inout3,&QWP(0x30,$inp));
+	&movdqu	($inout4,&QWP(0x40,$inp));
+	&movdqu	($inout5,&QWP(0x50,$inp));
+
+	&call	("_aesni_decrypt6");
+
+	&movups	($rndkey1,&QWP(0,$inp));
+	&movups	($rndkey0,&QWP(0x10,$inp));
+	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
+	&xorps	($inout1,$rndkey1);
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout2,$rndkey0);
+	&movups	($rndkey0,&QWP(0x30,$inp));
+	&xorps	($inout3,$rndkey1);
+	&movups	($rndkey1,&QWP(0x40,$inp));
+	&xorps	($inout4,$rndkey0);
+	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
+	&xorps	($inout5,$rndkey1);
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&lea	($inp,&DWP(0x60,$inp));
+	&movups	(&QWP(0x20,$out),$inout2);
+	&mov	($rounds,$rounds_)		# restore $rounds
+	&movups	(&QWP(0x30,$out),$inout3);
+	&mov	($key,$key_);			# restore $key
+	&movups	(&QWP(0x40,$out),$inout4);
+	&lea	($out,&DWP(0x50,$out));
+	&sub	($len,0x60);
+	&ja	(&label("cbc_dec_loop6"));
+
+	&movaps	($inout0,$inout5);
+	&movaps	($ivec,$rndkey0);
+	&add	($len,0x50);
+	&jle	(&label("cbc_dec_tail_collected"));
+	&movups	(&QWP(0,$out),$inout0);
+	&lea	($out,&DWP(0x10,$out));
+&set_label("cbc_dec_tail");
+	&movups	($inout0,&QWP(0,$inp));
+	&movaps	($in0,$inout0);
+	&cmp	($len,0x10);
+	&jbe	(&label("cbc_dec_one"));
+
+	&movups	($inout1,&QWP(0x10,$inp));
+	&movaps	($in1,$inout1);
+	&cmp	($len,0x20);
+	&jbe	(&label("cbc_dec_two"));
+
+	&movups	($inout2,&QWP(0x20,$inp));
+	&cmp	($len,0x30);
+	&jbe	(&label("cbc_dec_three"));
+
+	&movups	($inout3,&QWP(0x30,$inp));
+	&cmp	($len,0x40);
+	&jbe	(&label("cbc_dec_four"));
+
+	&movups	($inout4,&QWP(0x40,$inp));
+	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
+	&movups	($inout0,&QWP(0,$inp));
+	&xorps	($inout5,$inout5);
+	&call	("_aesni_decrypt6");
+	&movups	($rndkey1,&QWP(0,$inp));
+	&movups	($rndkey0,&QWP(0x10,$inp));
+	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
+	&xorps	($inout1,$rndkey1);
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&xorps	($inout2,$rndkey0);
+	&movups	($rndkey0,&QWP(0x30,$inp));
+	&xorps	($inout3,$rndkey1);
+	&movups	($ivec,&QWP(0x40,$inp));	# IV
+	&xorps	($inout4,$rndkey0);
+	&movups	(&QWP(0,$out),$inout0);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&movups	(&QWP(0x30,$out),$inout3);
+	&lea	($out,&DWP(0x40,$out));
+	&movaps	($inout0,$inout4);
+	&sub	($len,0x50);
+	&jmp	(&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_one",16);
+	if ($inline)
+	{   &aesni_inline_generate1("dec");	}
+	else
+	{   &call	("_aesni_decrypt1");	}
+	&xorps	($inout0,$ivec);
+	&movaps	($ivec,$in0);
+	&sub	($len,0x10);
+	&jmp	(&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_two",16);
+	&xorps	($inout2,$inout2);
+	&call	("_aesni_decrypt3");
+	&xorps	($inout0,$ivec);
+	&xorps	($inout1,$in0);
+	&movups	(&QWP(0,$out),$inout0);
+	&movaps	($inout0,$inout1);
+	&lea	($out,&DWP(0x10,$out));
+	&movaps	($ivec,$in1);
+	&sub	($len,0x20);
+	&jmp	(&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_three",16);
+	&call	("_aesni_decrypt3");
+	&xorps	($inout0,$ivec);
+	&xorps	($inout1,$in0);
+	&xorps	($inout2,$in1);
+	&movups	(&QWP(0,$out),$inout0);
+	&movaps	($inout0,$inout2);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&lea	($out,&DWP(0x20,$out));
+	&movups	($ivec,&QWP(0x20,$inp));
+	&sub	($len,0x30);
+	&jmp	(&label("cbc_dec_tail_collected"));
+
+&set_label("cbc_dec_four",16);
+	&call	("_aesni_decrypt4");
+	&movups	($rndkey1,&QWP(0x10,$inp));
+	&movups	($rndkey0,&QWP(0x20,$inp));
+	&xorps	($inout0,$ivec);
+	&movups	($ivec,&QWP(0x30,$inp));
+	&xorps	($inout1,$in0);
+	&movups	(&QWP(0,$out),$inout0);
+	&xorps	($inout2,$rndkey1);
+	&movups	(&QWP(0x10,$out),$inout1);
+	&xorps	($inout3,$rndkey0);
+	&movups	(&QWP(0x20,$out),$inout2);
+	&lea	($out,&DWP(0x30,$out));
+	&movaps	($inout0,$inout3);
+	&sub	($len,0x40);
+
+&set_label("cbc_dec_tail_collected");
+	&and	($len,15);
+	&jnz	(&label("cbc_dec_tail_partial"));
+	&movups	(&QWP(0,$out),$inout0);
+	&jmp	(&label("cbc_ret"));
+
+&set_label("cbc_dec_tail_partial",16);
+	&movaps	(&QWP(0,"esp"),$inout0);
+	&mov	("ecx",16);
+	&mov	($inp,"esp");
+	&sub	("ecx",$len);
+	&data_word(0xA4F3F689);		# rep movsb
+
+&set_label("cbc_ret");
+	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
+	&mov	($key_,&wparam(4));
+	&movups	(&QWP(0,$key_),$ivec);	# output IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+######################################################################
+# Mechanical port from aesni-x86_64.pl.
+#
+# _aesni_set_encrypt_key is private interface,
+# input:
+#	"eax"	const unsigned char *userKey
+#	$rounds	int bits
+#	$key	AES_KEY *key
+# output:
+#	"eax"	return code
+#	$round	rounds
+
+&function_begin_B("_aesni_set_encrypt_key");
+	&test	("eax","eax");
+	&jz	(&label("bad_pointer"));
+	&test	($key,$key);
+	&jz	(&label("bad_pointer"));
+
+	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
+	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
+	&lea	($key,&DWP(16,$key));
+	&cmp	($rounds,256);
+	&je	(&label("14rounds"));
+	&cmp	($rounds,192);
+	&je	(&label("12rounds"));
+	&cmp	($rounds,128);
+	&jne	(&label("bad_keybits"));
+
+&set_label("10rounds",16);
+	&mov		($rounds,9);
+	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
+	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
+	&call		(&label("key_128_cold"));
+	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
+	&call		(&label("key_128"));
+	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
+	&call		(&label("key_128"));
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&mov		(&DWP(80,$key),$rounds);
+	&xor		("eax","eax");
+	&ret();
+
+&set_label("key_128",16);
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&lea		($key,&DWP(16,$key));
+&set_label("key_128_cold");
+	&shufps		("xmm4","xmm0",0b00010000);
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm4","xmm0",0b10001100);
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm1","xmm1",0b11111111);	# critical path
+	&xorps		("xmm0","xmm1");
+	&ret();
+
+&set_label("12rounds",16);
+	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
+	&mov		($rounds,11);
+	&$movekey	(&QWP(-16,$key),"xmm0")		# round 0
+	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
+	&call		(&label("key_192a_cold"));
+	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
+	&call		(&label("key_192b"));
+	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
+	&call		(&label("key_192a"));
+	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
+	&call		(&label("key_192b"));
+	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
+	&call		(&label("key_192a"));
+	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
+	&call		(&label("key_192b"));
+	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
+	&call		(&label("key_192a"));
+	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
+	&call		(&label("key_192b"));
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&mov		(&DWP(48,$key),$rounds);
+	&xor		("eax","eax");
+	&ret();
+
+&set_label("key_192a",16);
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&lea		($key,&DWP(16,$key));
+&set_label("key_192a_cold",16);
+	&movaps		("xmm5","xmm2");
+&set_label("key_192b_warm");
+	&shufps		("xmm4","xmm0",0b00010000);
+	&movdqa		("xmm3","xmm2");
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm4","xmm0",0b10001100);
+	&pslldq		("xmm3",4);
+	&xorps		("xmm0","xmm4");
+	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
+	&pxor		("xmm2","xmm3");
+	&pxor		("xmm0","xmm1");
+	&pshufd		("xmm3","xmm0",0b11111111);
+	&pxor		("xmm2","xmm3");
+	&ret();
+
+&set_label("key_192b",16);
+	&movaps		("xmm3","xmm0");
+	&shufps		("xmm5","xmm0",0b01000100);
+	&$movekey	(&QWP(0,$key),"xmm5");
+	&shufps		("xmm3","xmm2",0b01001110);
+	&$movekey	(&QWP(16,$key),"xmm3");
+	&lea		($key,&DWP(32,$key));
+	&jmp		(&label("key_192b_warm"));
+
+&set_label("14rounds",16);
+	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
+	&mov		($rounds,13);
+	&lea		($key,&DWP(16,$key));
+	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
+	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
+	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
+	&call		(&label("key_256a_cold"));
+	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
+	&call		(&label("key_256a"));
+	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
+	&call		(&label("key_256b"));
+	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
+	&call		(&label("key_256a"));
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&mov		(&DWP(16,$key),$rounds);
+	&xor		("eax","eax");
+	&ret();
+
+&set_label("key_256a",16);
+	&$movekey	(&QWP(0,$key),"xmm2");
+	&lea		($key,&DWP(16,$key));
+&set_label("key_256a_cold");
+	&shufps		("xmm4","xmm0",0b00010000);
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm4","xmm0",0b10001100);
+	&xorps		("xmm0","xmm4");
+	&shufps		("xmm1","xmm1",0b11111111);	# critical path
+	&xorps		("xmm0","xmm1");
+	&ret();
+
+&set_label("key_256b",16);
+	&$movekey	(&QWP(0,$key),"xmm0");
+	&lea		($key,&DWP(16,$key));
+
+	&shufps		("xmm4","xmm2",0b00010000);
+	&xorps		("xmm2","xmm4");
+	&shufps		("xmm4","xmm2",0b10001100);
+	&xorps		("xmm2","xmm4");
+	&shufps		("xmm1","xmm1",0b10101010);	# critical path
+	&xorps		("xmm2","xmm1");
+	&ret();
+
+&set_label("bad_pointer",4);
+	&mov	("eax",-1);
+	&ret	();
+&set_label("bad_keybits",4);
+	&mov	("eax",-2);
+	&ret	();
+&function_end_B("_aesni_set_encrypt_key");
+
+# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
+#                              AES_KEY *key)
+&function_begin_B("${PREFIX}_set_encrypt_key");
+	&mov	("eax",&wparam(0));
+	&mov	($rounds,&wparam(1));
+	&mov	($key,&wparam(2));
+	&call	("_aesni_set_encrypt_key");
+	&ret	();
+&function_end_B("${PREFIX}_set_encrypt_key");
+
+# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
+#                              AES_KEY *key)
+&function_begin_B("${PREFIX}_set_decrypt_key");
+	&mov	("eax",&wparam(0));
+	&mov	($rounds,&wparam(1));
+	&mov	($key,&wparam(2));
+	&call	("_aesni_set_encrypt_key");
+	&mov	($key,&wparam(2));
+	&shl	($rounds,4)	# rounds-1 after _aesni_set_encrypt_key
+	&test	("eax","eax");
+	&jnz	(&label("dec_key_ret"));
+	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule
+
+	&$movekey	("xmm0",&QWP(0,$key));	# just swap
+	&$movekey	("xmm1",&QWP(0,"eax"));
+	&$movekey	(&QWP(0,"eax"),"xmm0");
+	&$movekey	(&QWP(0,$key),"xmm1");
+	&lea		($key,&DWP(16,$key));
+	&lea		("eax",&DWP(-16,"eax"));
+
+&set_label("dec_key_inverse");
+	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
+	&$movekey	("xmm1",&QWP(0,"eax"));
+	&aesimc		("xmm0","xmm0");
+	&aesimc		("xmm1","xmm1");
+	&lea		($key,&DWP(16,$key));
+	&lea		("eax",&DWP(-16,"eax"));
+	&$movekey	(&QWP(16,"eax"),"xmm0");
+	&$movekey	(&QWP(-16,$key),"xmm1");
+	&cmp		("eax",$key);
+	&ja		(&label("dec_key_inverse"));
+
+	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
+	&aesimc		("xmm0","xmm0");
+	&$movekey	(&QWP(0,$key),"xmm0");
+
+	&xor		("eax","eax");		# return success
+&set_label("dec_key_ret");
+	&ret	();
+&function_end_B("${PREFIX}_set_decrypt_key");
+&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
index 49e0f4b351..499f3b3f42 100644
--- a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
+++ b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl
@@ -11,6 +11,151 @@
 # OpenSSL context it's used with Intel engine, but can also be used as
 # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
 # details].
+#
+# Performance.
+#
+# Given aes(enc|dec) instructions' latency asymptotic performance for
+# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
+# processed with 128-bit key. And given their throughput asymptotic
+# performance for parallelizable modes is 1.25 cycles per byte. Being
+# asymptotic limit it's not something you commonly achieve in reality,
+# but how close does one get? Below are results collected for
+# different modes and block sized. Pairs of numbers are for en-/
+# decryption.
+#
+#	16-byte     64-byte     256-byte    1-KB        8-KB
+# ECB	4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26	1.26/1.26
+# CTR	5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
+# CBC	4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
+# CCM	5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07   
+# OFB	5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
+# CFB	5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
+#
+# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
+# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
+# The results were collected with specially crafted speed.c benchmark
+# in order to compare them with results reported in "Intel Advanced
+# Encryption Standard (AES) New Instruction Set" White Paper Revision
+# 3.0 dated May 2010. All above results are consistently better. This
+# module also provides better performance for block sizes smaller than
+# 128 bytes in points *not* represented in the above table.
+#
+# Looking at the results for 8-KB buffer.
+#
+# CFB and OFB results are far from the limit, because implementation
+# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
+# single-block aesni_encrypt, which is not the most optimal way to go.
+# CBC encrypt result is unexpectedly high and there is no documented
+# explanation for it. Seemingly there is a small penalty for feeding
+# the result back to AES unit the way it's done in CBC mode. There is
+# nothing one can do and the result appears optimal. CCM result is
+# identical to CBC, because CBC-MAC is essentially CBC encrypt without
+# saving output. CCM CTR "stays invisible," because it's neatly
+# interleaved wih CBC-MAC. This provides ~30% improvement over
+# "straghtforward" CCM implementation with CTR and CBC-MAC performed
+# disjointly. Parallelizable modes practically achieve the theoretical
+# limit.
+#
+# Looking at how results vary with buffer size.
+#
+# Curves are practically saturated at 1-KB buffer size. In most cases
+# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
+# CTR curve doesn't follow this pattern and is "slowest" changing one
+# with "256-byte" result being 87% of "8-KB." This is because overhead
+# in CTR mode is most computationally intensive. Small-block CCM
+# decrypt is slower than encrypt, because first CTR and last CBC-MAC
+# iterations can't be interleaved.
+#
+# Results for 192- and 256-bit keys.
+#
+# EVP-free results were observed to scale perfectly with number of
+# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
+# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
+# are a tad smaller, because the above mentioned penalty biases all
+# results by same constant value. In similar way function call
+# overhead affects small-block performance, as well as OFB and CFB
+# results. Differences are not large, most common coefficients are
+# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
+# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
+
+# January 2011
+#
+# While Westmere processor features 6 cycles latency for aes[enc|dec]
+# instructions, which can be scheduled every second cycle, Sandy
+# Bridge spends 8 cycles per instruction, but it can schedule them
+# every cycle. This means that code targeting Westmere would perform
+# suboptimally on Sandy Bridge. Therefore this update.
+#
+# In addition, non-parallelizable CBC encrypt (as well as CCM) is
+# optimized. Relative improvement might appear modest, 8% on Westmere,
+# but in absolute terms it's 3.77 cycles per byte encrypted with
+# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
+# should be compared to asymptotic limits of 3.75 for Westmere and
+# 5.00 for Sandy Bridge. Actually, the fact that they get this close
+# to asymptotic limits is quite amazing. Indeed, the limit is
+# calculated as latency times number of rounds, 10 for 128-bit key,
+# and divided by 16, the number of bytes in block, or in other words
+# it accounts *solely* for aesenc instructions. But there are extra
+# instructions, and numbers so close to the asymptotic limits mean
+# that it's as if it takes as little as *one* additional cycle to
+# execute all of them. How is it possible? It is possible thanks to
+# out-of-order execution logic, which manages to overlap post-
+# processing of previous block, things like saving the output, with
+# actual encryption of current block, as well as pre-processing of
+# current block, things like fetching input and xor-ing it with
+# 0-round element of the key schedule, with actual encryption of
+# previous block. Keep this in mind...
+#
+# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
+# performance is achieved by interleaving instructions working on
+# independent blocks. In which case asymptotic limit for such modes
+# can be obtained by dividing above mentioned numbers by AES
+# instructions' interleave factor. Westmere can execute at most 3 
+# instructions at a time, meaning that optimal interleave factor is 3,
+# and that's where the "magic" number of 1.25 come from. "Optimal
+# interleave factor" means that increase of interleave factor does
+# not improve performance. The formula has proven to reflect reality
+# pretty well on Westmere... Sandy Bridge on the other hand can
+# execute up to 8 AES instructions at a time, so how does varying
+# interleave factor affect the performance? Here is table for ECB
+# (numbers are cycles per byte processed with 128-bit key):
+#
+# instruction interleave factor		3x	6x	8x
+# theoretical asymptotic limit		1.67	0.83	0.625
+# measured performance for 8KB block	1.05	0.86	0.84
+#
+# "as if" interleave factor		4.7x	5.8x	6.0x
+#
+# Further data for other parallelizable modes:
+#
+# CBC decrypt				1.16	0.93	0.93
+# CTR					1.14	0.91	n/a
+#
+# Well, given 3x column it's probably inappropriate to call the limit
+# asymptotic, if it can be surpassed, isn't it? What happens there?
+# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
+# magic is responsible for this. Processor overlaps not only the
+# additional instructions with AES ones, but even AES instuctions
+# processing adjacent triplets of independent blocks. In the 6x case
+# additional instructions  still claim disproportionally small amount
+# of additional cycles, but in 8x case number of instructions must be
+# a tad too high for out-of-order logic to cope with, and AES unit
+# remains underutilized... As you can see 8x interleave is hardly
+# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
+# utilizies 6x interleave because of limited register bank capacity.
+#
+# Higher interleave factors do have negative impact on Westmere
+# performance. While for ECB mode it's negligible ~1.5%, other
+# parallelizables perform ~5% worse, which is outweighed by ~25%
+# improvement on Sandy Bridge. To balance regression on Westmere
+# CTR mode was implemented with 6x aesenc interleave factor.
+
+# April 2011
+#
+# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
+# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
+# in CTR mode AES instruction interleave factor was chosen to be 6x.
 
 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
 			# generates drop-in replacement for
@@ -29,7 +174,7 @@ die "can't locate x86_64-xlate.pl";
 
 open STDOUT,"| $^X $xlate $flavour $output";
 
-$movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
+$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
 @_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
 		("%rdi","%rsi","%rdx","%rcx");	# Unix order
 
@@ -41,18 +186,20 @@ $inp="%rdi";
 $out="%rsi";
 $len="%rdx";
 $key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
-$ivp="%r8";	# cbc
+$ivp="%r8";	# cbc, ctr, ...
 
 $rnds_="%r10d";	# backup copy for $rounds
 $key_="%r11";	# backup copy for $key
 
 # %xmm register layout
-$inout0="%xmm0";	$inout1="%xmm1";
-$inout2="%xmm2";	$inout3="%xmm3";
-$rndkey0="%xmm4";	$rndkey1="%xmm5";
-
-$iv="%xmm6";		$in0="%xmm7";	# used in CBC decrypt
-$in1="%xmm8";		$in2="%xmm9";
+$rndkey0="%xmm0";	$rndkey1="%xmm1";
+$inout0="%xmm2";	$inout1="%xmm3";
+$inout2="%xmm4";	$inout3="%xmm5";
+$inout4="%xmm6";	$inout5="%xmm7";
+$inout6="%xmm8";	$inout7="%xmm9";
+
+$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
+$in0="%xmm8";		$iv="%xmm9";
 
 # Inline version of internal aesni_[en|de]crypt1.
 #
@@ -60,20 +207,29 @@ $in1="%xmm8";		$in2="%xmm9";
 # cycles which take care of loop variables...
 { my $sn;
 sub aesni_generate1 {
-my ($p,$key,$rounds)=@_;
+my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
 ++$sn;
 $code.=<<___;
 	$movkey	($key),$rndkey0
 	$movkey	16($key),$rndkey1
+___
+$code.=<<___ if (defined($ivec));
+	xorps	$rndkey0,$ivec
+	lea	32($key),$key
+	xorps	$ivec,$inout
+___
+$code.=<<___ if (!defined($ivec));
 	lea	32($key),$key
-	pxor	$rndkey0,$inout0
+	xorps	$rndkey0,$inout
+___
+$code.=<<___;
 .Loop_${p}1_$sn:
-	aes${p}	$rndkey1,$inout0
+	aes${p}	$rndkey1,$inout
 	dec	$rounds
 	$movkey	($key),$rndkey1
 	lea	16($key),$key
 	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
-	aes${p}last	$rndkey1,$inout0
+	aes${p}last	$rndkey1,$inout
 ___
 }}
 # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
@@ -86,7 +242,7 @@ $code.=<<___;
 .align	16
 ${PREFIX}_encrypt:
 	movups	($inp),$inout0		# load input
-	mov	240($key),$rounds	# pull $rounds
+	mov	240($key),$rounds	# key->rounds
 ___
 	&aesni_generate1("enc",$key,$rounds);
 $code.=<<___;
@@ -99,7 +255,7 @@ $code.=<<___;
 .align	16
 ${PREFIX}_decrypt:
 	movups	($inp),$inout0		# load input
-	mov	240($key),$rounds	# pull $rounds
+	mov	240($key),$rounds	# key->rounds
 ___
 	&aesni_generate1("dec",$key,$rounds);
 $code.=<<___;
@@ -109,16 +265,16 @@ $code.=<<___;
 ___
 }
 
-# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
-# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
-# latency is 6, it turned out that it can be scheduled only every
-# *second* cycle. Thus 3x interleave is the one providing optimal
+# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+# factor. Why 3x subroutine were originally used in loops? Even though
+# aes[enc|dec] latency was originally 6, it could be scheduled only
+# every *2nd* cycle. Thus 3x interleave was the one providing optimal
 # utilization, i.e. when subroutine's throughput is virtually same as
 # of non-interleaved subroutine [for number of input blocks up to 3].
-# This is why it makes no sense to implement 2x subroutine. As soon
-# as/if Intel improves throughput by making it possible to schedule
-# the instructions in question *every* cycles I would have to
-# implement 6x interleave and use it in loop...
+# This is why it makes no sense to implement 2x subroutine.
+# aes[enc|dec] latency in next processor generation is 8, but the
+# instructions can be scheduled every cycle. Optimal interleave for
+# new processor is therefore 8x...
 sub aesni_generate3 {
 my $dir=shift;
 # As already mentioned it takes in $key and $rounds, which are *not*
@@ -131,25 +287,25 @@ _aesni_${dir}rypt3:
 	shr	\$1,$rounds
 	$movkey	16($key),$rndkey1
 	lea	32($key),$key
-	pxor	$rndkey0,$inout0
-	pxor	$rndkey0,$inout1
-	pxor	$rndkey0,$inout2
+	xorps	$rndkey0,$inout0
+	xorps	$rndkey0,$inout1
+	xorps	$rndkey0,$inout2
+	$movkey		($key),$rndkey0
 
 .L${dir}_loop3:
 	aes${dir}	$rndkey1,$inout0
-	$movkey		($key),$rndkey0
 	aes${dir}	$rndkey1,$inout1
 	dec		$rounds
 	aes${dir}	$rndkey1,$inout2
-	aes${dir}	$rndkey0,$inout0
 	$movkey		16($key),$rndkey1
+	aes${dir}	$rndkey0,$inout0
 	aes${dir}	$rndkey0,$inout1
 	lea		32($key),$key
 	aes${dir}	$rndkey0,$inout2
+	$movkey		($key),$rndkey0
 	jnz		.L${dir}_loop3
 
 	aes${dir}	$rndkey1,$inout0
-	$movkey		($key),$rndkey0
 	aes${dir}	$rndkey1,$inout1
 	aes${dir}	$rndkey1,$inout2
 	aes${dir}last	$rndkey0,$inout0
@@ -175,28 +331,28 @@ _aesni_${dir}rypt4:
 	shr	\$1,$rounds
 	$movkey	16($key),$rndkey1
 	lea	32($key),$key
-	pxor	$rndkey0,$inout0
-	pxor	$rndkey0,$inout1
-	pxor	$rndkey0,$inout2
-	pxor	$rndkey0,$inout3
+	xorps	$rndkey0,$inout0
+	xorps	$rndkey0,$inout1
+	xorps	$rndkey0,$inout2
+	xorps	$rndkey0,$inout3
+	$movkey	($key),$rndkey0
 
 .L${dir}_loop4:
 	aes${dir}	$rndkey1,$inout0
-	$movkey		($key),$rndkey0
 	aes${dir}	$rndkey1,$inout1
 	dec		$rounds
 	aes${dir}	$rndkey1,$inout2
 	aes${dir}	$rndkey1,$inout3
-	aes${dir}	$rndkey0,$inout0
 	$movkey		16($key),$rndkey1
+	aes${dir}	$rndkey0,$inout0
 	aes${dir}	$rndkey0,$inout1
 	lea		32($key),$key
 	aes${dir}	$rndkey0,$inout2
 	aes${dir}	$rndkey0,$inout3
+	$movkey		($key),$rndkey0
 	jnz		.L${dir}_loop4
 
 	aes${dir}	$rndkey1,$inout0
-	$movkey		($key),$rndkey0
 	aes${dir}	$rndkey1,$inout1
 	aes${dir}	$rndkey1,$inout2
 	aes${dir}	$rndkey1,$inout3
@@ -208,12 +364,158 @@ _aesni_${dir}rypt4:
 .size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
 ___
 }
+sub aesni_generate6 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-5] is cipher/clear text...
+$code.=<<___;
+.type	_aesni_${dir}rypt6,\@abi-omnipotent
+.align	16
+_aesni_${dir}rypt6:
+	$movkey		($key),$rndkey0
+	shr		\$1,$rounds
+	$movkey		16($key),$rndkey1
+	lea		32($key),$key
+	xorps		$rndkey0,$inout0
+	pxor		$rndkey0,$inout1
+	aes${dir}	$rndkey1,$inout0
+	pxor		$rndkey0,$inout2
+	aes${dir}	$rndkey1,$inout1
+	pxor		$rndkey0,$inout3
+	aes${dir}	$rndkey1,$inout2
+	pxor		$rndkey0,$inout4
+	aes${dir}	$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	dec		$rounds
+	aes${dir}	$rndkey1,$inout4
+	$movkey		($key),$rndkey0
+	aes${dir}	$rndkey1,$inout5
+	jmp		.L${dir}_loop6_enter
+.align	16
+.L${dir}_loop6:
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	dec		$rounds
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey1,$inout3
+	aes${dir}	$rndkey1,$inout4
+	aes${dir}	$rndkey1,$inout5
+.L${dir}_loop6_enter:				# happens to be 16-byte aligned
+	$movkey		16($key),$rndkey1
+	aes${dir}	$rndkey0,$inout0
+	aes${dir}	$rndkey0,$inout1
+	lea		32($key),$key
+	aes${dir}	$rndkey0,$inout2
+	aes${dir}	$rndkey0,$inout3
+	aes${dir}	$rndkey0,$inout4
+	aes${dir}	$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	jnz		.L${dir}_loop6
+
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey1,$inout3
+	aes${dir}	$rndkey1,$inout4
+	aes${dir}	$rndkey1,$inout5
+	aes${dir}last	$rndkey0,$inout0
+	aes${dir}last	$rndkey0,$inout1
+	aes${dir}last	$rndkey0,$inout2
+	aes${dir}last	$rndkey0,$inout3
+	aes${dir}last	$rndkey0,$inout4
+	aes${dir}last	$rndkey0,$inout5
+	ret
+.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+___
+}
+sub aesni_generate8 {
+my $dir=shift;
+# As already mentioned it takes in $key and $rounds, which are *not*
+# preserved. $inout[0-7] is cipher/clear text...
+$code.=<<___;
+.type	_aesni_${dir}rypt8,\@abi-omnipotent
+.align	16
+_aesni_${dir}rypt8:
+	$movkey		($key),$rndkey0
+	shr		\$1,$rounds
+	$movkey		16($key),$rndkey1
+	lea		32($key),$key
+	xorps		$rndkey0,$inout0
+	xorps		$rndkey0,$inout1
+	aes${dir}	$rndkey1,$inout0
+	pxor		$rndkey0,$inout2
+	aes${dir}	$rndkey1,$inout1
+	pxor		$rndkey0,$inout3
+	aes${dir}	$rndkey1,$inout2
+	pxor		$rndkey0,$inout4
+	aes${dir}	$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	dec		$rounds
+	aes${dir}	$rndkey1,$inout4
+	pxor		$rndkey0,$inout6
+	aes${dir}	$rndkey1,$inout5
+	pxor		$rndkey0,$inout7
+	$movkey		($key),$rndkey0
+	aes${dir}	$rndkey1,$inout6
+	aes${dir}	$rndkey1,$inout7
+	$movkey		16($key),$rndkey1
+	jmp		.L${dir}_loop8_enter
+.align	16
+.L${dir}_loop8:
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	dec		$rounds
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey1,$inout3
+	aes${dir}	$rndkey1,$inout4
+	aes${dir}	$rndkey1,$inout5
+	aes${dir}	$rndkey1,$inout6
+	aes${dir}	$rndkey1,$inout7
+	$movkey		16($key),$rndkey1
+.L${dir}_loop8_enter:				# happens to be 16-byte aligned
+	aes${dir}	$rndkey0,$inout0
+	aes${dir}	$rndkey0,$inout1
+	lea		32($key),$key
+	aes${dir}	$rndkey0,$inout2
+	aes${dir}	$rndkey0,$inout3
+	aes${dir}	$rndkey0,$inout4
+	aes${dir}	$rndkey0,$inout5
+	aes${dir}	$rndkey0,$inout6
+	aes${dir}	$rndkey0,$inout7
+	$movkey		($key),$rndkey0
+	jnz		.L${dir}_loop8
+
+	aes${dir}	$rndkey1,$inout0
+	aes${dir}	$rndkey1,$inout1
+	aes${dir}	$rndkey1,$inout2
+	aes${dir}	$rndkey1,$inout3
+	aes${dir}	$rndkey1,$inout4
+	aes${dir}	$rndkey1,$inout5
+	aes${dir}	$rndkey1,$inout6
+	aes${dir}	$rndkey1,$inout7
+	aes${dir}last	$rndkey0,$inout0
+	aes${dir}last	$rndkey0,$inout1
+	aes${dir}last	$rndkey0,$inout2
+	aes${dir}last	$rndkey0,$inout3
+	aes${dir}last	$rndkey0,$inout4
+	aes${dir}last	$rndkey0,$inout5
+	aes${dir}last	$rndkey0,$inout6
+	aes${dir}last	$rndkey0,$inout7
+	ret
+.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+___
+}
 &aesni_generate3("enc") if ($PREFIX eq "aesni");
 &aesni_generate3("dec");
 &aesni_generate4("enc") if ($PREFIX eq "aesni");
 &aesni_generate4("dec");
+&aesni_generate6("enc") if ($PREFIX eq "aesni");
+&aesni_generate6("dec");
+&aesni_generate8("enc") if ($PREFIX eq "aesni");
+&aesni_generate8("dec");
 
 if ($PREFIX eq "aesni") {
+########################################################################
 # void aesni_ecb_encrypt (const void *in, void *out,
 #			  size_t length, const AES_KEY *key,
 #			  int enc);
@@ -222,54 +524,98 @@ $code.=<<___;
 .type	aesni_ecb_encrypt,\@function,5
 .align	16
 aesni_ecb_encrypt:
-	cmp	\$16,$len		# check length
-	jb	.Lecb_ret
-
-	mov	240($key),$rounds	# pull $rounds
 	and	\$-16,$len
+	jz	.Lecb_ret
+
+	mov	240($key),$rounds	# key->rounds
+	$movkey	($key),$rndkey0
 	mov	$key,$key_		# backup $key
-	test	%r8d,%r8d		# 5th argument
 	mov	$rounds,$rnds_		# backup $rounds
+	test	%r8d,%r8d		# 5th argument
 	jz	.Lecb_decrypt
 #--------------------------- ECB ENCRYPT ------------------------------#
-	sub	\$0x40,$len
-	jbe	.Lecb_enc_tail
-	jmp	.Lecb_enc_loop3
+	cmp	\$0x80,$len
+	jb	.Lecb_enc_tail
+
+	movdqu	($inp),$inout0
+	movdqu	0x10($inp),$inout1
+	movdqu	0x20($inp),$inout2
+	movdqu	0x30($inp),$inout3
+	movdqu	0x40($inp),$inout4
+	movdqu	0x50($inp),$inout5
+	movdqu	0x60($inp),$inout6
+	movdqu	0x70($inp),$inout7
+	lea	0x80($inp),$inp
+	sub	\$0x80,$len
+	jmp	.Lecb_enc_loop8_enter
 .align 16
-.Lecb_enc_loop3:
-	movups	($inp),$inout0
-	movups	0x10($inp),$inout1
-	movups	0x20($inp),$inout2
-	call	_aesni_encrypt3
-	sub	\$0x30,$len
-	lea	0x30($inp),$inp
-	lea	0x30($out),$out
-	movups	$inout0,-0x30($out)
-	mov	$rnds_,$rounds		# restore $rounds
-	movups	$inout1,-0x20($out)
+.Lecb_enc_loop8:
+	movups	$inout0,($out)
 	mov	$key_,$key		# restore $key
-	movups	$inout2,-0x10($out)
-	ja	.Lecb_enc_loop3
+	movdqu	($inp),$inout0
+	mov	$rnds_,$rounds		# restore $rounds
+	movups	$inout1,0x10($out)
+	movdqu	0x10($inp),$inout1
+	movups	$inout2,0x20($out)
+	movdqu	0x20($inp),$inout2
+	movups	$inout3,0x30($out)
+	movdqu	0x30($inp),$inout3
+	movups	$inout4,0x40($out)
+	movdqu	0x40($inp),$inout4
+	movups	$inout5,0x50($out)
+	movdqu	0x50($inp),$inout5
+	movups	$inout6,0x60($out)
+	movdqu	0x60($inp),$inout6
+	movups	$inout7,0x70($out)
+	lea	0x80($out),$out
+	movdqu	0x70($inp),$inout7
+	lea	0x80($inp),$inp
+.Lecb_enc_loop8_enter:
+
+	call	_aesni_encrypt8
+
+	sub	\$0x80,$len
+	jnc	.Lecb_enc_loop8
 
-.Lecb_enc_tail:
-	add	\$0x40,$len
+	movups	$inout0,($out)
+	mov	$key_,$key		# restore $key
+	movups	$inout1,0x10($out)
+	mov	$rnds_,$rounds		# restore $rounds
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	movups	$inout6,0x60($out)
+	movups	$inout7,0x70($out)
+	lea	0x80($out),$out
+	add	\$0x80,$len
 	jz	.Lecb_ret
 
-	cmp	\$0x10,$len
+.Lecb_enc_tail:
 	movups	($inp),$inout0
-	je	.Lecb_enc_one
 	cmp	\$0x20,$len
+	jb	.Lecb_enc_one
 	movups	0x10($inp),$inout1
 	je	.Lecb_enc_two
-	cmp	\$0x30,$len
 	movups	0x20($inp),$inout2
-	je	.Lecb_enc_three
+	cmp	\$0x40,$len
+	jb	.Lecb_enc_three
 	movups	0x30($inp),$inout3
-	call	_aesni_encrypt4
+	je	.Lecb_enc_four
+	movups	0x40($inp),$inout4
+	cmp	\$0x60,$len
+	jb	.Lecb_enc_five
+	movups	0x50($inp),$inout5
+	je	.Lecb_enc_six
+	movdqu	0x60($inp),$inout6
+	call	_aesni_encrypt8
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
 	movups	$inout2,0x20($out)
 	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	movups	$inout6,0x60($out)
 	jmp	.Lecb_ret
 .align	16
 .Lecb_enc_one:
@@ -280,6 +626,7 @@ $code.=<<___;
 	jmp	.Lecb_ret
 .align	16
 .Lecb_enc_two:
+	xorps	$inout2,$inout2
 	call	_aesni_encrypt3
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
@@ -291,42 +638,145 @@ $code.=<<___;
 	movups	$inout1,0x10($out)
 	movups	$inout2,0x20($out)
 	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_four:
+	call	_aesni_encrypt4
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_five:
+	xorps	$inout5,$inout5
+	call	_aesni_encrypt6
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_enc_six:
+	call	_aesni_encrypt6
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	jmp	.Lecb_ret
 #--------------------------- ECB DECRYPT ------------------------------#
 .align	16
 .Lecb_decrypt:
-	sub	\$0x40,$len
-	jbe	.Lecb_dec_tail
-	jmp	.Lecb_dec_loop3
+	cmp	\$0x80,$len
+	jb	.Lecb_dec_tail
+
+	movdqu	($inp),$inout0
+	movdqu	0x10($inp),$inout1
+	movdqu	0x20($inp),$inout2
+	movdqu	0x30($inp),$inout3
+	movdqu	0x40($inp),$inout4
+	movdqu	0x50($inp),$inout5
+	movdqu	0x60($inp),$inout6
+	movdqu	0x70($inp),$inout7
+	lea	0x80($inp),$inp
+	sub	\$0x80,$len
+	jmp	.Lecb_dec_loop8_enter
 .align 16
-.Lecb_dec_loop3:
-	movups	($inp),$inout0
-	movups	0x10($inp),$inout1
-	movups	0x20($inp),$inout2
-	call	_aesni_decrypt3
-	sub	\$0x30,$len
-	lea	0x30($inp),$inp
-	lea	0x30($out),$out
-	movups	$inout0,-0x30($out)
-	mov	$rnds_,$rounds		# restore $rounds
-	movups	$inout1,-0x20($out)
+.Lecb_dec_loop8:
+	movups	$inout0,($out)
 	mov	$key_,$key		# restore $key
-	movups	$inout2,-0x10($out)
-	ja	.Lecb_dec_loop3
+	movdqu	($inp),$inout0
+	mov	$rnds_,$rounds		# restore $rounds
+	movups	$inout1,0x10($out)
+	movdqu	0x10($inp),$inout1
+	movups	$inout2,0x20($out)
+	movdqu	0x20($inp),$inout2
+	movups	$inout3,0x30($out)
+	movdqu	0x30($inp),$inout3
+	movups	$inout4,0x40($out)
+	movdqu	0x40($inp),$inout4
+	movups	$inout5,0x50($out)
+	movdqu	0x50($inp),$inout5
+	movups	$inout6,0x60($out)
+	movdqu	0x60($inp),$inout6
+	movups	$inout7,0x70($out)
+	lea	0x80($out),$out
+	movdqu	0x70($inp),$inout7
+	lea	0x80($inp),$inp
+.Lecb_dec_loop8_enter:
+
+	call	_aesni_decrypt8
+
+	$movkey	($key_),$rndkey0
+	sub	\$0x80,$len
+	jnc	.Lecb_dec_loop8
 
-.Lecb_dec_tail:
-	add	\$0x40,$len
+	movups	$inout0,($out)
+	mov	$key_,$key		# restore $key
+	movups	$inout1,0x10($out)
+	mov	$rnds_,$rounds		# restore $rounds
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	movups	$inout6,0x60($out)
+	movups	$inout7,0x70($out)
+	lea	0x80($out),$out
+	add	\$0x80,$len
 	jz	.Lecb_ret
 
-	cmp	\$0x10,$len
+.Lecb_dec_tail:
 	movups	($inp),$inout0
-	je	.Lecb_dec_one
 	cmp	\$0x20,$len
+	jb	.Lecb_dec_one
 	movups	0x10($inp),$inout1
 	je	.Lecb_dec_two
-	cmp	\$0x30,$len
 	movups	0x20($inp),$inout2
-	je	.Lecb_dec_three
+	cmp	\$0x40,$len
+	jb	.Lecb_dec_three
 	movups	0x30($inp),$inout3
+	je	.Lecb_dec_four
+	movups	0x40($inp),$inout4
+	cmp	\$0x60,$len
+	jb	.Lecb_dec_five
+	movups	0x50($inp),$inout5
+	je	.Lecb_dec_six
+	movups	0x60($inp),$inout6
+	$movkey	($key),$rndkey0
+	call	_aesni_decrypt8
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	movups	$inout6,0x60($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_one:
+___
+	&aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+	movups	$inout0,($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_two:
+	xorps	$inout2,$inout2
+	call	_aesni_decrypt3
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_three:
+	call	_aesni_decrypt3
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_four:
 	call	_aesni_decrypt4
 	movups	$inout0,($out)
 	movups	$inout1,0x10($out)
@@ -334,35 +784,1343 @@ $code.=<<___;
 	movups	$inout3,0x30($out)
 	jmp	.Lecb_ret
 .align	16
-.Lecb_dec_one:
+.Lecb_dec_five:
+	xorps	$inout5,$inout5
+	call	_aesni_decrypt6
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	jmp	.Lecb_ret
+.align	16
+.Lecb_dec_six:
+	call	_aesni_decrypt6
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+
+.Lecb_ret:
+	ret
+.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
+___
+
+{
+######################################################################
+# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec,char *cmac);
+#
+# Handles only complete blocks, operates on 64-bit counter and
+# does not update *ivec! Nor does it finalize CMAC value
+# (see engine/eng_aesni.c for details)
+#
+{
+my $cmac="%r9";	# 6th argument
+
+my $increment="%xmm6";
+my $bswap_mask="%xmm7";
+
+$code.=<<___;
+.globl	aesni_ccm64_encrypt_blocks
+.type	aesni_ccm64_encrypt_blocks,\@function,6
+.align	16
+aesni_ccm64_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+	lea	-0x58(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+	movaps	%xmm8,0x20(%rsp)
+	movaps	%xmm9,0x30(%rsp)
+.Lccm64_enc_body:
+___
+$code.=<<___;
+	mov	240($key),$rounds		# key->rounds
+	movdqu	($ivp),$iv
+	movdqa	.Lincrement64(%rip),$increment
+	movdqa	.Lbswap_mask(%rip),$bswap_mask
+
+	shr	\$1,$rounds
+	lea	0($key),$key_
+	movdqu	($cmac),$inout1
+	movdqa	$iv,$inout0
+	mov	$rounds,$rnds_
+	pshufb	$bswap_mask,$iv
+	jmp	.Lccm64_enc_outer
+.align	16
+.Lccm64_enc_outer:
+	$movkey	($key_),$rndkey0
+	mov	$rnds_,$rounds
+	movups	($inp),$in0			# load inp
+
+	xorps	$rndkey0,$inout0		# counter
+	$movkey	16($key_),$rndkey1
+	xorps	$in0,$rndkey0
+	lea	32($key_),$key
+	xorps	$rndkey0,$inout1		# cmac^=inp
+	$movkey	($key),$rndkey0
+
+.Lccm64_enc2_loop:
+	aesenc	$rndkey1,$inout0
+	dec	$rounds
+	aesenc	$rndkey1,$inout1
+	$movkey	16($key),$rndkey1
+	aesenc	$rndkey0,$inout0
+	lea	32($key),$key
+	aesenc	$rndkey0,$inout1
+	$movkey	0($key),$rndkey0
+	jnz	.Lccm64_enc2_loop
+	aesenc	$rndkey1,$inout0
+	aesenc	$rndkey1,$inout1
+	paddq	$increment,$iv
+	aesenclast	$rndkey0,$inout0
+	aesenclast	$rndkey0,$inout1
+
+	dec	$len
+	lea	16($inp),$inp
+	xorps	$inout0,$in0			# inp ^= E(iv)
+	movdqa	$iv,$inout0
+	movups	$in0,($out)			# save output
+	lea	16($out),$out
+	pshufb	$bswap_mask,$inout0
+	jnz	.Lccm64_enc_outer
+
+	movups	$inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	lea	0x58(%rsp),%rsp
+.Lccm64_enc_ret:
+___
+$code.=<<___;
+	ret
+.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+___
+######################################################################
+$code.=<<___;
+.globl	aesni_ccm64_decrypt_blocks
+.type	aesni_ccm64_decrypt_blocks,\@function,6
+.align	16
+aesni_ccm64_decrypt_blocks:
+___
+$code.=<<___ if ($win64);
+	lea	-0x58(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+	movaps	%xmm8,0x20(%rsp)
+	movaps	%xmm9,0x30(%rsp)
+.Lccm64_dec_body:
+___
+$code.=<<___;
+	mov	240($key),$rounds		# key->rounds
+	movups	($ivp),$iv
+	movdqu	($cmac),$inout1
+	movdqa	.Lincrement64(%rip),$increment
+	movdqa	.Lbswap_mask(%rip),$bswap_mask
+
+	movaps	$iv,$inout0
+	mov	$rounds,$rnds_
+	mov	$key,$key_
+	pshufb	$bswap_mask,$iv
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	movups	($inp),$in0			# load inp
+	paddq	$increment,$iv
+	lea	16($inp),$inp
+	jmp	.Lccm64_dec_outer
+.align	16
+.Lccm64_dec_outer:
+	xorps	$inout0,$in0			# inp ^= E(iv)
+	movdqa	$iv,$inout0
+	mov	$rnds_,$rounds
+	movups	$in0,($out)			# save output
+	lea	16($out),$out
+	pshufb	$bswap_mask,$inout0
+
+	sub	\$1,$len
+	jz	.Lccm64_dec_break
+
+	$movkey	($key_),$rndkey0
+	shr	\$1,$rounds
+	$movkey	16($key_),$rndkey1
+	xorps	$rndkey0,$in0
+	lea	32($key_),$key
+	xorps	$rndkey0,$inout0
+	xorps	$in0,$inout1			# cmac^=out
+	$movkey	($key),$rndkey0
+
+.Lccm64_dec2_loop:
+	aesenc	$rndkey1,$inout0
+	dec	$rounds
+	aesenc	$rndkey1,$inout1
+	$movkey	16($key),$rndkey1
+	aesenc	$rndkey0,$inout0
+	lea	32($key),$key
+	aesenc	$rndkey0,$inout1
+	$movkey	0($key),$rndkey0
+	jnz	.Lccm64_dec2_loop
+	movups	($inp),$in0			# load inp
+	paddq	$increment,$iv
+	aesenc	$rndkey1,$inout0
+	aesenc	$rndkey1,$inout1
+	lea	16($inp),$inp
+	aesenclast	$rndkey0,$inout0
+	aesenclast	$rndkey0,$inout1
+	jmp	.Lccm64_dec_outer
+
+.align	16
+.Lccm64_dec_break:
+	#xorps	$in0,$inout1			# cmac^=out
+___
+	&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+$code.=<<___;
+	movups	$inout1,($cmac)
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	lea	0x58(%rsp),%rsp
+.Lccm64_dec_ret:
+___
+$code.=<<___;
+	ret
+.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+___
+}
+######################################################################
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec);
+#
+# Handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+{
+my $reserved = $win64?0:-0x28;
+my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11));
+my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14");
+my $bswap_mask="%xmm15";
+
+$code.=<<___;
+.globl	aesni_ctr32_encrypt_blocks
+.type	aesni_ctr32_encrypt_blocks,\@function,5
+.align	16
+aesni_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+	lea	-0xc8(%rsp),%rsp
+	movaps	%xmm6,0x20(%rsp)
+	movaps	%xmm7,0x30(%rsp)
+	movaps	%xmm8,0x40(%rsp)
+	movaps	%xmm9,0x50(%rsp)
+	movaps	%xmm10,0x60(%rsp)
+	movaps	%xmm11,0x70(%rsp)
+	movaps	%xmm12,0x80(%rsp)
+	movaps	%xmm13,0x90(%rsp)
+	movaps	%xmm14,0xa0(%rsp)
+	movaps	%xmm15,0xb0(%rsp)
+.Lctr32_body:
+___
+$code.=<<___;
+	cmp	\$1,$len
+	je	.Lctr32_one_shortcut
+
+	movdqu	($ivp),$ivec
+	movdqa	.Lbswap_mask(%rip),$bswap_mask
+	xor	$rounds,$rounds
+	pextrd	\$3,$ivec,$rnds_		# pull 32-bit counter
+	pinsrd	\$3,$rounds,$ivec		# wipe 32-bit counter
+
+	mov	240($key),$rounds		# key->rounds
+	bswap	$rnds_
+	pxor	$iv0,$iv0			# vector of 3 32-bit counters
+	pxor	$iv1,$iv1			# vector of 3 32-bit counters
+	pinsrd	\$0,$rnds_,$iv0
+	lea	3($rnds_),$key_
+	pinsrd	\$0,$key_,$iv1
+	inc	$rnds_
+	pinsrd	\$1,$rnds_,$iv0
+	inc	$key_
+	pinsrd	\$1,$key_,$iv1
+	inc	$rnds_
+	pinsrd	\$2,$rnds_,$iv0
+	inc	$key_
+	pinsrd	\$2,$key_,$iv1
+	movdqa	$iv0,$reserved(%rsp)
+	pshufb	$bswap_mask,$iv0
+	movdqa	$iv1,`$reserved+0x10`(%rsp)
+	pshufb	$bswap_mask,$iv1
+
+	pshufd	\$`3<<6`,$iv0,$inout0		# place counter to upper dword
+	pshufd	\$`2<<6`,$iv0,$inout1
+	pshufd	\$`1<<6`,$iv0,$inout2
+	cmp	\$6,$len
+	jb	.Lctr32_tail
+	shr	\$1,$rounds
+	mov	$key,$key_			# backup $key
+	mov	$rounds,$rnds_			# backup $rounds
+	sub	\$6,$len
+	jmp	.Lctr32_loop6
+
+.align	16
+.Lctr32_loop6:
+	pshufd	\$`3<<6`,$iv1,$inout3
+	por	$ivec,$inout0			# merge counter-less ivec
+	 $movkey	($key_),$rndkey0
+	pshufd	\$`2<<6`,$iv1,$inout4
+	por	$ivec,$inout1
+	 $movkey	16($key_),$rndkey1
+	pshufd	\$`1<<6`,$iv1,$inout5
+	por	$ivec,$inout2
+	por	$ivec,$inout3
+	 xorps		$rndkey0,$inout0
+	por	$ivec,$inout4
+	por	$ivec,$inout5
+
+	# inline _aesni_encrypt6 and interleave last rounds
+	# with own code...
+
+	pxor		$rndkey0,$inout1
+	aesenc		$rndkey1,$inout0
+	lea		32($key_),$key
+	pxor		$rndkey0,$inout2
+	aesenc		$rndkey1,$inout1
+	 movdqa		.Lincrement32(%rip),$iv1
+	pxor		$rndkey0,$inout3
+	aesenc		$rndkey1,$inout2
+	 movdqa		$reserved(%rsp),$iv0
+	pxor		$rndkey0,$inout4
+	aesenc		$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	dec		$rounds
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
+	jmp		.Lctr32_enc_loop6_enter
+.align	16
+.Lctr32_enc_loop6:
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	dec		$rounds
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
+.Lctr32_enc_loop6_enter:
+	$movkey		16($key),$rndkey1
+	aesenc		$rndkey0,$inout0
+	aesenc		$rndkey0,$inout1
+	lea		32($key),$key
+	aesenc		$rndkey0,$inout2
+	aesenc		$rndkey0,$inout3
+	aesenc		$rndkey0,$inout4
+	aesenc		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	jnz		.Lctr32_enc_loop6
+
+	aesenc		$rndkey1,$inout0
+	 paddd		$iv1,$iv0		# increment counter vector
+	aesenc		$rndkey1,$inout1
+	 paddd		`$reserved+0x10`(%rsp),$iv1
+	aesenc		$rndkey1,$inout2
+	 movdqa		$iv0,$reserved(%rsp)	# save counter vector
+	aesenc		$rndkey1,$inout3
+	 movdqa		$iv1,`$reserved+0x10`(%rsp)
+	aesenc		$rndkey1,$inout4
+	 pshufb		$bswap_mask,$iv0	# byte swap
+	aesenc		$rndkey1,$inout5
+	 pshufb		$bswap_mask,$iv1
+
+	aesenclast	$rndkey0,$inout0
+	 movups		($inp),$in0		# load input
+	aesenclast	$rndkey0,$inout1
+	 movups		0x10($inp),$in1
+	aesenclast	$rndkey0,$inout2
+	 movups		0x20($inp),$in2
+	aesenclast	$rndkey0,$inout3
+	 movups		0x30($inp),$in3
+	aesenclast	$rndkey0,$inout4
+	 movups		0x40($inp),$rndkey1
+	aesenclast	$rndkey0,$inout5
+	 movups		0x50($inp),$rndkey0
+	 lea	0x60($inp),$inp
+
+	xorps	$inout0,$in0			# xor
+	 pshufd	\$`3<<6`,$iv0,$inout0
+	xorps	$inout1,$in1
+	 pshufd	\$`2<<6`,$iv0,$inout1
+	movups	$in0,($out)			# store output
+	xorps	$inout2,$in2
+	 pshufd	\$`1<<6`,$iv0,$inout2
+	movups	$in1,0x10($out)
+	xorps	$inout3,$in3
+	movups	$in2,0x20($out)
+	xorps	$inout4,$rndkey1
+	movups	$in3,0x30($out)
+	xorps	$inout5,$rndkey0
+	movups	$rndkey1,0x40($out)
+	movups	$rndkey0,0x50($out)
+	lea	0x60($out),$out
+	mov	$rnds_,$rounds
+	sub	\$6,$len
+	jnc	.Lctr32_loop6
+
+	add	\$6,$len
+	jz	.Lctr32_done
+	mov	$key_,$key			# restore $key
+	lea	1($rounds,$rounds),$rounds	# restore original value
+
+.Lctr32_tail:
+	por	$ivec,$inout0
+	movups	($inp),$in0
+	cmp	\$2,$len
+	jb	.Lctr32_one
+
+	por	$ivec,$inout1
+	movups	0x10($inp),$in1
+	je	.Lctr32_two
+
+	pshufd	\$`3<<6`,$iv1,$inout3
+	por	$ivec,$inout2
+	movups	0x20($inp),$in2
+	cmp	\$4,$len
+	jb	.Lctr32_three
+
+	pshufd	\$`2<<6`,$iv1,$inout4
+	por	$ivec,$inout3
+	movups	0x30($inp),$in3
+	je	.Lctr32_four
+
+	por	$ivec,$inout4
+	xorps	$inout5,$inout5
+
+	call	_aesni_encrypt6
+
+	movups	0x40($inp),$rndkey1
+	xorps	$inout0,$in0
+	xorps	$inout1,$in1
+	movups	$in0,($out)
+	xorps	$inout2,$in2
+	movups	$in1,0x10($out)
+	xorps	$inout3,$in3
+	movups	$in2,0x20($out)
+	xorps	$inout4,$rndkey1
+	movups	$in3,0x30($out)
+	movups	$rndkey1,0x40($out)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_one_shortcut:
+	movups	($ivp),$inout0
+	movups	($inp),$in0
+	mov	240($key),$rounds		# key->rounds
+.Lctr32_one:
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	xorps	$inout0,$in0
+	movups	$in0,($out)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_two:
+	xorps	$inout2,$inout2
+	call	_aesni_encrypt3
+	xorps	$inout0,$in0
+	xorps	$inout1,$in1
+	movups	$in0,($out)
+	movups	$in1,0x10($out)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_three:
+	call	_aesni_encrypt3
+	xorps	$inout0,$in0
+	xorps	$inout1,$in1
+	movups	$in0,($out)
+	xorps	$inout2,$in2
+	movups	$in1,0x10($out)
+	movups	$in2,0x20($out)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_four:
+	call	_aesni_encrypt4
+	xorps	$inout0,$in0
+	xorps	$inout1,$in1
+	movups	$in0,($out)
+	xorps	$inout2,$in2
+	movups	$in1,0x10($out)
+	xorps	$inout3,$in3
+	movups	$in2,0x20($out)
+	movups	$in3,0x30($out)
+
+.Lctr32_done:
+___
+$code.=<<___ if ($win64);
+	movaps	0x20(%rsp),%xmm6
+	movaps	0x30(%rsp),%xmm7
+	movaps	0x40(%rsp),%xmm8
+	movaps	0x50(%rsp),%xmm9
+	movaps	0x60(%rsp),%xmm10
+	movaps	0x70(%rsp),%xmm11
+	movaps	0x80(%rsp),%xmm12
+	movaps	0x90(%rsp),%xmm13
+	movaps	0xa0(%rsp),%xmm14
+	movaps	0xb0(%rsp),%xmm15
+	lea	0xc8(%rsp),%rsp
+.Lctr32_ret:
+___
+$code.=<<___;
+	ret
+.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
+}
+
+######################################################################
+# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2
+#	const unsigned char iv[16]);
+#
+{
+my @tweak=map("%xmm$_",(10..15));
+my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
+my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
+my $frame_size = 0x68 + ($win64?160:0);
+
+$code.=<<___;
+.globl	aesni_xts_encrypt
+.type	aesni_xts_encrypt,\@function,6
+.align	16
+aesni_xts_encrypt:
+	lea	-$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,0x60(%rsp)
+	movaps	%xmm7,0x70(%rsp)
+	movaps	%xmm8,0x80(%rsp)
+	movaps	%xmm9,0x90(%rsp)
+	movaps	%xmm10,0xa0(%rsp)
+	movaps	%xmm11,0xb0(%rsp)
+	movaps	%xmm12,0xc0(%rsp)
+	movaps	%xmm13,0xd0(%rsp)
+	movaps	%xmm14,0xe0(%rsp)
+	movaps	%xmm15,0xf0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+	movups	($ivp),@tweak[5]		# load clear-text tweak
+	mov	240(%r8),$rounds		# key2->rounds
+	mov	240($key),$rnds_		# key1->rounds
+___
+	# generate the tweak
+	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+	mov	$key,$key_			# backup $key
+	mov	$rnds_,$rounds			# backup $rounds
+	mov	$len,$len_			# backup $len
+	and	\$-16,$len
+
+	movdqa	.Lxts_magic(%rip),$twmask
+	pxor	$twtmp,$twtmp
+	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+___
+    for ($i=0;$i<4;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[$i]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	pand	$twmask,$twres			# isolate carry and residue
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	pxor	$twres,@tweak[5]
+___
+    }
+$code.=<<___;
+	sub	\$16*6,$len
+	jc	.Lxts_enc_short
+
+	shr	\$1,$rounds
+	sub	\$1,$rounds
+	mov	$rounds,$rnds_
+	jmp	.Lxts_enc_grandloop
+
+.align	16
+.Lxts_enc_grandloop:
+	pshufd	\$0x13,$twtmp,$twres
+	movdqa	@tweak[5],@tweak[4]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	movdqu	`16*0`($inp),$inout0		# load input
+	pand	$twmask,$twres			# isolate carry and residue
+	movdqu	`16*1`($inp),$inout1
+	pxor	$twres,@tweak[5]
+
+	movdqu	`16*2`($inp),$inout2
+	pxor	@tweak[0],$inout0		# input^=tweak
+	movdqu	`16*3`($inp),$inout3
+	pxor	@tweak[1],$inout1
+	movdqu	`16*4`($inp),$inout4
+	pxor	@tweak[2],$inout2
+	movdqu	`16*5`($inp),$inout5
+	lea	`16*6`($inp),$inp
+	pxor	@tweak[3],$inout3
+	$movkey		($key_),$rndkey0
+	pxor	@tweak[4],$inout4
+	pxor	@tweak[5],$inout5
+
+	# inline _aesni_encrypt6 and interleave first and last rounds
+	# with own code...
+	$movkey		16($key_),$rndkey1
+	pxor		$rndkey0,$inout0
+	pxor		$rndkey0,$inout1
+	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
+	aesenc		$rndkey1,$inout0
+	lea		32($key_),$key
+	pxor		$rndkey0,$inout2
+	 movdqa	@tweak[1],`16*1`(%rsp)
+	aesenc		$rndkey1,$inout1
+	pxor		$rndkey0,$inout3
+	 movdqa	@tweak[2],`16*2`(%rsp)
+	aesenc		$rndkey1,$inout2
+	pxor		$rndkey0,$inout4
+	 movdqa	@tweak[3],`16*3`(%rsp)
+	aesenc		$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	dec		$rounds
+	 movdqa	@tweak[4],`16*4`(%rsp)
+	aesenc		$rndkey1,$inout4
+	 movdqa	@tweak[5],`16*5`(%rsp)
+	aesenc		$rndkey1,$inout5
+	pxor	$twtmp,$twtmp
+	pcmpgtd	@tweak[5],$twtmp
+	jmp		.Lxts_enc_loop6_enter
+
+.align	16
+.Lxts_enc_loop6:
+	aesenc		$rndkey1,$inout0
+	aesenc		$rndkey1,$inout1
+	dec		$rounds
+	aesenc		$rndkey1,$inout2
+	aesenc		$rndkey1,$inout3
+	aesenc		$rndkey1,$inout4
+	aesenc		$rndkey1,$inout5
+.Lxts_enc_loop6_enter:
+	$movkey		16($key),$rndkey1
+	aesenc		$rndkey0,$inout0
+	aesenc		$rndkey0,$inout1
+	lea		32($key),$key
+	aesenc		$rndkey0,$inout2
+	aesenc		$rndkey0,$inout3
+	aesenc		$rndkey0,$inout4
+	aesenc		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	jnz		.Lxts_enc_loop6
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesenc		$rndkey1,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesenc		$rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+	 aesenc		$rndkey1,$inout2
+	pxor	$twres,@tweak[5]
+	 aesenc		$rndkey1,$inout3
+	 aesenc		$rndkey1,$inout4
+	 aesenc		$rndkey1,$inout5
+	 $movkey	16($key),$rndkey1
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[0]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesenc		$rndkey0,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesenc		$rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesenc		$rndkey0,$inout2
+	pxor	$twres,@tweak[5]
+	 aesenc		$rndkey0,$inout3
+	 aesenc		$rndkey0,$inout4
+	 aesenc		$rndkey0,$inout5
+	 $movkey	32($key),$rndkey0
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[1]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesenc		$rndkey1,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesenc		$rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesenc		$rndkey1,$inout2
+	pxor	$twres,@tweak[5]
+	 aesenc		$rndkey1,$inout3
+	 aesenc		$rndkey1,$inout4
+	 aesenc		$rndkey1,$inout5
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[2]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesenclast	$rndkey0,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesenclast	$rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesenclast	$rndkey0,$inout2
+	pxor	$twres,@tweak[5]
+	 aesenclast	$rndkey0,$inout3
+	 aesenclast	$rndkey0,$inout4
+	 aesenclast	$rndkey0,$inout5
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[3]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
+	pand	$twmask,$twres			# isolate carry and residue
+	 xorps	`16*1`(%rsp),$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	pxor	$twres,@tweak[5]
+
+	xorps	`16*2`(%rsp),$inout2
+	movups	$inout0,`16*0`($out)		# write output
+	xorps	`16*3`(%rsp),$inout3
+	movups	$inout1,`16*1`($out)
+	xorps	`16*4`(%rsp),$inout4
+	movups	$inout2,`16*2`($out)
+	xorps	`16*5`(%rsp),$inout5
+	movups	$inout3,`16*3`($out)
+	mov	$rnds_,$rounds			# restore $rounds
+	movups	$inout4,`16*4`($out)
+	movups	$inout5,`16*5`($out)
+	lea	`16*6`($out),$out
+	sub	\$16*6,$len
+	jnc	.Lxts_enc_grandloop
+
+	lea	3($rounds,$rounds),$rounds	# restore original value
+	mov	$key_,$key			# restore $key
+	mov	$rounds,$rnds_			# backup $rounds
+
+.Lxts_enc_short:
+	add	\$16*6,$len
+	jz	.Lxts_enc_done
+
+	cmp	\$0x20,$len
+	jb	.Lxts_enc_one
+	je	.Lxts_enc_two
+
+	cmp	\$0x40,$len
+	jb	.Lxts_enc_three
+	je	.Lxts_enc_four
+
+	pshufd	\$0x13,$twtmp,$twres
+	movdqa	@tweak[5],@tweak[4]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	 movdqu	($inp),$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 movdqu	16*1($inp),$inout1
+	pxor	$twres,@tweak[5]
+
+	movdqu	16*2($inp),$inout2
+	pxor	@tweak[0],$inout0
+	movdqu	16*3($inp),$inout3
+	pxor	@tweak[1],$inout1
+	movdqu	16*4($inp),$inout4
+	lea	16*5($inp),$inp
+	pxor	@tweak[2],$inout2
+	pxor	@tweak[3],$inout3
+	pxor	@tweak[4],$inout4
+
+	call	_aesni_encrypt6
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[5],@tweak[0]
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	movdqu	$inout0,($out)
+	xorps	@tweak[3],$inout3
+	movdqu	$inout1,16*1($out)
+	xorps	@tweak[4],$inout4
+	movdqu	$inout2,16*2($out)
+	movdqu	$inout3,16*3($out)
+	movdqu	$inout4,16*4($out)
+	lea	16*5($out),$out
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_one:
+	movups	($inp),$inout0
+	lea	16*1($inp),$inp
+	xorps	@tweak[0],$inout0
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[1],@tweak[0]
+	movups	$inout0,($out)
+	lea	16*1($out),$out
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_two:
+	movups	($inp),$inout0
+	movups	16($inp),$inout1
+	lea	32($inp),$inp
+	xorps	@tweak[0],$inout0
+	xorps	@tweak[1],$inout1
+
+	call	_aesni_encrypt3
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[2],@tweak[0]
+	xorps	@tweak[1],$inout1
+	movups	$inout0,($out)
+	movups	$inout1,16*1($out)
+	lea	16*2($out),$out
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_three:
+	movups	($inp),$inout0
+	movups	16*1($inp),$inout1
+	movups	16*2($inp),$inout2
+	lea	16*3($inp),$inp
+	xorps	@tweak[0],$inout0
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+
+	call	_aesni_encrypt3
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[3],@tweak[0]
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	movups	$inout0,($out)
+	movups	$inout1,16*1($out)
+	movups	$inout2,16*2($out)
+	lea	16*3($out),$out
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_four:
+	movups	($inp),$inout0
+	movups	16*1($inp),$inout1
+	movups	16*2($inp),$inout2
+	xorps	@tweak[0],$inout0
+	movups	16*3($inp),$inout3
+	lea	16*4($inp),$inp
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	xorps	@tweak[3],$inout3
+
+	call	_aesni_encrypt4
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[5],@tweak[0]
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	movups	$inout0,($out)
+	xorps	@tweak[3],$inout3
+	movups	$inout1,16*1($out)
+	movups	$inout2,16*2($out)
+	movups	$inout3,16*3($out)
+	lea	16*4($out),$out
+	jmp	.Lxts_enc_done
+
+.align	16
+.Lxts_enc_done:
+	and	\$15,$len_
+	jz	.Lxts_enc_ret
+	mov	$len_,$len
+
+.Lxts_enc_steal:
+	movzb	($inp),%eax			# borrow $rounds ...
+	movzb	-16($out),%ecx			# ... and $key
+	lea	1($inp),$inp
+	mov	%al,-16($out)
+	mov	%cl,0($out)
+	lea	1($out),$out
+	sub	\$1,$len
+	jnz	.Lxts_enc_steal
+
+	sub	$len_,$out			# rewind $out
+	mov	$key_,$key			# restore $key
+	mov	$rnds_,$rounds			# restore $rounds
+
+	movups	-16($out),$inout0
+	xorps	@tweak[0],$inout0
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	xorps	@tweak[0],$inout0
+	movups	$inout0,-16($out)
+
+.Lxts_enc_ret:
+___
+$code.=<<___ if ($win64);
+	movaps	0x60(%rsp),%xmm6
+	movaps	0x70(%rsp),%xmm7
+	movaps	0x80(%rsp),%xmm8
+	movaps	0x90(%rsp),%xmm9
+	movaps	0xa0(%rsp),%xmm10
+	movaps	0xb0(%rsp),%xmm11
+	movaps	0xc0(%rsp),%xmm12
+	movaps	0xd0(%rsp),%xmm13
+	movaps	0xe0(%rsp),%xmm14
+	movaps	0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+	lea	$frame_size(%rsp),%rsp
+.Lxts_enc_epilogue:
+	ret
+.size	aesni_xts_encrypt,.-aesni_xts_encrypt
+___
+
+$code.=<<___;
+.globl	aesni_xts_decrypt
+.type	aesni_xts_decrypt,\@function,6
+.align	16
+aesni_xts_decrypt:
+	lea	-$frame_size(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,0x60(%rsp)
+	movaps	%xmm7,0x70(%rsp)
+	movaps	%xmm8,0x80(%rsp)
+	movaps	%xmm9,0x90(%rsp)
+	movaps	%xmm10,0xa0(%rsp)
+	movaps	%xmm11,0xb0(%rsp)
+	movaps	%xmm12,0xc0(%rsp)
+	movaps	%xmm13,0xd0(%rsp)
+	movaps	%xmm14,0xe0(%rsp)
+	movaps	%xmm15,0xf0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+	movups	($ivp),@tweak[5]		# load clear-text tweak
+	mov	240($key2),$rounds		# key2->rounds
+	mov	240($key),$rnds_		# key1->rounds
+___
+	# generate the tweak
+	&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
+$code.=<<___;
+	xor	%eax,%eax			# if ($len%16) len-=16;
+	test	\$15,$len
+	setnz	%al
+	shl	\$4,%rax
+	sub	%rax,$len
+
+	mov	$key,$key_			# backup $key
+	mov	$rnds_,$rounds			# backup $rounds
+	mov	$len,$len_			# backup $len
+	and	\$-16,$len
+
+	movdqa	.Lxts_magic(%rip),$twmask
+	pxor	$twtmp,$twtmp
+	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+___
+    for ($i=0;$i<4;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[$i]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	pand	$twmask,$twres			# isolate carry and residue
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	pxor	$twres,@tweak[5]
+___
+    }
+$code.=<<___;
+	sub	\$16*6,$len
+	jc	.Lxts_dec_short
+
+	shr	\$1,$rounds
+	sub	\$1,$rounds
+	mov	$rounds,$rnds_
+	jmp	.Lxts_dec_grandloop
+
+.align	16
+.Lxts_dec_grandloop:
+	pshufd	\$0x13,$twtmp,$twres
+	movdqa	@tweak[5],@tweak[4]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	movdqu	`16*0`($inp),$inout0		# load input
+	pand	$twmask,$twres			# isolate carry and residue
+	movdqu	`16*1`($inp),$inout1
+	pxor	$twres,@tweak[5]
+
+	movdqu	`16*2`($inp),$inout2
+	pxor	@tweak[0],$inout0		# input^=tweak
+	movdqu	`16*3`($inp),$inout3
+	pxor	@tweak[1],$inout1
+	movdqu	`16*4`($inp),$inout4
+	pxor	@tweak[2],$inout2
+	movdqu	`16*5`($inp),$inout5
+	lea	`16*6`($inp),$inp
+	pxor	@tweak[3],$inout3
+	$movkey		($key_),$rndkey0
+	pxor	@tweak[4],$inout4
+	pxor	@tweak[5],$inout5
+
+	# inline _aesni_decrypt6 and interleave first and last rounds
+	# with own code...
+	$movkey		16($key_),$rndkey1
+	pxor		$rndkey0,$inout0
+	pxor		$rndkey0,$inout1
+	 movdqa	@tweak[0],`16*0`(%rsp)		# put aside tweaks
+	aesdec		$rndkey1,$inout0
+	lea		32($key_),$key
+	pxor		$rndkey0,$inout2
+	 movdqa	@tweak[1],`16*1`(%rsp)
+	aesdec		$rndkey1,$inout1
+	pxor		$rndkey0,$inout3
+	 movdqa	@tweak[2],`16*2`(%rsp)
+	aesdec		$rndkey1,$inout2
+	pxor		$rndkey0,$inout4
+	 movdqa	@tweak[3],`16*3`(%rsp)
+	aesdec		$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	dec		$rounds
+	 movdqa	@tweak[4],`16*4`(%rsp)
+	aesdec		$rndkey1,$inout4
+	 movdqa	@tweak[5],`16*5`(%rsp)
+	aesdec		$rndkey1,$inout5
+	pxor	$twtmp,$twtmp
+	pcmpgtd	@tweak[5],$twtmp
+	jmp		.Lxts_dec_loop6_enter
+
+.align	16
+.Lxts_dec_loop6:
+	aesdec		$rndkey1,$inout0
+	aesdec		$rndkey1,$inout1
+	dec		$rounds
+	aesdec		$rndkey1,$inout2
+	aesdec		$rndkey1,$inout3
+	aesdec		$rndkey1,$inout4
+	aesdec		$rndkey1,$inout5
+.Lxts_dec_loop6_enter:
+	$movkey		16($key),$rndkey1
+	aesdec		$rndkey0,$inout0
+	aesdec		$rndkey0,$inout1
+	lea		32($key),$key
+	aesdec		$rndkey0,$inout2
+	aesdec		$rndkey0,$inout3
+	aesdec		$rndkey0,$inout4
+	aesdec		$rndkey0,$inout5
+	$movkey		($key),$rndkey0
+	jnz		.Lxts_dec_loop6
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesdec		$rndkey1,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesdec		$rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcast upper bits
+	 aesdec		$rndkey1,$inout2
+	pxor	$twres,@tweak[5]
+	 aesdec		$rndkey1,$inout3
+	 aesdec		$rndkey1,$inout4
+	 aesdec		$rndkey1,$inout5
+	 $movkey	16($key),$rndkey1
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[0]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesdec		$rndkey0,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesdec		$rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesdec		$rndkey0,$inout2
+	pxor	$twres,@tweak[5]
+	 aesdec		$rndkey0,$inout3
+	 aesdec		$rndkey0,$inout4
+	 aesdec		$rndkey0,$inout5
+	 $movkey	32($key),$rndkey0
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[1]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesdec		$rndkey1,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesdec		$rndkey1,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesdec		$rndkey1,$inout2
+	pxor	$twres,@tweak[5]
+	 aesdec		$rndkey1,$inout3
+	 aesdec		$rndkey1,$inout4
+	 aesdec		$rndkey1,$inout5
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[2]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 aesdeclast	$rndkey0,$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 aesdeclast	$rndkey0,$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	 aesdeclast	$rndkey0,$inout2
+	pxor	$twres,@tweak[5]
+	 aesdeclast	$rndkey0,$inout3
+	 aesdeclast	$rndkey0,$inout4
+	 aesdeclast	$rndkey0,$inout5
+
+	pshufd	\$0x13,$twtmp,$twres
+	pxor	$twtmp,$twtmp
+	movdqa	@tweak[5],@tweak[3]
+	paddq	@tweak[5],@tweak[5]		# psllq	1,$tweak
+	 xorps	`16*0`(%rsp),$inout0		# output^=tweak
+	pand	$twmask,$twres			# isolate carry and residue
+	 xorps	`16*1`(%rsp),$inout1
+	pcmpgtd	@tweak[5],$twtmp		# broadcat upper bits
+	pxor	$twres,@tweak[5]
+
+	xorps	`16*2`(%rsp),$inout2
+	movups	$inout0,`16*0`($out)		# write output
+	xorps	`16*3`(%rsp),$inout3
+	movups	$inout1,`16*1`($out)
+	xorps	`16*4`(%rsp),$inout4
+	movups	$inout2,`16*2`($out)
+	xorps	`16*5`(%rsp),$inout5
+	movups	$inout3,`16*3`($out)
+	mov	$rnds_,$rounds			# restore $rounds
+	movups	$inout4,`16*4`($out)
+	movups	$inout5,`16*5`($out)
+	lea	`16*6`($out),$out
+	sub	\$16*6,$len
+	jnc	.Lxts_dec_grandloop
+
+	lea	3($rounds,$rounds),$rounds	# restore original value
+	mov	$key_,$key			# restore $key
+	mov	$rounds,$rnds_			# backup $rounds
+
+.Lxts_dec_short:
+	add	\$16*6,$len
+	jz	.Lxts_dec_done
+
+	cmp	\$0x20,$len
+	jb	.Lxts_dec_one
+	je	.Lxts_dec_two
+
+	cmp	\$0x40,$len
+	jb	.Lxts_dec_three
+	je	.Lxts_dec_four
+
+	pshufd	\$0x13,$twtmp,$twres
+	movdqa	@tweak[5],@tweak[4]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	 movdqu	($inp),$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 movdqu	16*1($inp),$inout1
+	pxor	$twres,@tweak[5]
+
+	movdqu	16*2($inp),$inout2
+	pxor	@tweak[0],$inout0
+	movdqu	16*3($inp),$inout3
+	pxor	@tweak[1],$inout1
+	movdqu	16*4($inp),$inout4
+	lea	16*5($inp),$inp
+	pxor	@tweak[2],$inout2
+	pxor	@tweak[3],$inout3
+	pxor	@tweak[4],$inout4
+
+	call	_aesni_decrypt6
+
+	xorps	@tweak[0],$inout0
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	movdqu	$inout0,($out)
+	xorps	@tweak[3],$inout3
+	movdqu	$inout1,16*1($out)
+	xorps	@tweak[4],$inout4
+	movdqu	$inout2,16*2($out)
+	 pxor		$twtmp,$twtmp
+	movdqu	$inout3,16*3($out)
+	 pcmpgtd	@tweak[5],$twtmp
+	movdqu	$inout4,16*4($out)
+	lea	16*5($out),$out
+	 pshufd		\$0x13,$twtmp,@tweak[1]	# $twres
+	and	\$15,$len_
+	jz	.Lxts_dec_ret
+
+	movdqa	@tweak[5],@tweak[0]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	pand	$twmask,@tweak[1]		# isolate carry and residue
+	pxor	@tweak[5],@tweak[1]
+	jmp	.Lxts_dec_done2
+
+.align	16
+.Lxts_dec_one:
+	movups	($inp),$inout0
+	lea	16*1($inp),$inp
+	xorps	@tweak[0],$inout0
 ___
 	&aesni_generate1("dec",$key,$rounds);
 $code.=<<___;
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[1],@tweak[0]
 	movups	$inout0,($out)
-	jmp	.Lecb_ret
+	movdqa	@tweak[2],@tweak[1]
+	lea	16*1($out),$out
+	jmp	.Lxts_dec_done
+
 .align	16
-.Lecb_dec_two:
+.Lxts_dec_two:
+	movups	($inp),$inout0
+	movups	16($inp),$inout1
+	lea	32($inp),$inp
+	xorps	@tweak[0],$inout0
+	xorps	@tweak[1],$inout1
+
 	call	_aesni_decrypt3
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[2],@tweak[0]
+	xorps	@tweak[1],$inout1
+	movdqa	@tweak[3],@tweak[1]
 	movups	$inout0,($out)
-	movups	$inout1,0x10($out)
-	jmp	.Lecb_ret
+	movups	$inout1,16*1($out)
+	lea	16*2($out),$out
+	jmp	.Lxts_dec_done
+
 .align	16
-.Lecb_dec_three:
+.Lxts_dec_three:
+	movups	($inp),$inout0
+	movups	16*1($inp),$inout1
+	movups	16*2($inp),$inout2
+	lea	16*3($inp),$inp
+	xorps	@tweak[0],$inout0
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+
 	call	_aesni_decrypt3
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[3],@tweak[0]
+	xorps	@tweak[1],$inout1
+	movdqa	@tweak[5],@tweak[1]
+	xorps	@tweak[2],$inout2
 	movups	$inout0,($out)
-	movups	$inout1,0x10($out)
-	movups	$inout2,0x20($out)
+	movups	$inout1,16*1($out)
+	movups	$inout2,16*2($out)
+	lea	16*3($out),$out
+	jmp	.Lxts_dec_done
 
-.Lecb_ret:
+.align	16
+.Lxts_dec_four:
+	pshufd	\$0x13,$twtmp,$twres
+	movdqa	@tweak[5],@tweak[4]
+	paddq	@tweak[5],@tweak[5]		# psllq 1,$tweak
+	 movups	($inp),$inout0
+	pand	$twmask,$twres			# isolate carry and residue
+	 movups	16*1($inp),$inout1
+	pxor	$twres,@tweak[5]
+
+	movups	16*2($inp),$inout2
+	xorps	@tweak[0],$inout0
+	movups	16*3($inp),$inout3
+	lea	16*4($inp),$inp
+	xorps	@tweak[1],$inout1
+	xorps	@tweak[2],$inout2
+	xorps	@tweak[3],$inout3
+
+	call	_aesni_decrypt4
+
+	xorps	@tweak[0],$inout0
+	movdqa	@tweak[4],@tweak[0]
+	xorps	@tweak[1],$inout1
+	movdqa	@tweak[5],@tweak[1]
+	xorps	@tweak[2],$inout2
+	movups	$inout0,($out)
+	xorps	@tweak[3],$inout3
+	movups	$inout1,16*1($out)
+	movups	$inout2,16*2($out)
+	movups	$inout3,16*3($out)
+	lea	16*4($out),$out
+	jmp	.Lxts_dec_done
+
+.align	16
+.Lxts_dec_done:
+	and	\$15,$len_
+	jz	.Lxts_dec_ret
+.Lxts_dec_done2:
+	mov	$len_,$len
+	mov	$key_,$key			# restore $key
+	mov	$rnds_,$rounds			# restore $rounds
+
+	movups	($inp),$inout0
+	xorps	@tweak[1],$inout0
+___
+	&aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+	xorps	@tweak[1],$inout0
+	movups	$inout0,($out)
+
+.Lxts_dec_steal:
+	movzb	16($inp),%eax			# borrow $rounds ...
+	movzb	($out),%ecx			# ... and $key
+	lea	1($inp),$inp
+	mov	%al,($out)
+	mov	%cl,16($out)
+	lea	1($out),$out
+	sub	\$1,$len
+	jnz	.Lxts_dec_steal
+
+	sub	$len_,$out			# rewind $out
+	mov	$key_,$key			# restore $key
+	mov	$rnds_,$rounds			# restore $rounds
+
+	movups	($out),$inout0
+	xorps	@tweak[0],$inout0
+___
+	&aesni_generate1("dec",$key,$rounds);
+$code.=<<___;
+	xorps	@tweak[0],$inout0
+	movups	$inout0,($out)
+
+.Lxts_dec_ret:
+___
+$code.=<<___ if ($win64);
+	movaps	0x60(%rsp),%xmm6
+	movaps	0x70(%rsp),%xmm7
+	movaps	0x80(%rsp),%xmm8
+	movaps	0x90(%rsp),%xmm9
+	movaps	0xa0(%rsp),%xmm10
+	movaps	0xb0(%rsp),%xmm11
+	movaps	0xc0(%rsp),%xmm12
+	movaps	0xd0(%rsp),%xmm13
+	movaps	0xe0(%rsp),%xmm14
+	movaps	0xf0(%rsp),%xmm15
+___
+$code.=<<___;
+	lea	$frame_size(%rsp),%rsp
+.Lxts_dec_epilogue:
 	ret
-.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
+.size	aesni_xts_decrypt,.-aesni_xts_decrypt
 ___
-}
+} }}
 
+########################################################################
 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
 #			    size_t length, const AES_KEY *key,
 #			    unsigned char *ivp,const int enc);
-$reserved = $win64?0x40:-0x18;	# used in decrypt
+{
+my $reserved = $win64?0x40:-0x18;	# used in decrypt
 $code.=<<___;
 .globl	${PREFIX}_cbc_encrypt
 .type	${PREFIX}_cbc_encrypt,\@function,6
@@ -371,30 +2129,30 @@ ${PREFIX}_cbc_encrypt:
 	test	$len,$len		# check length
 	jz	.Lcbc_ret
 
-	mov	240($key),$rnds_	# pull $rounds
+	mov	240($key),$rnds_	# key->rounds
 	mov	$key,$key_		# backup $key
 	test	%r9d,%r9d		# 6th argument
 	jz	.Lcbc_decrypt
 #--------------------------- CBC ENCRYPT ------------------------------#
 	movups	($ivp),$inout0		# load iv as initial state
-	cmp	\$16,$len
 	mov	$rnds_,$rounds
+	cmp	\$16,$len
 	jb	.Lcbc_enc_tail
 	sub	\$16,$len
 	jmp	.Lcbc_enc_loop
-.align 16
+.align	16
 .Lcbc_enc_loop:
 	movups	($inp),$inout1		# load input
 	lea	16($inp),$inp
-	pxor	$inout1,$inout0
+	#xorps	$inout1,$inout0
 ___
-	&aesni_generate1("enc",$key,$rounds);
+	&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
 $code.=<<___;
-	sub	\$16,$len
-	lea	16($out),$out
 	mov	$rnds_,$rounds		# restore $rounds
 	mov	$key_,$key		# restore $key
-	movups	$inout0,-16($out)	# store output
+	movups	$inout0,0($out)		# store output
+	lea	16($out),$out
+	sub	\$16,$len
 	jnc	.Lcbc_enc_loop
 	add	\$16,$len
 	jnz	.Lcbc_enc_tail
@@ -429,92 +2187,238 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	movups	($ivp),$iv
-	sub	\$0x40,$len
 	mov	$rnds_,$rounds
+	cmp	\$0x70,$len
 	jbe	.Lcbc_dec_tail
-	jmp	.Lcbc_dec_loop3
-.align 16
-.Lcbc_dec_loop3:
-	movups	($inp),$inout0
+	shr	\$1,$rnds_
+	sub	\$0x70,$len
+	mov	$rnds_,$rounds
+	movaps	$iv,$reserved(%rsp)
+	jmp	.Lcbc_dec_loop8_enter
+.align	16
+.Lcbc_dec_loop8:
+	movaps	$rndkey0,$reserved(%rsp)	# save IV
+	movups	$inout7,($out)
+	lea	0x10($out),$out
+.Lcbc_dec_loop8_enter:
+	$movkey		($key),$rndkey0
+	movups	($inp),$inout0			# load input
 	movups	0x10($inp),$inout1
-	movups	0x20($inp),$inout2
-	movaps	$inout0,$in0
-	movaps	$inout1,$in1
-	movaps	$inout2,$in2
-	call	_aesni_decrypt3
-	sub	\$0x30,$len
-	lea	0x30($inp),$inp
-	lea	0x30($out),$out
-	pxor	$iv,$inout0
-	pxor	$in0,$inout1
-	movaps	$in2,$iv
-	pxor	$in1,$inout2
-	movups	$inout0,-0x30($out)
-	mov	$rnds_,$rounds	# restore $rounds
-	movups	$inout1,-0x20($out)
-	mov	$key_,$key	# restore $key
-	movups	$inout2,-0x10($out)
-	ja	.Lcbc_dec_loop3
+	$movkey		16($key),$rndkey1
 
-.Lcbc_dec_tail:
-	add	\$0x40,$len
-	movups	$iv,($ivp)
-	jz	.Lcbc_dec_ret
+	lea		32($key),$key
+	movdqu	0x20($inp),$inout2
+	xorps		$rndkey0,$inout0
+	movdqu	0x30($inp),$inout3
+	xorps		$rndkey0,$inout1
+	movdqu	0x40($inp),$inout4
+	aesdec		$rndkey1,$inout0
+	pxor		$rndkey0,$inout2
+	movdqu	0x50($inp),$inout5
+	aesdec		$rndkey1,$inout1
+	pxor		$rndkey0,$inout3
+	movdqu	0x60($inp),$inout6
+	aesdec		$rndkey1,$inout2
+	pxor		$rndkey0,$inout4
+	movdqu	0x70($inp),$inout7
+	aesdec		$rndkey1,$inout3
+	pxor		$rndkey0,$inout5
+	dec		$rounds
+	aesdec		$rndkey1,$inout4
+	pxor		$rndkey0,$inout6
+	aesdec		$rndkey1,$inout5
+	pxor		$rndkey0,$inout7
+	$movkey		($key),$rndkey0
+	aesdec		$rndkey1,$inout6
+	aesdec		$rndkey1,$inout7
+	$movkey		16($key),$rndkey1
 
+	call		.Ldec_loop8_enter
+
+	movups	($inp),$rndkey1		# re-load input
+	movups	0x10($inp),$rndkey0
+	xorps	$reserved(%rsp),$inout0	# ^= IV
+	xorps	$rndkey1,$inout1
+	movups	0x20($inp),$rndkey1
+	xorps	$rndkey0,$inout2
+	movups	0x30($inp),$rndkey0
+	xorps	$rndkey1,$inout3
+	movups	0x40($inp),$rndkey1
+	xorps	$rndkey0,$inout4
+	movups	0x50($inp),$rndkey0
+	xorps	$rndkey1,$inout5
+	movups	0x60($inp),$rndkey1
+	xorps	$rndkey0,$inout6
+	movups	0x70($inp),$rndkey0	# IV
+	xorps	$rndkey1,$inout7
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	mov	$rnds_,$rounds		# restore $rounds
+	movups	$inout4,0x40($out)
+	mov	$key_,$key		# restore $key
+	movups	$inout5,0x50($out)
+	lea	0x80($inp),$inp
+	movups	$inout6,0x60($out)
+	lea	0x70($out),$out
+	sub	\$0x80,$len
+	ja	.Lcbc_dec_loop8
+
+	movaps	$inout7,$inout0
+	movaps	$rndkey0,$iv
+	add	\$0x70,$len
+	jle	.Lcbc_dec_tail_collected
+	movups	$inout0,($out)
+	lea	1($rnds_,$rnds_),$rounds
+	lea	0x10($out),$out
+.Lcbc_dec_tail:
 	movups	($inp),$inout0
-	cmp	\$0x10,$len
 	movaps	$inout0,$in0
+	cmp	\$0x10,$len
 	jbe	.Lcbc_dec_one
+
 	movups	0x10($inp),$inout1
-	cmp	\$0x20,$len
 	movaps	$inout1,$in1
+	cmp	\$0x20,$len
 	jbe	.Lcbc_dec_two
+
 	movups	0x20($inp),$inout2
-	cmp	\$0x30,$len
 	movaps	$inout2,$in2
+	cmp	\$0x30,$len
 	jbe	.Lcbc_dec_three
+
 	movups	0x30($inp),$inout3
-	call	_aesni_decrypt4
-	pxor	$iv,$inout0
-	movups	0x30($inp),$iv
-	pxor	$in0,$inout1
+	cmp	\$0x40,$len
+	jbe	.Lcbc_dec_four
+
+	movups	0x40($inp),$inout4
+	cmp	\$0x50,$len
+	jbe	.Lcbc_dec_five
+
+	movups	0x50($inp),$inout5
+	cmp	\$0x60,$len
+	jbe	.Lcbc_dec_six
+
+	movups	0x60($inp),$inout6
+	movaps	$iv,$reserved(%rsp)	# save IV
+	call	_aesni_decrypt8
+	movups	($inp),$rndkey1
+	movups	0x10($inp),$rndkey0
+	xorps	$reserved(%rsp),$inout0	# ^= IV
+	xorps	$rndkey1,$inout1
+	movups	0x20($inp),$rndkey1
+	xorps	$rndkey0,$inout2
+	movups	0x30($inp),$rndkey0
+	xorps	$rndkey1,$inout3
+	movups	0x40($inp),$rndkey1
+	xorps	$rndkey0,$inout4
+	movups	0x50($inp),$rndkey0
+	xorps	$rndkey1,$inout5
+	movups	0x60($inp),$iv		# IV
+	xorps	$rndkey0,$inout6
 	movups	$inout0,($out)
-	pxor	$in1,$inout2
 	movups	$inout1,0x10($out)
-	pxor	$in2,$inout3
 	movups	$inout2,0x20($out)
-	movaps	$inout3,$inout0
-	lea	0x30($out),$out
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	movups	$inout5,0x50($out)
+	lea	0x60($out),$out
+	movaps	$inout6,$inout0
+	sub	\$0x70,$len
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_one:
 ___
 	&aesni_generate1("dec",$key,$rounds);
 $code.=<<___;
-	pxor	$iv,$inout0
+	xorps	$iv,$inout0
 	movaps	$in0,$iv
+	sub	\$0x10,$len
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_two:
+	xorps	$inout2,$inout2
 	call	_aesni_decrypt3
-	pxor	$iv,$inout0
-	pxor	$in0,$inout1
+	xorps	$iv,$inout0
+	xorps	$in0,$inout1
 	movups	$inout0,($out)
 	movaps	$in1,$iv
 	movaps	$inout1,$inout0
 	lea	0x10($out),$out
+	sub	\$0x20,$len
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_three:
 	call	_aesni_decrypt3
-	pxor	$iv,$inout0
-	pxor	$in0,$inout1
+	xorps	$iv,$inout0
+	xorps	$in0,$inout1
 	movups	$inout0,($out)
-	pxor	$in1,$inout2
+	xorps	$in1,$inout2
 	movups	$inout1,0x10($out)
 	movaps	$in2,$iv
 	movaps	$inout2,$inout0
 	lea	0x20($out),$out
+	sub	\$0x30,$len
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_four:
+	call	_aesni_decrypt4
+	xorps	$iv,$inout0
+	movups	0x30($inp),$iv
+	xorps	$in0,$inout1
+	movups	$inout0,($out)
+	xorps	$in1,$inout2
+	movups	$inout1,0x10($out)
+	xorps	$in2,$inout3
+	movups	$inout2,0x20($out)
+	movaps	$inout3,$inout0
+	lea	0x30($out),$out
+	sub	\$0x40,$len
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_five:
+	xorps	$inout5,$inout5
+	call	_aesni_decrypt6
+	movups	0x10($inp),$rndkey1
+	movups	0x20($inp),$rndkey0
+	xorps	$iv,$inout0
+	xorps	$in0,$inout1
+	xorps	$rndkey1,$inout2
+	movups	0x30($inp),$rndkey1
+	xorps	$rndkey0,$inout3
+	movups	0x40($inp),$iv
+	xorps	$rndkey1,$inout4
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	lea	0x40($out),$out
+	movaps	$inout4,$inout0
+	sub	\$0x50,$len
+	jmp	.Lcbc_dec_tail_collected
+.align	16
+.Lcbc_dec_six:
+	call	_aesni_decrypt6
+	movups	0x10($inp),$rndkey1
+	movups	0x20($inp),$rndkey0
+	xorps	$iv,$inout0
+	xorps	$in0,$inout1
+	xorps	$rndkey1,$inout2
+	movups	0x30($inp),$rndkey1
+	xorps	$rndkey0,$inout3
+	movups	0x40($inp),$rndkey0
+	xorps	$rndkey1,$inout4
+	movups	0x50($inp),$iv
+	xorps	$rndkey0,$inout5
+	movups	$inout0,($out)
+	movups	$inout1,0x10($out)
+	movups	$inout2,0x20($out)
+	movups	$inout3,0x30($out)
+	movups	$inout4,0x40($out)
+	lea	0x50($out),$out
+	movaps	$inout5,$inout0
+	sub	\$0x60,$len
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_tail_collected:
@@ -523,10 +2427,12 @@ $code.=<<___;
 	jnz	.Lcbc_dec_tail_partial
 	movups	$inout0,($out)
 	jmp	.Lcbc_dec_ret
+.align	16
 .Lcbc_dec_tail_partial:
 	movaps	$inout0,$reserved(%rsp)
+	mov	\$16,%rcx
 	mov	$out,%rdi
-	mov	$len,%rcx
+	sub	$len,%rcx
 	lea	$reserved(%rsp),%rsi
 	.long	0x9066A4F3	# rep movsb
 
@@ -544,7 +2450,7 @@ $code.=<<___;
 	ret
 .size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
 ___
-
+} 
 # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
 #				int bits, AES_KEY *key)
 { my ($inp,$bits,$key) = @_4args;
@@ -556,7 +2462,7 @@ $code.=<<___;
 .align	16
 ${PREFIX}_set_decrypt_key:
 	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
-	call	_aesni_set_encrypt_key
+	call	__aesni_set_encrypt_key
 	shl	\$4,$bits		# rounds-1 after _aesni_set_encrypt_key
 	test	%eax,%eax
 	jnz	.Ldec_key_ret
@@ -576,9 +2482,9 @@ ${PREFIX}_set_decrypt_key:
 	aesimc	%xmm1,%xmm1
 	lea	16($key),$key
 	lea	-16($inp),$inp
-	cmp	$key,$inp
 	$movkey	%xmm0,16($inp)
 	$movkey	%xmm1,-16($key)
+	cmp	$key,$inp
 	ja	.Ldec_key_inverse
 
 	$movkey	($key),%xmm0		# inverse middle
@@ -605,16 +2511,16 @@ $code.=<<___;
 .type	${PREFIX}_set_encrypt_key,\@abi-omnipotent
 .align	16
 ${PREFIX}_set_encrypt_key:
-_aesni_set_encrypt_key:
+__aesni_set_encrypt_key:
 	.byte	0x48,0x83,0xEC,0x08	# sub rsp,8
-	test	$inp,$inp
 	mov	\$-1,%rax
+	test	$inp,$inp
 	jz	.Lenc_key_ret
 	test	$key,$key
 	jz	.Lenc_key_ret
 
 	movups	($inp),%xmm0		# pull first 128 bits of *userKey
-	pxor	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
+	xorps	%xmm4,%xmm4		# low dword of xmm4 is assumed 0
 	lea	16($key),%rax
 	cmp	\$256,$bits
 	je	.L14rounds
@@ -729,11 +2635,11 @@ _aesni_set_encrypt_key:
 	lea	16(%rax),%rax
 .Lkey_expansion_128_cold:
 	shufps	\$0b00010000,%xmm0,%xmm4
-	pxor	%xmm4, %xmm0
+	xorps	%xmm4, %xmm0
 	shufps	\$0b10001100,%xmm0,%xmm4
-	pxor	%xmm4, %xmm0
-	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
-	pxor	%xmm1,%xmm0
+	xorps	%xmm4, %xmm0
+	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
+	xorps	%xmm1,%xmm0
 	ret
 
 .align 16
@@ -744,11 +2650,11 @@ _aesni_set_encrypt_key:
 	movaps	%xmm2, %xmm5
 .Lkey_expansion_192b_warm:
 	shufps	\$0b00010000,%xmm0,%xmm4
-	movaps	%xmm2,%xmm3
-	pxor	%xmm4,%xmm0
+	movdqa	%xmm2,%xmm3
+	xorps	%xmm4,%xmm0
 	shufps	\$0b10001100,%xmm0,%xmm4
 	pslldq	\$4,%xmm3
-	pxor	%xmm4,%xmm0
+	xorps	%xmm4,%xmm0
 	pshufd	\$0b01010101,%xmm1,%xmm1	# critical path
 	pxor	%xmm3,%xmm2
 	pxor	%xmm1,%xmm0
@@ -772,11 +2678,11 @@ _aesni_set_encrypt_key:
 	lea	16(%rax),%rax
 .Lkey_expansion_256a_cold:
 	shufps	\$0b00010000,%xmm0,%xmm4
-	pxor	%xmm4,%xmm0
+	xorps	%xmm4,%xmm0
 	shufps	\$0b10001100,%xmm0,%xmm4
-	pxor	%xmm4,%xmm0
-	pshufd	\$0b11111111,%xmm1,%xmm1	# critical path
-	pxor	%xmm1,%xmm0
+	xorps	%xmm4,%xmm0
+	shufps	\$0b11111111,%xmm1,%xmm1	# critical path
+	xorps	%xmm1,%xmm0
 	ret
 
 .align 16
@@ -785,17 +2691,28 @@ _aesni_set_encrypt_key:
 	lea	16(%rax),%rax
 
 	shufps	\$0b00010000,%xmm2,%xmm4
-	pxor	%xmm4,%xmm2
+	xorps	%xmm4,%xmm2
 	shufps	\$0b10001100,%xmm2,%xmm4
-	pxor	%xmm4,%xmm2
-	pshufd	\$0b10101010,%xmm1,%xmm1	# critical path
-	pxor	%xmm1,%xmm2
+	xorps	%xmm4,%xmm2
+	shufps	\$0b10101010,%xmm1,%xmm1	# critical path
+	xorps	%xmm1,%xmm2
 	ret
 .size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
 ___
 }
 
 $code.=<<___;
+.align	64
+.Lbswap_mask:
+	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement32:
+	.long	6,6,6,0
+.Lincrement64:
+	.long	1,0,0,0
+.Lxts_magic:
+	.long	0x87,0,1,0
+
 .asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
 .align	64
 ___
@@ -810,9 +2727,11 @@ $disp="%r9";
 
 $code.=<<___;
 .extern	__imp_RtlVirtualUnwind
-.type	cbc_se_handler,\@abi-omnipotent
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.type	ecb_se_handler,\@abi-omnipotent
 .align	16
-cbc_se_handler:
+ecb_se_handler:
 	push	%rsi
 	push	%rdi
 	push	%rbx
@@ -825,42 +2744,132 @@ cbc_se_handler:
 	sub	\$64,%rsp
 
 	mov	152($context),%rax	# pull context->Rsp
+
+	jmp	.Lcommon_seh_tail
+.size	ecb_se_handler,.-ecb_se_handler
+
+.type	ccm64_se_handler,\@abi-omnipotent
+.align	16
+ccm64_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
 	mov	248($context),%rbx	# pull context->Rip
 
-	lea	.Lcbc_decrypt(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<"prologue" label
-	jb	.Lin_prologue
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
 
-	lea	.Lcbc_decrypt_body(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
-	jb	.Lrestore_rax
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
 
-	lea	.Lcbc_ret(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip>="epilogue" label
-	jae	.Lin_prologue
+	mov	152($context),%rax	# pull context->Rsp
 
-	lea	0(%rax),%rsi		# top of stack
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	0(%rax),%rsi		# %xmm save area
 	lea	512($context),%rdi	# &context.Xmm6
 	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
 	.long	0xa548f3fc		# cld; rep movsq
 	lea	0x58(%rax),%rax		# adjust stack pointer
-	jmp	.Lin_prologue
 
-.Lrestore_rax:
-	mov	120($context),%rax
-.Lin_prologue:
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rax,152($context)	# restore context->Rsp
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
+	jmp	.Lcommon_seh_tail
+.size	ccm64_se_handler,.-ccm64_se_handler
 
-	jmp	.Lcommon_seh_exit
-.size	cbc_se_handler,.-cbc_se_handler
+.type	ctr32_se_handler,\@abi-omnipotent
+.align	16
+ctr32_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
 
-.type	ecb_se_handler,\@abi-omnipotent
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lctr32_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<"prologue" label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lctr32_ret(%rip),%r10
+	cmp	%r10,%rbx
+	jae	.Lcommon_seh_tail
+
+	lea	0x20(%rax),%rsi		# %xmm save area
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0xc8(%rax),%rax		# adjust stack pointer
+
+	jmp	.Lcommon_seh_tail
+.size	ctr32_se_handler,.-ctr32_se_handler
+
+.type	xts_se_handler,\@abi-omnipotent
 .align	16
-ecb_se_handler:
+xts_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue lable
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	0x60(%rax),%rsi		# %xmm save area
+	lea	512($context),%rdi	# & context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0x68+160(%rax),%rax	# adjust stack pointer
+
+	jmp	.Lcommon_seh_tail
+.size	xts_se_handler,.-xts_se_handler
+___
+$code.=<<___;
+.type	cbc_se_handler,\@abi-omnipotent
+.align	16
+cbc_se_handler:
 	push	%rsi
 	push	%rdi
 	push	%rbx
@@ -873,13 +2882,37 @@ ecb_se_handler:
 	sub	\$64,%rsp
 
 	mov	152($context),%rax	# pull context->Rsp
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lcbc_decrypt(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<"prologue" label
+	jb	.Lcommon_seh_tail
+
+	lea	.Lcbc_decrypt_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
+	jb	.Lrestore_cbc_rax
+
+	lea	.Lcbc_ret(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>="epilogue" label
+	jae	.Lcommon_seh_tail
+
+	lea	0(%rax),%rsi		# top of stack
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0x58(%rax),%rax		# adjust stack pointer
+	jmp	.Lcommon_seh_tail
+
+.Lrestore_cbc_rax:
+	mov	120($context),%rax
+
+.Lcommon_seh_tail:
 	mov	8(%rax),%rdi
 	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
 	mov	%rsi,168($context)	# restore context->Rsi
 	mov	%rdi,176($context)	# restore context->Rdi
 
-.Lcommon_seh_exit:
-
 	mov	40($disp),%rdi		# disp->ContextRecord
 	mov	$context,%rsi		# context
 	mov	\$154,%ecx		# sizeof(CONTEXT)
@@ -915,10 +2948,33 @@ ecb_se_handler:
 
 .section	.pdata
 .align	4
-	.rva	.LSEH_begin_${PREFIX}_ecb_encrypt
-	.rva	.LSEH_end_${PREFIX}_ecb_encrypt
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+	.rva	.LSEH_begin_aesni_ecb_encrypt
+	.rva	.LSEH_end_aesni_ecb_encrypt
 	.rva	.LSEH_info_ecb
 
+	.rva	.LSEH_begin_aesni_ccm64_encrypt_blocks
+	.rva	.LSEH_end_aesni_ccm64_encrypt_blocks
+	.rva	.LSEH_info_ccm64_enc
+
+	.rva	.LSEH_begin_aesni_ccm64_decrypt_blocks
+	.rva	.LSEH_end_aesni_ccm64_decrypt_blocks
+	.rva	.LSEH_info_ccm64_dec
+
+	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
+	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
+	.rva	.LSEH_info_ctr32
+
+	.rva	.LSEH_begin_aesni_xts_encrypt
+	.rva	.LSEH_end_aesni_xts_encrypt
+	.rva	.LSEH_info_xts_enc
+
+	.rva	.LSEH_begin_aesni_xts_decrypt
+	.rva	.LSEH_end_aesni_xts_decrypt
+	.rva	.LSEH_info_xts_dec
+___
+$code.=<<___;
 	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
 	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
 	.rva	.LSEH_info_cbc
@@ -932,28 +2988,49 @@ ecb_se_handler:
 	.rva	.LSEH_info_key
 .section	.xdata
 .align	8
+___
+$code.=<<___ if ($PREFIX eq "aesni");
 .LSEH_info_ecb:
 	.byte	9,0,0,0
 	.rva	ecb_se_handler
+.LSEH_info_ccm64_enc:
+	.byte	9,0,0,0
+	.rva	ccm64_se_handler
+	.rva	.Lccm64_enc_body,.Lccm64_enc_ret	# HandlerData[]
+.LSEH_info_ccm64_dec:
+	.byte	9,0,0,0
+	.rva	ccm64_se_handler
+	.rva	.Lccm64_dec_body,.Lccm64_dec_ret	# HandlerData[]
+.LSEH_info_ctr32:
+	.byte	9,0,0,0
+	.rva	ctr32_se_handler
+.LSEH_info_xts_enc:
+	.byte	9,0,0,0
+	.rva	xts_se_handler
+	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
+.LSEH_info_xts_dec:
+	.byte	9,0,0,0
+	.rva	xts_se_handler
+	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
+___
+$code.=<<___;
 .LSEH_info_cbc:
 	.byte	9,0,0,0
 	.rva	cbc_se_handler
 .LSEH_info_key:
 	.byte	0x01,0x04,0x01,0x00
-	.byte	0x04,0x02,0x00,0x00
+	.byte	0x04,0x02,0x00,0x00	# sub rsp,8
 ___
 }
 
 sub rex {
- local *opcode=shift;
- my ($dst,$src)=@_;
-
-   if ($dst>=8 || $src>=8) {
-	$rex=0x40;
-	$rex|=0x04 if($dst>=8);
-	$rex|=0x01 if($src>=8);
-	push @opcode,$rex;
-   }
+  local *opcode=shift;
+  my ($dst,$src)=@_;
+  my $rex=0;
+
+    $rex|=0x04			if($dst>=8);
+    $rex|=0x01			if($src>=8);
+    push @opcode,$rex|0x40	if($rex);
 }
 
 sub aesni {
@@ -989,4 +3066,3 @@ $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
 print $code;
 
 close STDOUT;
-
diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
new file mode 100644
index 0000000000..c9c6312fa7
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl
@@ -0,0 +1,3044 @@
+#!/usr/bin/env perl
+
+###################################################################
+### AES-128 [originally in CTR mode]				###
+### bitsliced implementation for Intel Core 2 processors	###
+### requires support of SSE extensions up to SSSE3		###
+### Author: Emilia Käsper and Peter Schwabe			###
+### Date: 2009-03-19						###
+### Public domain						###
+###								###
+### See http://homes.esat.kuleuven.be/~ekasper/#software for	###
+### further information.					###
+###################################################################
+#
+# September 2011.
+#
+# Started as transliteration to "perlasm" the original code has
+# undergone following changes:
+#
+# - code was made position-independent;
+# - rounds were folded into a loop resulting in >5x size reduction
+#   from 12.5KB to 2.2KB;
+# - above was possibile thanks to mixcolumns() modification that
+#   allowed to feed its output back to aesenc[last], this was
+#   achieved at cost of two additional inter-registers moves;
+# - some instruction reordering and interleaving;
+# - this module doesn't implement key setup subroutine, instead it
+#   relies on conversion of "conventional" key schedule as returned
+#   by AES_set_encrypt_key (see discussion below);
+# - first and last round keys are treated differently, which allowed
+#   to skip one shiftrows(), reduce bit-sliced key schedule and
+#   speed-up conversion by 22%;
+# - support for 192- and 256-bit keys was added;
+#
+# Resulting performance in CPU cycles spent to encrypt one byte out
+# of 4096-byte buffer with 128-bit key is:
+#
+#		Emilia's	this(*)		difference
+#
+# Core 2    	9.30		8.69		+7%
+# Nehalem(**) 	7.63		6.98		+9%
+# Atom	    	17.1		17.4		-2%(***)
+#
+# (*)	Comparison is not completely fair, because "this" is ECB,
+#	i.e. no extra processing such as counter values calculation
+#	and xor-ing input as in Emilia's CTR implementation is
+#	performed. However, the CTR calculations stand for not more
+#	than 1% of total time, so comparison is *rather* fair.
+#
+# (**)	Results were collected on Westmere, which is considered to
+#	be equivalent to Nehalem for this code.
+#
+# (***)	Slowdown on Atom is rather strange per se, because original
+#	implementation has a number of 9+-bytes instructions, which
+#	are bad for Atom front-end, and which I eliminated completely.
+#	In attempt to address deterioration sbox() was tested in FP
+#	SIMD "domain" (movaps instead of movdqa, xorps instead of
+#	pxor, etc.). While it resulted in nominal 4% improvement on
+#	Atom, it hurted Westmere by more than 2x factor.
+#
+# As for key schedule conversion subroutine. Interface to OpenSSL
+# relies on per-invocation on-the-fly conversion. This naturally
+# has impact on performance, especially for short inputs. Conversion
+# time in CPU cycles and its ratio to CPU cycles spent in 8x block
+# function is:
+#
+# 		conversion	conversion/8x block
+# Core 2	240		0.22
+# Nehalem	180		0.20
+# Atom		430		0.19
+#
+# The ratio values mean that 128-byte blocks will be processed
+# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
+# etc. Then keep in mind that input sizes not divisible by 128 are
+# *effectively* slower, especially shortest ones, e.g. consecutive
+# 144-byte blocks are processed 44% slower than one would expect,
+# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
+# it's still faster than ["hyper-threading-safe" code path in]
+# aes-x86_64.pl on all lengths above 64 bytes...
+#
+# October 2011.
+#
+# Add decryption procedure. Performance in CPU cycles spent to decrypt
+# one byte out of 4096-byte buffer with 128-bit key is:
+#
+# Core 2	11.0
+# Nehalem	9.16
+# Atom		20.9
+#
+# November 2011.
+#
+# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
+# suboptimal, but XTS is meant to be used with larger blocks...
+#
+#						<appro@openssl.org>
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
+my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
+my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
+
+{
+my ($key,$rounds,$const)=("%rax","%r10d","%r11");
+
+sub Sbox {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+	&InBasisChange	(@b);
+	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
+	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 
+my @b=@_[0..7];
+$code.=<<___;
+	pxor	@b[6], @b[5]
+	pxor	@b[1], @b[2]
+	pxor	@b[0], @b[3]
+	pxor	@b[2], @b[6]
+	pxor 	@b[0], @b[5]
+
+	pxor	@b[3], @b[6]
+	pxor	@b[7], @b[3]
+	pxor	@b[5], @b[7]
+	pxor	@b[4], @b[3]
+	pxor	@b[5], @b[4]
+	pxor	@b[1], @b[3]
+
+	pxor	@b[7], @b[2]
+	pxor	@b[5], @b[1]
+___
+}
+
+sub OutBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+	pxor	@b[6], @b[0]
+	pxor	@b[4], @b[1]
+	pxor	@b[0], @b[2]
+	pxor	@b[6], @b[4]
+	pxor	@b[1], @b[6]
+
+	pxor	@b[5], @b[1]
+	pxor	@b[3], @b[5]
+	pxor	@b[7], @b[3]
+	pxor	@b[5], @b[7]
+	pxor	@b[5], @b[2]
+
+	pxor	@b[7], @b[4]
+___
+}
+
+sub InvSbox {
+# input in lsb 	> [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb	> [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+	&InvInBasisChange	(@b);
+	&Inv_GF256		(@b[5,1,2,6,3,7,0,4],@t,@s);
+	&InvOutBasisChange	(@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange {		# OutBasisChange in reverse
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+	pxor	@b[7], @b[4]
+
+	pxor	@b[5], @b[7]
+	pxor	@b[5], @b[2]
+	pxor	@b[7], @b[3]
+	pxor	@b[3], @b[5]
+	pxor	@b[5], @b[1]
+
+	pxor	@b[1], @b[6]
+	pxor	@b[0], @b[2]
+	pxor	@b[6], @b[4]
+	pxor	@b[6], @b[0]
+	pxor	@b[4], @b[1]
+___
+}
+
+sub InvOutBasisChange {		# InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+	pxor	@b[5], @b[1]
+	pxor	@b[7], @b[2]
+
+	pxor	@b[1], @b[3]
+	pxor	@b[5], @b[4]
+	pxor	@b[5], @b[7]
+	pxor	@b[4], @b[3]
+	 pxor 	@b[0], @b[5]
+	pxor	@b[7], @b[3]
+	 pxor	@b[2], @b[6]
+	 pxor	@b[1], @b[2]
+	pxor	@b[3], @b[6]
+
+	pxor	@b[0], @b[3]
+	pxor	@b[6], @b[5]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+	movdqa	$y0, $t0
+	pxor 	$y1, $t0
+	pand	$x0, $t0
+	pxor	$x1, $x0
+	pand	$y0, $x1
+	pand	$y1, $x0
+	pxor	$x1, $x0
+	pxor	$t0, $x1
+___
+}
+
+sub Mul_GF4_N {				# not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+	movdqa	$y0, $t0
+	pxor	$y1, $t0
+	pand	$x0, $t0
+	pxor	$x1, $x0
+	pand	$y0, $x1
+	pand	$y1, $x0
+	pxor	$x0, $x1
+	pxor	$t0, $x0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+    $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+	movdqa	$y0, $t0
+	 movdqa	$y2, $t1
+	pxor	$y1, $t0
+	 pxor 	$y3, $t1
+	pand	$x0, $t0
+	 pand	$x2, $t1
+	pxor	$x1, $x0
+	 pxor	$x3, $x2
+	pand	$y0, $x1
+	 pand	$y2, $x3
+	pand	$y1, $x0
+	 pand	$y3, $x2
+	pxor	$x0, $x1
+	 pxor	$x3, $x2
+	pxor	$t0, $x0
+	 pxor	$t1, $x3
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+	movdqa	@x[0], @t[0]
+	movdqa	@x[1], @t[1]
+___
+	&Mul_GF4  	(@x[0], @x[1], @y[0], @y[1], @t[2]);
+$code.=<<___;
+	pxor	@x[2], @t[0]
+	pxor	@x[3], @t[1]
+	pxor	@y[2], @y[0]
+	pxor	@y[3], @y[1]
+___
+	Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
+			 @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+	pxor	@t[0], @x[0]
+	pxor	@t[0], @x[2]
+	pxor	@t[1], @x[1]
+	pxor	@t[1], @x[3]
+
+	movdqa	@x[4], @t[0]
+	movdqa	@x[5], @t[1]
+	pxor	@x[6], @t[0]
+	pxor	@x[7], @t[1]
+___
+	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
+			 @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+	pxor	@y[2], @y[0]
+	pxor	@y[3], @y[1]
+___
+	&Mul_GF4  	(@x[4], @x[5], @y[0], @y[1], @t[3]);
+$code.=<<___;
+	pxor	@t[0], @x[4]
+	pxor	@t[0], @x[6]
+	pxor	@t[1], @x[5]
+	pxor	@t[1], @x[7]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+	movdqa	@x[4], @t[3]
+	movdqa	@x[5], @t[2]
+	movdqa	@x[1], @t[1]
+	movdqa	@x[7], @s[1]
+	movdqa	@x[0], @s[0]
+
+	pxor	@x[6], @t[3]
+	pxor	@x[7], @t[2]
+	pxor	@x[3], @t[1]
+	 movdqa	@t[3], @s[2]
+	pxor	@x[6], @s[1]
+	 movdqa	@t[2], @t[0]
+	pxor	@x[2], @s[0]
+	 movdqa	@t[3], @s[3]
+
+	por	@t[1], @t[2]
+	por	@s[0], @t[3]
+	pxor	@t[0], @s[3]
+	pand	@s[0], @s[2]
+	pxor	@t[1], @s[0]
+	pand	@t[1], @t[0]
+	pand	@s[0], @s[3]
+	movdqa	@x[3], @s[0]
+	pxor	@x[2], @s[0]
+	pand	@s[0], @s[1]
+	pxor	@s[1], @t[3]
+	pxor	@s[1], @t[2]
+	movdqa	@x[4], @s[1]
+	movdqa	@x[1], @s[0]
+	pxor	@x[5], @s[1]
+	pxor	@x[0], @s[0]
+	movdqa	@s[1], @t[1]
+	pand	@s[0], @s[1]
+	por	@s[0], @t[1]
+	pxor	@s[1], @t[0]
+	pxor	@s[3], @t[3]
+	pxor	@s[2], @t[2]
+	pxor	@s[3], @t[1]
+	movdqa	@x[7], @s[0]
+	pxor	@s[2], @t[0]
+	movdqa	@x[6], @s[1]
+	pxor	@s[2], @t[1]
+	movdqa	@x[5], @s[2]
+	pand	@x[3], @s[0]
+	movdqa	@x[4], @s[3]
+	pand	@x[2], @s[1]
+	pand	@x[1], @s[2]
+	por	@x[0], @s[3]
+	pxor	@s[0], @t[3]
+	pxor	@s[1], @t[2]
+	pxor	@s[2], @t[1]
+	pxor	@s[3], @t[0] 
+
+	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+	# new smaller inversion
+
+	movdqa	@t[3], @s[0]
+	pand	@t[1], @t[3]
+	pxor	@t[2], @s[0]
+
+	movdqa	@t[0], @s[2]
+	movdqa	@s[0], @s[3]
+	pxor	@t[3], @s[2]
+	pand	@s[2], @s[3]
+
+	movdqa	@t[1], @s[1]
+	pxor	@t[2], @s[3]
+	pxor	@t[0], @s[1]
+
+	pxor	@t[2], @t[3]
+
+	pand	@t[3], @s[1]
+
+	movdqa	@s[2], @t[2]
+	pxor	@t[0], @s[1]
+
+	pxor	@s[1], @t[2]
+	pxor	@s[1], @t[1]
+
+	pand	@t[0], @t[2]
+
+	pxor	@t[2], @s[2]
+	pxor	@t[2], @t[1]
+
+	pand	@s[3], @s[2]
+
+	pxor	@s[0], @s[2]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my $mask=pop;
+$code.=<<___;
+	pxor	0x00($key),@x[0]
+	pxor	0x10($key),@x[1]
+	pshufb	$mask,@x[0]
+	pxor	0x20($key),@x[2]
+	pshufb	$mask,@x[1]
+	pxor	0x30($key),@x[3]
+	pshufb	$mask,@x[2]
+	pxor	0x40($key),@x[4]
+	pshufb	$mask,@x[3]
+	pxor	0x50($key),@x[5]
+	pshufb	$mask,@x[4]
+	pxor	0x60($key),@x[6]
+	pshufb	$mask,@x[5]
+	pxor	0x70($key),@x[7]
+	pshufb	$mask,@x[6]
+	lea	0x80($key),$key
+	pshufb	$mask,@x[7]
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
+	pshufd	\$0x93, @x[1], @t[1]
+	 pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
+	pshufd	\$0x93, @x[2], @t[2]
+	 pxor	@t[1], @x[1]
+	pshufd	\$0x93, @x[3], @t[3]
+	 pxor	@t[2], @x[2]
+	pshufd	\$0x93, @x[4], @t[4]
+	 pxor	@t[3], @x[3]
+	pshufd	\$0x93, @x[5], @t[5]
+	 pxor	@t[4], @x[4]
+	pshufd	\$0x93, @x[6], @t[6]
+	 pxor	@t[5], @x[5]
+	pshufd	\$0x93, @x[7], @t[7]
+	 pxor	@t[6], @x[6]
+	 pxor	@t[7], @x[7]
+
+	pxor	@x[0], @t[1]
+	pxor	@x[7], @t[0]
+	pxor	@x[7], @t[1]
+	 pshufd	\$0x4E, @x[0], @x[0] 	# (x0 ^ (x0 <<< 32)) <<< 64)
+	pxor	@x[1], @t[2]
+	 pshufd	\$0x4E, @x[1], @x[1]
+	pxor	@x[4], @t[5]
+	 pxor	@t[0], @x[0]
+	pxor	@x[5], @t[6]
+	 pxor	@t[1], @x[1]
+	pxor	@x[3], @t[4]
+	 pshufd	\$0x4E, @x[4], @t[0]
+	pxor	@x[6], @t[7]
+	 pshufd	\$0x4E, @x[5], @t[1]
+	pxor	@x[2], @t[3]
+	 pshufd	\$0x4E, @x[3], @x[4]
+	pxor	@x[7], @t[3]
+	 pshufd	\$0x4E, @x[7], @x[5]
+	pxor	@x[7], @t[4]
+	 pshufd	\$0x4E, @x[6], @x[3]
+	pxor	@t[4], @t[0]
+	 pshufd	\$0x4E, @x[2], @x[6]
+	pxor	@t[5], @t[1]
+
+	pxor	@t[3], @x[4]
+	pxor	@t[7], @x[5]
+	pxor	@t[6], @x[3]
+	 movdqa	@t[0], @x[2]
+	pxor	@t[2], @x[6]
+	 movdqa	@t[1], @x[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+	# multiplication by 0x0e
+	pshufd	\$0x93, @x[7], @t[7]
+	movdqa	@x[2], @t[2]
+	pxor	@x[5], @x[7]		# 7 5
+	pxor	@x[5], @x[2]		# 2 5
+	pshufd	\$0x93, @x[0], @t[0]
+	movdqa	@x[5], @t[5]
+	pxor	@x[0], @x[5]		# 5 0		[1]
+	pxor	@x[1], @x[0]		# 0 1
+	pshufd	\$0x93, @x[1], @t[1]
+	pxor	@x[2], @x[1]		# 1 25
+	pxor	@x[6], @x[0]		# 01 6		[2]
+	pxor	@x[3], @x[1]		# 125 3		[4]
+	pshufd	\$0x93, @x[3], @t[3]
+	pxor	@x[0], @x[2]		# 25 016	[3]
+	pxor	@x[7], @x[3]		# 3 75
+	pxor	@x[6], @x[7]		# 75 6		[0]
+	pshufd	\$0x93, @x[6], @t[6]
+	movdqa	@x[4], @t[4]
+	pxor	@x[4], @x[6]		# 6 4
+	pxor	@x[3], @x[4]		# 4 375		[6]
+	pxor	@x[7], @x[3]		# 375 756=36
+	pxor	@t[5], @x[6]		# 64 5		[7]
+	pxor	@t[2], @x[3]		# 36 2
+	pxor	@t[4], @x[3]		# 362 4		[5]
+	pshufd	\$0x93, @t[5], @t[5]
+___
+					my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+	# multiplication by 0x0b
+	pxor	@y[0], @y[1]
+	pxor	@t[0], @y[0]
+	pxor	@t[1], @y[1]
+	pshufd	\$0x93, @t[2], @t[2]
+	pxor	@t[5], @y[0]
+	pxor	@t[6], @y[1]
+	pxor	@t[7], @y[0]
+	pshufd	\$0x93, @t[4], @t[4]
+	pxor	@t[6], @t[7]		# clobber t[7]
+	pxor	@y[0], @y[1]
+
+	pxor	@t[0], @y[3]
+	pshufd	\$0x93, @t[0], @t[0]
+	pxor	@t[1], @y[2]
+	pxor	@t[1], @y[4]
+	pxor	@t[2], @y[2]
+	pshufd	\$0x93, @t[1], @t[1]
+	pxor	@t[2], @y[3]
+	pxor	@t[2], @y[5]
+	pxor	@t[7], @y[2]
+	pshufd	\$0x93, @t[2], @t[2]
+	pxor	@t[3], @y[3]
+	pxor	@t[3], @y[6]
+	pxor	@t[3], @y[4]
+	pshufd	\$0x93, @t[3], @t[3]
+	pxor	@t[4], @y[7]
+	pxor	@t[4], @y[5]
+	pxor	@t[7], @y[7]
+	pxor	@t[5], @y[3]
+	pxor	@t[4], @y[4]
+	pxor	@t[5], @t[7]		# clobber t[7] even more
+
+	pxor	@t[7], @y[5]
+	pshufd	\$0x93, @t[4], @t[4]
+	pxor	@t[7], @y[6]
+	pxor	@t[7], @y[4]
+
+	pxor	@t[5], @t[7]
+	pshufd	\$0x93, @t[5], @t[5]
+	pxor	@t[6], @t[7]		# restore t[7]
+
+	# multiplication by 0x0d
+	pxor	@y[7], @y[4]
+	pxor	@t[4], @y[7]
+	pshufd	\$0x93, @t[6], @t[6]
+	pxor	@t[0], @y[2]
+	pxor	@t[5], @y[7]
+	pxor	@t[2], @y[2]
+	pshufd	\$0x93, @t[7], @t[7]
+
+	pxor	@y[1], @y[3]
+	pxor	@t[1], @y[1]
+	pxor	@t[0], @y[0]
+	pxor	@t[0], @y[3]
+	pxor	@t[5], @y[1]
+	pxor	@t[5], @y[0]
+	pxor	@t[7], @y[1]
+	pshufd	\$0x93, @t[0], @t[0]
+	pxor	@t[6], @y[0]
+	pxor	@y[1], @y[3]
+	pxor	@t[1], @y[4]
+	pshufd	\$0x93, @t[1], @t[1]
+
+	pxor	@t[7], @y[7]
+	pxor	@t[2], @y[4]
+	pxor	@t[2], @y[5]
+	pshufd	\$0x93, @t[2], @t[2]
+	pxor	@t[6], @y[2]
+	pxor	@t[3], @t[6]		# clobber t[6]
+	pxor	@y[7], @y[4]
+	pxor	@t[6], @y[3]
+
+	pxor	@t[6], @y[6]
+	pxor	@t[5], @y[5]
+	pxor	@t[4], @y[6]
+	pshufd	\$0x93, @t[4], @t[4]
+	pxor	@t[6], @y[5]
+	pxor	@t[7], @y[6]
+	pxor	@t[3], @t[6]		# restore t[6]
+
+	pshufd	\$0x93, @t[5], @t[5]
+	pshufd	\$0x93, @t[6], @t[6]
+	pshufd	\$0x93, @t[7], @t[7]
+	pshufd	\$0x93, @t[3], @t[3]
+
+	# multiplication by 0x09
+	pxor	@y[1], @y[4]
+	pxor	@y[1], @t[1]		# t[1]=y[1]
+	pxor	@t[5], @t[0]		# clobber t[0]
+	pxor	@t[5], @t[1]
+	pxor	@t[0], @y[3]
+	pxor	@y[0], @t[0]		# t[0]=y[0]
+	pxor	@t[6], @t[1]
+	pxor	@t[7], @t[6]		# clobber t[6]
+	pxor	@t[1], @y[4]
+	pxor	@t[4], @y[7]
+	pxor	@y[4], @t[4]		# t[4]=y[4]
+	pxor	@t[3], @y[6]
+	pxor	@y[3], @t[3]		# t[3]=y[3]
+	pxor	@t[2], @y[5]
+	pxor	@y[2], @t[2]		# t[2]=y[2]
+	pxor	@t[7], @t[3]
+	pxor	@y[5], @t[5]		# t[5]=y[5]
+	pxor	@t[6], @t[2]
+	pxor	@t[6], @t[5]
+	pxor	@y[6], @t[6]		# t[6]=y[6]
+	pxor	@y[7], @t[7]		# t[7]=y[7]
+
+	movdqa	@t[0],@XMM[0]
+	movdqa	@t[1],@XMM[1]
+	movdqa	@t[2],@XMM[2]
+	movdqa	@t[3],@XMM[3]
+	movdqa	@t[4],@XMM[4]
+	movdqa	@t[5],@XMM[5]
+	movdqa	@t[6],@XMM[6]
+	movdqa	@t[7],@XMM[7]
+___
+}
+
+sub aesenc {				# not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+	movdqa	0x30($const),@t[0]	# .LSR
+___
+	&ShiftRows	(@b,@t[0]);
+	&Sbox		(@b,@t);
+	&MixColumns	(@b[0,1,4,6,3,7,2,5],@t);
+}
+
+sub aesenclast {			# not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+	movdqa	0x40($const),@t[0]	# .LSRM0
+___
+	&ShiftRows	(@b,@t[0]);
+	&Sbox		(@b,@t);
+$code.=<<___
+	pxor	0x00($key),@b[0]
+	pxor	0x10($key),@b[1]
+	pxor	0x20($key),@b[4]
+	pxor	0x30($key),@b[6]
+	pxor	0x40($key),@b[3]
+	pxor	0x50($key),@b[7]
+	pxor	0x60($key),@b[2]
+	pxor	0x70($key),@b[5]
+___
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+	movdqa	$b,$t
+	psrlq	\$$n,$b
+	pxor  	$a,$b
+	pand	$mask,$b
+	pxor	$b,$a
+	psllq	\$$n,$b
+	pxor	$t,$b
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+	movdqa	$b0,$t0
+	psrlq	\$$n,$b0
+	 movdqa	$b1,$t1
+	 psrlq	\$$n,$b1
+	pxor  	$a0,$b0
+	 pxor  	$a1,$b1
+	pand	$mask,$b0
+	 pand	$mask,$b1
+	pxor	$b0,$a0
+	psllq	\$$n,$b0
+	 pxor	$b1,$a1
+	 psllq	\$$n,$b1
+	pxor	$t0,$b0
+	 pxor	$t1,$b1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+	movdqa	0x00($const),$t0	# .LBS0
+	movdqa	0x10($const),$t1	# .LBS1
+___
+	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+	movdqa	0x20($const),$t0	# .LBS2
+___
+	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+.text
+
+.extern	asm_AES_encrypt
+.extern	asm_AES_decrypt
+
+.type	_bsaes_encrypt8,\@abi-omnipotent
+.align	64
+_bsaes_encrypt8:
+	lea	.LBS0(%rip), $const	# constants table
+
+	movdqa	($key), @XMM[9]		# round 0 key
+	lea	0x10($key), $key
+	movdqa	0x50($const), @XMM[8]	# .LM0SR
+	pxor	@XMM[9], @XMM[0]	# xor with round0 key
+	pxor	@XMM[9], @XMM[1]
+	 pshufb	@XMM[8], @XMM[0]
+	pxor	@XMM[9], @XMM[2]
+	 pshufb	@XMM[8], @XMM[1]
+	pxor	@XMM[9], @XMM[3]
+	 pshufb	@XMM[8], @XMM[2]
+	pxor	@XMM[9], @XMM[4]
+	 pshufb	@XMM[8], @XMM[3]
+	pxor	@XMM[9], @XMM[5]
+	 pshufb	@XMM[8], @XMM[4]
+	pxor	@XMM[9], @XMM[6]
+	 pshufb	@XMM[8], @XMM[5]
+	pxor	@XMM[9], @XMM[7]
+	 pshufb	@XMM[8], @XMM[6]
+	 pshufb	@XMM[8], @XMM[7]
+_bsaes_encrypt8_bitslice:
+___
+	&bitslice	(@XMM[0..7, 8..11]);
+$code.=<<___;
+	dec	$rounds
+	jmp	.Lenc_sbox
+.align	16
+.Lenc_loop:
+___
+	&ShiftRows	(@XMM[0..7, 8]);
+$code.=".Lenc_sbox:\n";
+	&Sbox		(@XMM[0..7, 8..15]);
+$code.=<<___;
+	dec	$rounds
+	jl	.Lenc_done
+___
+	&MixColumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+	movdqa	0x30($const), @XMM[8]	# .LSR
+	jnz	.Lenc_loop
+	movdqa	0x40($const), @XMM[8]	# .LSRM0
+	jmp	.Lenc_loop
+.align	16
+.Lenc_done:
+___
+	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+	movdqa	($key), @XMM[8]		# last round key
+	pxor	@XMM[8], @XMM[4]
+	pxor	@XMM[8], @XMM[6]
+	pxor	@XMM[8], @XMM[3]
+	pxor	@XMM[8], @XMM[7]
+	pxor	@XMM[8], @XMM[2]
+	pxor	@XMM[8], @XMM[5]
+	pxor	@XMM[8], @XMM[0]
+	pxor	@XMM[8], @XMM[1]
+	ret
+.size	_bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type	_bsaes_decrypt8,\@abi-omnipotent
+.align	64
+_bsaes_decrypt8:
+	lea	.LBS0(%rip), $const	# constants table
+
+	movdqa	($key), @XMM[9]		# round 0 key
+	lea	0x10($key), $key
+	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
+	pxor	@XMM[9], @XMM[0]	# xor with round0 key
+	pxor	@XMM[9], @XMM[1]
+	 pshufb	@XMM[8], @XMM[0]
+	pxor	@XMM[9], @XMM[2]
+	 pshufb	@XMM[8], @XMM[1]
+	pxor	@XMM[9], @XMM[3]
+	 pshufb	@XMM[8], @XMM[2]
+	pxor	@XMM[9], @XMM[4]
+	 pshufb	@XMM[8], @XMM[3]
+	pxor	@XMM[9], @XMM[5]
+	 pshufb	@XMM[8], @XMM[4]
+	pxor	@XMM[9], @XMM[6]
+	 pshufb	@XMM[8], @XMM[5]
+	pxor	@XMM[9], @XMM[7]
+	 pshufb	@XMM[8], @XMM[6]
+	 pshufb	@XMM[8], @XMM[7]
+___
+	&bitslice	(@XMM[0..7, 8..11]);
+$code.=<<___;
+	dec	$rounds
+	jmp	.Ldec_sbox
+.align	16
+.Ldec_loop:
+___
+	&ShiftRows	(@XMM[0..7, 8]);
+$code.=".Ldec_sbox:\n";
+	&InvSbox	(@XMM[0..7, 8..15]);
+$code.=<<___;
+	dec	$rounds
+	jl	.Ldec_done
+___
+	&InvMixColumns	(@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+	movdqa	-0x10($const), @XMM[8]	# .LISR
+	jnz	.Ldec_loop
+	movdqa	-0x20($const), @XMM[8]	# .LISRM0
+	jmp	.Ldec_loop
+.align	16
+.Ldec_done:
+___
+	&bitslice	(@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+	movdqa	($key), @XMM[8]		# last round key
+	pxor	@XMM[8], @XMM[6]
+	pxor	@XMM[8], @XMM[4]
+	pxor	@XMM[8], @XMM[2]
+	pxor	@XMM[8], @XMM[7]
+	pxor	@XMM[8], @XMM[3]
+	pxor	@XMM[8], @XMM[5]
+	pxor	@XMM[8], @XMM[0]
+	pxor	@XMM[8], @XMM[1]
+	ret
+.size	_bsaes_decrypt8,.-_bsaes_decrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
+	movdqa	@x[0], @x[2]
+	movdqa	@x[1], @x[3]
+___
+	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+	movdqa	@x[0], @x[4]
+	movdqa	@x[2], @x[6]
+	movdqa	@x[1], @x[5]
+	movdqa	@x[3], @x[7]
+___
+	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
+	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type	_bsaes_key_convert,\@abi-omnipotent
+.align	16
+_bsaes_key_convert:
+	lea	.Lmasks(%rip), $const
+	movdqu	($inp), %xmm7		# load round 0 key
+	lea	0x10($inp), $inp
+	movdqa	0x00($const), %xmm0	# 0x01...
+	movdqa	0x10($const), %xmm1	# 0x02...
+	movdqa	0x20($const), %xmm2	# 0x04...
+	movdqa	0x30($const), %xmm3	# 0x08...
+	movdqa	0x40($const), %xmm4	# .LM0
+	pcmpeqd	%xmm5, %xmm5		# .LNOT
+
+	movdqu	($inp), %xmm6		# load round 1 key
+	movdqa	%xmm7, ($out)		# save round 0 key
+	lea	0x10($out), $out
+	dec	$rounds
+	jmp	.Lkey_loop
+.align	16
+.Lkey_loop:
+	pshufb	%xmm4, %xmm6		# .LM0
+
+	movdqa	%xmm0,	%xmm8
+	movdqa	%xmm1,	%xmm9
+
+	pand	%xmm6,	%xmm8
+	pand	%xmm6,	%xmm9
+	movdqa	%xmm2,	%xmm10
+	pcmpeqb	%xmm0,	%xmm8
+	psllq	\$4,	%xmm0		# 0x10...
+	movdqa	%xmm3,	%xmm11
+	pcmpeqb	%xmm1,	%xmm9
+	psllq	\$4,	%xmm1		# 0x20...
+
+	pand	%xmm6,	%xmm10
+	pand	%xmm6,	%xmm11
+	movdqa	%xmm0,	%xmm12
+	pcmpeqb	%xmm2,	%xmm10
+	psllq	\$4,	%xmm2		# 0x40...
+	movdqa	%xmm1,	%xmm13
+	pcmpeqb	%xmm3,	%xmm11
+	psllq	\$4,	%xmm3		# 0x80...
+
+	movdqa	%xmm2,	%xmm14
+	movdqa	%xmm3,	%xmm15
+	 pxor	%xmm5,	%xmm8		# "pnot"
+	 pxor	%xmm5,	%xmm9
+
+	pand	%xmm6,	%xmm12
+	pand	%xmm6,	%xmm13
+	 movdqa	%xmm8, 0x00($out)	# write bit-sliced round key
+	pcmpeqb	%xmm0,	%xmm12
+	psrlq	\$4,	%xmm0		# 0x01...
+	 movdqa	%xmm9, 0x10($out)
+	pcmpeqb	%xmm1,	%xmm13
+	psrlq	\$4,	%xmm1		# 0x02...
+	 lea	0x10($inp), $inp
+
+	pand	%xmm6,	%xmm14
+	pand	%xmm6,	%xmm15
+	 movdqa	%xmm10, 0x20($out)
+	pcmpeqb	%xmm2,	%xmm14
+	psrlq	\$4,	%xmm2		# 0x04...
+	 movdqa	%xmm11, 0x30($out)
+	pcmpeqb	%xmm3,	%xmm15
+	psrlq	\$4,	%xmm3		# 0x08...
+	 movdqu	($inp), %xmm6		# load next round key
+
+	pxor	%xmm5, %xmm13		# "pnot"
+	pxor	%xmm5, %xmm14
+	movdqa	%xmm12, 0x40($out)
+	movdqa	%xmm13, 0x50($out)
+	movdqa	%xmm14, 0x60($out)
+	movdqa	%xmm15, 0x70($out)
+	lea	0x80($out),$out
+	dec	$rounds
+	jnz	.Lkey_loop
+
+	movdqa	0x50($const), %xmm7	# .L63
+	#movdqa	%xmm6, ($out)		# don't save last round key
+	ret
+.size	_bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0 && !$win64) {	# following four functions are unsupported interface
+			# used for benchmarking...
+$code.=<<___;
+.globl	bsaes_enc_key_convert
+.type	bsaes_enc_key_convert,\@function,2
+.align	16
+bsaes_enc_key_convert:
+	mov	240($inp),%r10d		# pass rounds
+	mov	$inp,%rcx		# pass key
+	mov	$out,%rax		# pass key schedule
+	call	_bsaes_key_convert
+	pxor	%xmm6,%xmm7		# fix up last round key
+	movdqa	%xmm7,(%rax)		# save last round key
+	ret
+.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl	bsaes_encrypt_128
+.type	bsaes_encrypt_128,\@function,4
+.align	16
+bsaes_encrypt_128:
+.Lenc128_loop:
+	movdqu	0x00($inp), @XMM[0]	# load input
+	movdqu	0x10($inp), @XMM[1]
+	movdqu	0x20($inp), @XMM[2]
+	movdqu	0x30($inp), @XMM[3]
+	movdqu	0x40($inp), @XMM[4]
+	movdqu	0x50($inp), @XMM[5]
+	movdqu	0x60($inp), @XMM[6]
+	movdqu	0x70($inp), @XMM[7]
+	mov	$key, %rax		# pass the $key
+	lea	0x80($inp), $inp
+	mov	\$10,%r10d
+
+	call	_bsaes_encrypt8
+
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[2], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	sub	\$0x80,$len
+	ja	.Lenc128_loop
+	ret
+.size	bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl	bsaes_dec_key_convert
+.type	bsaes_dec_key_convert,\@function,2
+.align	16
+bsaes_dec_key_convert:
+	mov	240($inp),%r10d		# pass rounds
+	mov	$inp,%rcx		# pass key
+	mov	$out,%rax		# pass key schedule
+	call	_bsaes_key_convert
+	pxor	($out),%xmm7		# fix up round 0 key
+	movdqa	%xmm6,(%rax)		# save last round key
+	movdqa	%xmm7,($out)
+	ret
+.size	bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl	bsaes_decrypt_128
+.type	bsaes_decrypt_128,\@function,4
+.align	16
+bsaes_decrypt_128:
+.Ldec128_loop:
+	movdqu	0x00($inp), @XMM[0]	# load input
+	movdqu	0x10($inp), @XMM[1]
+	movdqu	0x20($inp), @XMM[2]
+	movdqu	0x30($inp), @XMM[3]
+	movdqu	0x40($inp), @XMM[4]
+	movdqu	0x50($inp), @XMM[5]
+	movdqu	0x60($inp), @XMM[6]
+	movdqu	0x70($inp), @XMM[7]
+	mov	$key, %rax		# pass the $key
+	lea	0x80($inp), $inp
+	mov	\$10,%r10d
+
+	call	_bsaes_decrypt8
+
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	sub	\$0x80,$len
+	ja	.Ldec128_loop
+	ret
+.size	bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+######################################################################
+#
+# OpenSSL interface
+#
+my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64	? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
+						: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
+
+if ($ecb) {
+$code.=<<___;
+.globl	bsaes_ecb_encrypt_blocks
+.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
+.align	16
+bsaes_ecb_encrypt_blocks:
+	mov	%rsp, %rax
+.Lecb_enc_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+	mov	%rsp,%rbp		# backup %rsp
+	mov	240($arg4),%eax		# rounds
+	mov	$arg1,$inp		# backup arguments
+	mov	$arg2,$out
+	mov	$arg3,$len
+	mov	$arg4,$key
+	cmp	\$8,$arg3
+	jb	.Lecb_enc_short
+
+	mov	%eax,%ebx		# backup rounds
+	shl	\$7,%rax		# 128 bytes per inner round key
+	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
+	sub	%rax,%rsp
+	mov	%rsp,%rax		# pass key schedule
+	mov	$key,%rcx		# pass key
+	mov	%ebx,%r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	%xmm6,%xmm7		# fix up last round key
+	movdqa	%xmm7,(%rax)		# save last round key
+
+	sub	\$8,$len
+.Lecb_enc_loop:
+	movdqu	0x00($inp), @XMM[0]	# load input
+	movdqu	0x10($inp), @XMM[1]
+	movdqu	0x20($inp), @XMM[2]
+	movdqu	0x30($inp), @XMM[3]
+	movdqu	0x40($inp), @XMM[4]
+	movdqu	0x50($inp), @XMM[5]
+	mov	%rsp, %rax		# pass key schedule
+	movdqu	0x60($inp), @XMM[6]
+	mov	%ebx,%r10d		# pass rounds
+	movdqu	0x70($inp), @XMM[7]
+	lea	0x80($inp), $inp
+
+	call	_bsaes_encrypt8
+
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[2], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	sub	\$8,$len
+	jnc	.Lecb_enc_loop
+
+	add	\$8,$len
+	jz	.Lecb_enc_done
+
+	movdqu	0x00($inp), @XMM[0]	# load input
+	mov	%rsp, %rax		# pass key schedule
+	mov	%ebx,%r10d		# pass rounds
+	cmp	\$2,$len
+	jb	.Lecb_enc_one
+	movdqu	0x10($inp), @XMM[1]
+	je	.Lecb_enc_two
+	movdqu	0x20($inp), @XMM[2]
+	cmp	\$4,$len
+	jb	.Lecb_enc_three
+	movdqu	0x30($inp), @XMM[3]
+	je	.Lecb_enc_four
+	movdqu	0x40($inp), @XMM[4]
+	cmp	\$6,$len
+	jb	.Lecb_enc_five
+	movdqu	0x50($inp), @XMM[5]
+	je	.Lecb_enc_six
+	movdqu	0x60($inp), @XMM[6]
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[2], 0x60($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_six:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_five:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_four:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_three:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_two:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_one:
+	call	_bsaes_encrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	jmp	.Lecb_enc_done
+.align	16
+.Lecb_enc_short:
+	lea	($inp), $arg1
+	lea	($out), $arg2
+	lea	($key), $arg3
+	call	asm_AES_encrypt
+	lea	16($inp), $inp
+	lea	16($out), $out
+	dec	$len
+	jnz	.Lecb_enc_short
+
+.Lecb_enc_done:
+	lea	(%rsp),%rax
+	pxor	%xmm0, %xmm0
+.Lecb_enc_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	jb	.Lecb_enc_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lecb_enc_epilogue:
+	ret
+.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
+
+.globl	bsaes_ecb_decrypt_blocks
+.type	bsaes_ecb_decrypt_blocks,\@abi-omnipotent
+.align	16
+bsaes_ecb_decrypt_blocks:
+	mov	%rsp, %rax
+.Lecb_dec_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lecb_dec_body:
+___
+$code.=<<___;
+	mov	%rsp,%rbp		# backup %rsp
+	mov	240($arg4),%eax		# rounds
+	mov	$arg1,$inp		# backup arguments
+	mov	$arg2,$out
+	mov	$arg3,$len
+	mov	$arg4,$key
+	cmp	\$8,$arg3
+	jb	.Lecb_dec_short
+
+	mov	%eax,%ebx		# backup rounds
+	shl	\$7,%rax		# 128 bytes per inner round key
+	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
+	sub	%rax,%rsp
+	mov	%rsp,%rax		# pass key schedule
+	mov	$key,%rcx		# pass key
+	mov	%ebx,%r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	(%rsp),%xmm7		# fix up 0 round key
+	movdqa	%xmm6,(%rax)		# save last round key
+	movdqa	%xmm7,(%rsp)
+
+	sub	\$8,$len
+.Lecb_dec_loop:
+	movdqu	0x00($inp), @XMM[0]	# load input
+	movdqu	0x10($inp), @XMM[1]
+	movdqu	0x20($inp), @XMM[2]
+	movdqu	0x30($inp), @XMM[3]
+	movdqu	0x40($inp), @XMM[4]
+	movdqu	0x50($inp), @XMM[5]
+	mov	%rsp, %rax		# pass key schedule
+	movdqu	0x60($inp), @XMM[6]
+	mov	%ebx,%r10d		# pass rounds
+	movdqu	0x70($inp), @XMM[7]
+	lea	0x80($inp), $inp
+
+	call	_bsaes_decrypt8
+
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	sub	\$8,$len
+	jnc	.Lecb_dec_loop
+
+	add	\$8,$len
+	jz	.Lecb_dec_done
+
+	movdqu	0x00($inp), @XMM[0]	# load input
+	mov	%rsp, %rax		# pass key schedule
+	mov	%ebx,%r10d		# pass rounds
+	cmp	\$2,$len
+	jb	.Lecb_dec_one
+	movdqu	0x10($inp), @XMM[1]
+	je	.Lecb_dec_two
+	movdqu	0x20($inp), @XMM[2]
+	cmp	\$4,$len
+	jb	.Lecb_dec_three
+	movdqu	0x30($inp), @XMM[3]
+	je	.Lecb_dec_four
+	movdqu	0x40($inp), @XMM[4]
+	cmp	\$6,$len
+	jb	.Lecb_dec_five
+	movdqu	0x50($inp), @XMM[5]
+	je	.Lecb_dec_six
+	movdqu	0x60($inp), @XMM[6]
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_six:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_five:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_four:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_three:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_two:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_one:
+	call	_bsaes_decrypt8
+	movdqu	@XMM[0], 0x00($out)	# write output
+	jmp	.Lecb_dec_done
+.align	16
+.Lecb_dec_short:
+	lea	($inp), $arg1
+	lea	($out), $arg2
+	lea	($key), $arg3
+	call	asm_AES_decrypt
+	lea	16($inp), $inp
+	lea	16($out), $out
+	dec	$len
+	jnz	.Lecb_dec_short
+
+.Lecb_dec_done:
+	lea	(%rsp),%rax
+	pxor	%xmm0, %xmm0
+.Lecb_dec_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	jb	.Lecb_dec_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lecb_dec_epilogue:
+	ret
+.size	bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
+___
+}
+$code.=<<___;
+.extern	asm_AES_cbc_encrypt
+.globl	bsaes_cbc_encrypt
+.type	bsaes_cbc_encrypt,\@abi-omnipotent
+.align	16
+bsaes_cbc_encrypt:
+___
+$code.=<<___ if ($win64);
+	mov	48(%rsp),$arg6		# pull direction flag
+___
+$code.=<<___;
+	cmp	\$0,$arg6
+	jne	asm_AES_cbc_encrypt
+	cmp	\$128,$arg3
+	jb	asm_AES_cbc_encrypt
+
+	mov	%rsp, %rax
+.Lcbc_dec_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+	mov	0xa0(%rsp),$arg5	# pull ivp
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lcbc_dec_body:
+___
+$code.=<<___;
+	mov	%rsp, %rbp		# backup %rsp
+	mov	240($arg4), %eax	# rounds
+	mov	$arg1, $inp		# backup arguments
+	mov	$arg2, $out
+	mov	$arg3, $len
+	mov	$arg4, $key
+	mov	$arg5, %rbx
+	shr	\$4, $len		# bytes to blocks
+
+	mov	%eax, %edx		# rounds
+	shl	\$7, %rax		# 128 bytes per inner round key
+	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
+	sub	%rax, %rsp
+
+	mov	%rsp, %rax		# pass key schedule
+	mov	$key, %rcx		# pass key
+	mov	%edx, %r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	(%rsp),%xmm7		# fix up 0 round key
+	movdqa	%xmm6,(%rax)		# save last round key
+	movdqa	%xmm7,(%rsp)
+
+	movdqu	(%rbx), @XMM[15]	# load IV
+	sub	\$8,$len
+.Lcbc_dec_loop:
+	movdqu	0x00($inp), @XMM[0]	# load input
+	movdqu	0x10($inp), @XMM[1]
+	movdqu	0x20($inp), @XMM[2]
+	movdqu	0x30($inp), @XMM[3]
+	movdqu	0x40($inp), @XMM[4]
+	movdqu	0x50($inp), @XMM[5]
+	mov	%rsp, %rax		# pass key schedule
+	movdqu	0x60($inp), @XMM[6]
+	mov	%edx,%r10d		# pass rounds
+	movdqu	0x70($inp), @XMM[7]
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+
+	call	_bsaes_decrypt8
+
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[9], @XMM[6]
+	movdqu	0x30($inp), @XMM[11]
+	pxor	@XMM[10], @XMM[4]
+	movdqu	0x40($inp), @XMM[12]
+	pxor	@XMM[11], @XMM[2]
+	movdqu	0x50($inp), @XMM[13]
+	pxor	@XMM[12], @XMM[7]
+	movdqu	0x60($inp), @XMM[14]
+	pxor	@XMM[13], @XMM[3]
+	movdqu	0x70($inp), @XMM[15]	# IV
+	pxor	@XMM[14], @XMM[5]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	lea	0x80($inp), $inp
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	sub	\$8,$len
+	jnc	.Lcbc_dec_loop
+
+	add	\$8,$len
+	jz	.Lcbc_dec_done
+
+	movdqu	0x00($inp), @XMM[0]	# load input
+	mov	%rsp, %rax		# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+	cmp	\$2,$len
+	jb	.Lcbc_dec_one
+	movdqu	0x10($inp), @XMM[1]
+	je	.Lcbc_dec_two
+	movdqu	0x20($inp), @XMM[2]
+	cmp	\$4,$len
+	jb	.Lcbc_dec_three
+	movdqu	0x30($inp), @XMM[3]
+	je	.Lcbc_dec_four
+	movdqu	0x40($inp), @XMM[4]
+	cmp	\$6,$len
+	jb	.Lcbc_dec_five
+	movdqu	0x50($inp), @XMM[5]
+	je	.Lcbc_dec_six
+	movdqu	0x60($inp), @XMM[6]
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[9], @XMM[6]
+	movdqu	0x30($inp), @XMM[11]
+	pxor	@XMM[10], @XMM[4]
+	movdqu	0x40($inp), @XMM[12]
+	pxor	@XMM[11], @XMM[2]
+	movdqu	0x50($inp), @XMM[13]
+	pxor	@XMM[12], @XMM[7]
+	movdqu	0x60($inp), @XMM[15]	# IV
+	pxor	@XMM[13], @XMM[3]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_six:
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[9], @XMM[6]
+	movdqu	0x30($inp), @XMM[11]
+	pxor	@XMM[10], @XMM[4]
+	movdqu	0x40($inp), @XMM[12]
+	pxor	@XMM[11], @XMM[2]
+	movdqu	0x50($inp), @XMM[15]	# IV
+	pxor	@XMM[12], @XMM[7]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_five:
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[9], @XMM[6]
+	movdqu	0x30($inp), @XMM[11]
+	pxor	@XMM[10], @XMM[4]
+	movdqu	0x40($inp), @XMM[15]	# IV
+	pxor	@XMM[11], @XMM[2]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_four:
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[9], @XMM[6]
+	movdqu	0x30($inp), @XMM[15]	# IV
+	pxor	@XMM[10], @XMM[4]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_three:
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[8], @XMM[1]
+	movdqu	0x20($inp), @XMM[15]	# IV
+	pxor	@XMM[9], @XMM[6]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_two:
+	movdqa	@XMM[15], 0x20(%rbp)	# put aside IV
+	call	_bsaes_decrypt8
+	pxor	0x20(%rbp), @XMM[0]	# ^= IV
+	movdqu	0x00($inp), @XMM[8]	# re-load input
+	movdqu	0x10($inp), @XMM[15]	# IV
+	pxor	@XMM[8], @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	jmp	.Lcbc_dec_done
+.align	16
+.Lcbc_dec_one:
+	lea	($inp), $arg1
+	lea	0x20(%rbp), $arg2	# buffer output
+	lea	($key), $arg3
+	call	asm_AES_decrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[15]	# ^= IV
+	movdqu	@XMM[15], ($out)	# write output
+	movdqa	@XMM[0], @XMM[15]	# IV
+
+.Lcbc_dec_done:
+	movdqu	@XMM[15], (%rbx)	# return IV
+	lea	(%rsp), %rax
+	pxor	%xmm0, %xmm0
+.Lcbc_dec_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	ja	.Lcbc_dec_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lcbc_dec_epilogue:
+	ret
+.size	bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+.globl	bsaes_ctr32_encrypt_blocks
+.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
+.align	16
+bsaes_ctr32_encrypt_blocks:
+	mov	%rsp, %rax
+.Lctr_enc_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+	mov	0xa0(%rsp),$arg5	# pull ivp
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lctr_enc_body:
+___
+$code.=<<___;
+	mov	%rsp, %rbp		# backup %rsp
+	movdqu	($arg5), %xmm0		# load counter
+	mov	240($arg4), %eax	# rounds
+	mov	$arg1, $inp		# backup arguments
+	mov	$arg2, $out
+	mov	$arg3, $len
+	mov	$arg4, $key
+	movdqa	%xmm0, 0x20(%rbp)	# copy counter
+	cmp	\$8, $arg3
+	jb	.Lctr_enc_short
+
+	mov	%eax, %ebx		# rounds
+	shl	\$7, %rax		# 128 bytes per inner round key
+	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
+	sub	%rax, %rsp
+
+	mov	%rsp, %rax		# pass key schedule
+	mov	$key, %rcx		# pass key
+	mov	%ebx, %r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	%xmm6,%xmm7		# fix up last round key
+	movdqa	%xmm7,(%rax)		# save last round key
+
+	movdqa	(%rsp), @XMM[9]		# load round0 key
+	lea	.LADD1(%rip), %r11
+	movdqa	0x20(%rbp), @XMM[0]	# counter copy
+	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
+	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
+	pshufb	@XMM[8], @XMM[0]
+	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
+	jmp	.Lctr_enc_loop
+.align	16
+.Lctr_enc_loop:
+	movdqa	@XMM[0], 0x20(%rbp)	# save counter
+	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
+	movdqa	@XMM[0], @XMM[2]
+	paddd	0x00(%r11), @XMM[1]	# .LADD1
+	movdqa	@XMM[0], @XMM[3]
+	paddd	0x10(%r11), @XMM[2]	# .LADD2
+	movdqa	@XMM[0], @XMM[4]
+	paddd	0x20(%r11), @XMM[3]	# .LADD3
+	movdqa	@XMM[0], @XMM[5]
+	paddd	0x30(%r11), @XMM[4]	# .LADD4
+	movdqa	@XMM[0], @XMM[6]
+	paddd	0x40(%r11), @XMM[5]	# .LADD5
+	movdqa	@XMM[0], @XMM[7]
+	paddd	0x50(%r11), @XMM[6]	# .LADD6
+	paddd	0x60(%r11), @XMM[7]	# .LADD7
+
+	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
+	# to flip byte order in 32-bit counter
+	movdqa	(%rsp), @XMM[9]		# round 0 key
+	lea	0x10(%rsp), %rax	# pass key schedule
+	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
+	pxor	@XMM[9], @XMM[0]	# xor with round0 key
+	pxor	@XMM[9], @XMM[1]
+	 pshufb	@XMM[8], @XMM[0]
+	pxor	@XMM[9], @XMM[2]
+	 pshufb	@XMM[8], @XMM[1]
+	pxor	@XMM[9], @XMM[3]
+	 pshufb	@XMM[8], @XMM[2]
+	pxor	@XMM[9], @XMM[4]
+	 pshufb	@XMM[8], @XMM[3]
+	pxor	@XMM[9], @XMM[5]
+	 pshufb	@XMM[8], @XMM[4]
+	pxor	@XMM[9], @XMM[6]
+	 pshufb	@XMM[8], @XMM[5]
+	pxor	@XMM[9], @XMM[7]
+	 pshufb	@XMM[8], @XMM[6]
+	lea	.LBS0(%rip), %r11	# constants table
+	 pshufb	@XMM[8], @XMM[7]
+	mov	%ebx,%r10d		# pass rounds
+
+	call	_bsaes_encrypt8_bitslice
+
+	sub	\$8,$len
+	jc	.Lctr_enc_loop_done
+
+	movdqu	0x00($inp), @XMM[8]	# load input
+	movdqu	0x10($inp), @XMM[9]
+	movdqu	0x20($inp), @XMM[10]
+	movdqu	0x30($inp), @XMM[11]
+	movdqu	0x40($inp), @XMM[12]
+	movdqu	0x50($inp), @XMM[13]
+	movdqu	0x60($inp), @XMM[14]
+	movdqu	0x70($inp), @XMM[15]
+	lea	0x80($inp),$inp
+	pxor	@XMM[0], @XMM[8]
+	movdqa	0x20(%rbp), @XMM[0]	# load counter
+	pxor	@XMM[9], @XMM[1]
+	movdqu	@XMM[8], 0x00($out)	# write output
+	pxor	@XMM[10], @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	@XMM[11], @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	pxor	@XMM[12], @XMM[3]
+	movdqu	@XMM[6], 0x30($out)
+	pxor	@XMM[13], @XMM[7]
+	movdqu	@XMM[3], 0x40($out)
+	pxor	@XMM[14], @XMM[2]
+	movdqu	@XMM[7], 0x50($out)
+	pxor	@XMM[15], @XMM[5]
+	movdqu	@XMM[2], 0x60($out)
+	lea	.LADD1(%rip), %r11
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+	paddd	0x70(%r11), @XMM[0]	# .LADD8
+	jnz	.Lctr_enc_loop
+
+	jmp	.Lctr_enc_done
+.align	16
+.Lctr_enc_loop_done:
+	add	\$8, $len
+	movdqu	0x00($inp), @XMM[8]	# load input
+	pxor	@XMM[8], @XMM[0]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	cmp	\$2,$len
+	jb	.Lctr_enc_done
+	movdqu	0x10($inp), @XMM[9]
+	pxor	@XMM[9], @XMM[1]
+	movdqu	@XMM[1], 0x10($out)
+	je	.Lctr_enc_done
+	movdqu	0x20($inp), @XMM[10]
+	pxor	@XMM[10], @XMM[4]
+	movdqu	@XMM[4], 0x20($out)
+	cmp	\$4,$len
+	jb	.Lctr_enc_done
+	movdqu	0x30($inp), @XMM[11]
+	pxor	@XMM[11], @XMM[6]
+	movdqu	@XMM[6], 0x30($out)
+	je	.Lctr_enc_done
+	movdqu	0x40($inp), @XMM[12]
+	pxor	@XMM[12], @XMM[3]
+	movdqu	@XMM[3], 0x40($out)
+	cmp	\$6,$len
+	jb	.Lctr_enc_done
+	movdqu	0x50($inp), @XMM[13]
+	pxor	@XMM[13], @XMM[7]
+	movdqu	@XMM[7], 0x50($out)
+	je	.Lctr_enc_done
+	movdqu	0x60($inp), @XMM[14]
+	pxor	@XMM[14], @XMM[2]
+	movdqu	@XMM[2], 0x60($out)
+	jmp	.Lctr_enc_done
+
+.align	16
+.Lctr_enc_short:
+	lea	0x20(%rbp), $arg1
+	lea	0x30(%rbp), $arg2
+	lea	($key), $arg3
+	call	asm_AES_encrypt
+	movdqu	($inp), @XMM[1]
+	lea	16($inp), $inp
+	mov	0x2c(%rbp), %eax	# load 32-bit counter
+	bswap	%eax
+	pxor	0x30(%rbp), @XMM[1]
+	inc	%eax			# increment
+	movdqu	@XMM[1], ($out)
+	bswap	%eax
+	lea	16($out), $out
+	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
+	dec	$len
+	jnz	.Lctr_enc_short
+
+.Lctr_enc_done:
+	lea	(%rsp), %rax
+	pxor	%xmm0, %xmm0
+.Lctr_enc_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	ja	.Lctr_enc_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lctr_enc_epilogue:
+	ret
+.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#	const AES_KEY *key1, const AES_KEY *key2,
+#	const unsigned char iv[16]);
+#
+my ($twmask,$twres,$twtmp)=@XMM[13..15];
+$code.=<<___;
+.globl	bsaes_xts_encrypt
+.type	bsaes_xts_encrypt,\@abi-omnipotent
+.align	16
+bsaes_xts_encrypt:
+	mov	%rsp, %rax
+.Lxts_enc_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+	mov	0xa0(%rsp),$arg5	# pull key2
+	mov	0xa8(%rsp),$arg6	# pull ivp
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+	mov	%rsp, %rbp		# backup %rsp
+	mov	$arg1, $inp		# backup arguments
+	mov	$arg2, $out
+	mov	$arg3, $len
+	mov	$arg4, $key
+
+	lea	($arg6), $arg1
+	lea	0x20(%rbp), $arg2
+	lea	($arg5), $arg3
+	call	asm_AES_encrypt		# generate initial tweak
+
+	mov	240($key), %eax		# rounds
+	mov	$len, %rbx		# backup $len
+
+	mov	%eax, %edx		# rounds
+	shl	\$7, %rax		# 128 bytes per inner round key
+	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
+	sub	%rax, %rsp
+
+	mov	%rsp, %rax		# pass key schedule
+	mov	$key, %rcx		# pass key
+	mov	%edx, %r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	%xmm6, %xmm7		# fix up last round key
+	movdqa	%xmm7, (%rax)		# save last round key
+
+	and	\$-16, $len
+	sub	\$0x80, %rsp		# place for tweak[8]
+	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
+
+	pxor	$twtmp, $twtmp
+	movdqa	.Lxts_magic(%rip), $twmask
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+
+	sub	\$0x80, $len
+	jc	.Lxts_enc_short
+	jmp	.Lxts_enc_loop
+
+.align	16
+.Lxts_enc_loop:
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	movdqa	@XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+    $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+	movdqu	0x60($inp), @XMM[8+6]
+	pxor	@XMM[8+5], @XMM[5]
+	movdqu	0x70($inp), @XMM[8+7]
+	lea	0x80($inp), $inp
+	movdqa	@XMM[7], 0x70(%rsp)
+	pxor	@XMM[8+6], @XMM[6]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	pxor	@XMM[8+7], @XMM[7]
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	pxor	0x40(%rsp), @XMM[3]
+	movdqu	@XMM[6], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[3], 0x40($out)
+	pxor	0x60(%rsp), @XMM[2]
+	movdqu	@XMM[7], 0x50($out)
+	pxor	0x70(%rsp), @XMM[5]
+	movdqu	@XMM[2], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+
+	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
+	pxor	$twtmp, $twtmp
+	movdqa	.Lxts_magic(%rip), $twmask
+	pcmpgtd	@XMM[7], $twtmp
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+
+	sub	\$0x80,$len
+	jnc	.Lxts_enc_loop
+
+.Lxts_enc_short:
+	add	\$0x80, $len
+	jz	.Lxts_enc_done
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	movdqa	@XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
+	cmp	\$`0x10*$i`,$len
+	je	.Lxts_enc_$i
+___
+    $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+	movdqu	0x60($inp), @XMM[8+6]
+	pxor	@XMM[8+5], @XMM[5]
+	movdqa	@XMM[7], 0x70(%rsp)
+	lea	0x70($inp), $inp
+	pxor	@XMM[8+6], @XMM[6]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	pxor	0x40(%rsp), @XMM[3]
+	movdqu	@XMM[6], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[3], 0x40($out)
+	pxor	0x60(%rsp), @XMM[2]
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[2], 0x60($out)
+	lea	0x70($out), $out
+
+	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_6:
+	pxor	@XMM[8+4], @XMM[4]
+	lea	0x60($inp), $inp
+	pxor	@XMM[8+5], @XMM[5]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	pxor	0x40(%rsp), @XMM[3]
+	movdqu	@XMM[6], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[3], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	lea	0x60($out), $out
+
+	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_5:
+	pxor	@XMM[8+3], @XMM[3]
+	lea	0x50($inp), $inp
+	pxor	@XMM[8+4], @XMM[4]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	pxor	0x40(%rsp), @XMM[3]
+	movdqu	@XMM[6], 0x30($out)
+	movdqu	@XMM[3], 0x40($out)
+	lea	0x50($out), $out
+
+	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_4:
+	pxor	@XMM[8+2], @XMM[2]
+	lea	0x40($inp), $inp
+	pxor	@XMM[8+3], @XMM[3]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[6]
+	movdqu	@XMM[4], 0x20($out)
+	movdqu	@XMM[6], 0x30($out)
+	lea	0x40($out), $out
+
+	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_3:
+	pxor	@XMM[8+1], @XMM[1]
+	lea	0x30($inp), $inp
+	pxor	@XMM[8+2], @XMM[2]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[4]
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[4], 0x20($out)
+	lea	0x30($out), $out
+
+	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_2:
+	pxor	@XMM[8+0], @XMM[0]
+	lea	0x20($inp), $inp
+	pxor	@XMM[8+1], @XMM[1]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_encrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	lea	0x20($out), $out
+
+	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_enc_done
+.align	16
+.Lxts_enc_1:
+	pxor	@XMM[0], @XMM[8]
+	lea	0x10($inp), $inp
+	movdqa	@XMM[8], 0x20(%rbp)
+	lea	0x20(%rbp), $arg1
+	lea	0x20(%rbp), $arg2
+	lea	($key), $arg3
+	call	asm_AES_encrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
+	#pxor	@XMM[8], @XMM[0]
+	#lea	0x80(%rsp), %rax	# pass key schedule
+	#mov	%edx, %r10d		# pass rounds
+	#call	_bsaes_encrypt8
+	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	lea	0x10($out), $out
+
+	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
+
+.Lxts_enc_done:
+	and	\$15, %ebx
+	jz	.Lxts_enc_ret
+	mov	$out, %rdx
+
+.Lxts_enc_steal:
+	movzb	($inp), %eax
+	movzb	-16(%rdx), %ecx
+	lea	1($inp), $inp
+	mov	%al, -16(%rdx)
+	mov	%cl, 0(%rdx)
+	lea	1(%rdx), %rdx
+	sub	\$1,%ebx
+	jnz	.Lxts_enc_steal
+
+	movdqu	-16($out), @XMM[0]
+	lea	0x20(%rbp), $arg1
+	pxor	@XMM[7], @XMM[0]
+	lea	0x20(%rbp), $arg2
+	movdqa	@XMM[0], 0x20(%rbp)
+	lea	($key), $arg3
+	call	asm_AES_encrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[7]
+	movdqu	@XMM[7], -16($out)
+
+.Lxts_enc_ret:
+	lea	(%rsp), %rax
+	pxor	%xmm0, %xmm0
+.Lxts_enc_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	ja	.Lxts_enc_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lxts_enc_epilogue:
+	ret
+.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl	bsaes_xts_decrypt
+.type	bsaes_xts_decrypt,\@abi-omnipotent
+.align	16
+bsaes_xts_decrypt:
+	mov	%rsp, %rax
+.Lxts_dec_prologue:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	lea	-0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+	mov	0xa0(%rsp),$arg5	# pull key2
+	mov	0xa8(%rsp),$arg6	# pull ivp
+	lea	-0xa0(%rsp), %rsp
+	movaps	%xmm6, 0x40(%rsp)
+	movaps	%xmm7, 0x50(%rsp)
+	movaps	%xmm8, 0x60(%rsp)
+	movaps	%xmm9, 0x70(%rsp)
+	movaps	%xmm10, 0x80(%rsp)
+	movaps	%xmm11, 0x90(%rsp)
+	movaps	%xmm12, 0xa0(%rsp)
+	movaps	%xmm13, 0xb0(%rsp)
+	movaps	%xmm14, 0xc0(%rsp)
+	movaps	%xmm15, 0xd0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+	mov	%rsp, %rbp		# backup %rsp
+	mov	$arg1, $inp		# backup arguments
+	mov	$arg2, $out
+	mov	$arg3, $len
+	mov	$arg4, $key
+
+	lea	($arg6), $arg1
+	lea	0x20(%rbp), $arg2
+	lea	($arg5), $arg3
+	call	asm_AES_encrypt		# generate initial tweak
+
+	mov	240($key), %eax		# rounds
+	mov	$len, %rbx		# backup $len
+
+	mov	%eax, %edx		# rounds
+	shl	\$7, %rax		# 128 bytes per inner round key
+	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
+	sub	%rax, %rsp
+
+	mov	%rsp, %rax		# pass key schedule
+	mov	$key, %rcx		# pass key
+	mov	%edx, %r10d		# pass rounds
+	call	_bsaes_key_convert
+	pxor	(%rsp), %xmm7		# fix up round 0 key
+	movdqa	%xmm6, (%rax)		# save last round key
+	movdqa	%xmm7, (%rsp)
+
+	xor	%eax, %eax		# if ($len%16) len-=16;
+	and	\$-16, $len
+	test	\$15, %ebx
+	setnz	%al
+	shl	\$4, %rax
+	sub	%rax, $len
+
+	sub	\$0x80, %rsp		# place for tweak[8]
+	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
+
+	pxor	$twtmp, $twtmp
+	movdqa	.Lxts_magic(%rip), $twmask
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+
+	sub	\$0x80, $len
+	jc	.Lxts_dec_short
+	jmp	.Lxts_dec_loop
+
+.align	16
+.Lxts_dec_loop:
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	movdqa	@XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+    $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+	movdqu	0x60($inp), @XMM[8+6]
+	pxor	@XMM[8+5], @XMM[5]
+	movdqu	0x70($inp), @XMM[8+7]
+	lea	0x80($inp), $inp
+	movdqa	@XMM[7], 0x70(%rsp)
+	pxor	@XMM[8+6], @XMM[6]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	pxor	@XMM[8+7], @XMM[7]
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[4]
+	movdqu	@XMM[6], 0x20($out)
+	pxor	0x40(%rsp), @XMM[2]
+	movdqu	@XMM[4], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[2], 0x40($out)
+	pxor	0x60(%rsp), @XMM[3]
+	movdqu	@XMM[7], 0x50($out)
+	pxor	0x70(%rsp), @XMM[5]
+	movdqu	@XMM[3], 0x60($out)
+	movdqu	@XMM[5], 0x70($out)
+	lea	0x80($out), $out
+
+	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
+	pxor	$twtmp, $twtmp
+	movdqa	.Lxts_magic(%rip), $twmask
+	pcmpgtd	@XMM[7], $twtmp
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+
+	sub	\$0x80,$len
+	jnc	.Lxts_dec_loop
+
+.Lxts_dec_short:
+	add	\$0x80, $len
+	jz	.Lxts_dec_done
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+	pshufd	\$0x13, $twtmp, $twres
+	pxor	$twtmp, $twtmp
+	movdqa	@XMM[7], @XMM[$i]
+	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
+	pxor	$twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
+	cmp	\$`0x10*$i`,$len
+	je	.Lxts_dec_$i
+___
+    $code.=<<___ if ($i>=2);
+	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+	movdqu	0x60($inp), @XMM[8+6]
+	pxor	@XMM[8+5], @XMM[5]
+	movdqa	@XMM[7], 0x70(%rsp)
+	lea	0x70($inp), $inp
+	pxor	@XMM[8+6], @XMM[6]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[4]
+	movdqu	@XMM[6], 0x20($out)
+	pxor	0x40(%rsp), @XMM[2]
+	movdqu	@XMM[4], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[2], 0x40($out)
+	pxor	0x60(%rsp), @XMM[3]
+	movdqu	@XMM[7], 0x50($out)
+	movdqu	@XMM[3], 0x60($out)
+	lea	0x70($out), $out
+
+	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_6:
+	pxor	@XMM[8+4], @XMM[4]
+	lea	0x60($inp), $inp
+	pxor	@XMM[8+5], @XMM[5]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[4]
+	movdqu	@XMM[6], 0x20($out)
+	pxor	0x40(%rsp), @XMM[2]
+	movdqu	@XMM[4], 0x30($out)
+	pxor	0x50(%rsp), @XMM[7]
+	movdqu	@XMM[2], 0x40($out)
+	movdqu	@XMM[7], 0x50($out)
+	lea	0x60($out), $out
+
+	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_5:
+	pxor	@XMM[8+3], @XMM[3]
+	lea	0x50($inp), $inp
+	pxor	@XMM[8+4], @XMM[4]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[4]
+	movdqu	@XMM[6], 0x20($out)
+	pxor	0x40(%rsp), @XMM[2]
+	movdqu	@XMM[4], 0x30($out)
+	movdqu	@XMM[2], 0x40($out)
+	lea	0x50($out), $out
+
+	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_4:
+	pxor	@XMM[8+2], @XMM[2]
+	lea	0x40($inp), $inp
+	pxor	@XMM[8+3], @XMM[3]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	pxor	0x30(%rsp), @XMM[4]
+	movdqu	@XMM[6], 0x20($out)
+	movdqu	@XMM[4], 0x30($out)
+	lea	0x40($out), $out
+
+	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_3:
+	pxor	@XMM[8+1], @XMM[1]
+	lea	0x30($inp), $inp
+	pxor	@XMM[8+2], @XMM[2]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	pxor	0x20(%rsp), @XMM[6]
+	movdqu	@XMM[1], 0x10($out)
+	movdqu	@XMM[6], 0x20($out)
+	lea	0x30($out), $out
+
+	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_2:
+	pxor	@XMM[8+0], @XMM[0]
+	lea	0x20($inp), $inp
+	pxor	@XMM[8+1], @XMM[1]
+	lea	0x80(%rsp), %rax	# pass key schedule
+	mov	%edx, %r10d		# pass rounds
+
+	call	_bsaes_decrypt8
+
+	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	pxor	0x10(%rsp), @XMM[1]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	movdqu	@XMM[1], 0x10($out)
+	lea	0x20($out), $out
+
+	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
+	jmp	.Lxts_dec_done
+.align	16
+.Lxts_dec_1:
+	pxor	@XMM[0], @XMM[8]
+	lea	0x10($inp), $inp
+	movdqa	@XMM[8], 0x20(%rbp)
+	lea	0x20(%rbp), $arg1
+	lea	0x20(%rbp), $arg2
+	lea	($key), $arg3
+	call	asm_AES_decrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
+	#pxor	@XMM[8], @XMM[0]
+	#lea	0x80(%rsp), %rax	# pass key schedule
+	#mov	%edx, %r10d		# pass rounds
+	#call	_bsaes_decrypt8
+	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
+	movdqu	@XMM[0], 0x00($out)	# write output
+	lea	0x10($out), $out
+
+	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
+
+.Lxts_dec_done:
+	and	\$15, %ebx
+	jz	.Lxts_dec_ret
+
+	pxor	$twtmp, $twtmp
+	movdqa	.Lxts_magic(%rip), $twmask
+	pcmpgtd	@XMM[7], $twtmp
+	pshufd	\$0x13, $twtmp, $twres
+	movdqa	@XMM[7], @XMM[6]
+	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
+	pand	$twmask, $twres		# isolate carry and residue
+	movdqu	($inp), @XMM[0]
+	pxor	$twres, @XMM[7]
+
+	lea	0x20(%rbp), $arg1
+	pxor	@XMM[7], @XMM[0]
+	lea	0x20(%rbp), $arg2
+	movdqa	@XMM[0], 0x20(%rbp)
+	lea	($key), $arg3
+	call	asm_AES_decrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[7]
+	mov	$out, %rdx
+	movdqu	@XMM[7], ($out)
+
+.Lxts_dec_steal:
+	movzb	16($inp), %eax
+	movzb	(%rdx), %ecx
+	lea	1($inp), $inp
+	mov	%al, (%rdx)
+	mov	%cl, 16(%rdx)
+	lea	1(%rdx), %rdx
+	sub	\$1,%ebx
+	jnz	.Lxts_dec_steal
+
+	movdqu	($out), @XMM[0]
+	lea	0x20(%rbp), $arg1
+	pxor	@XMM[6], @XMM[0]
+	lea	0x20(%rbp), $arg2
+	movdqa	@XMM[0], 0x20(%rbp)
+	lea	($key), $arg3
+	call	asm_AES_decrypt		# doesn't touch %xmm
+	pxor	0x20(%rbp), @XMM[6]
+	movdqu	@XMM[6], ($out)
+
+.Lxts_dec_ret:
+	lea	(%rsp), %rax
+	pxor	%xmm0, %xmm0
+.Lxts_dec_bzero:			# wipe key schedule [if any]
+	movdqa	%xmm0, 0x00(%rax)
+	movdqa	%xmm0, 0x10(%rax)
+	lea	0x20(%rax), %rax
+	cmp	%rax, %rbp
+	ja	.Lxts_dec_bzero
+
+	lea	(%rbp),%rsp		# restore %rsp
+___
+$code.=<<___ if ($win64);
+	movaps	0x40(%rbp), %xmm6
+	movaps	0x50(%rbp), %xmm7
+	movaps	0x60(%rbp), %xmm8
+	movaps	0x70(%rbp), %xmm9
+	movaps	0x80(%rbp), %xmm10
+	movaps	0x90(%rbp), %xmm11
+	movaps	0xa0(%rbp), %xmm12
+	movaps	0xb0(%rbp), %xmm13
+	movaps	0xc0(%rbp), %xmm14
+	movaps	0xd0(%rbp), %xmm15
+	lea	0xa0(%rbp), %rsp
+___
+$code.=<<___;
+	mov	0x48(%rsp), %r15
+	mov	0x50(%rsp), %r14
+	mov	0x58(%rsp), %r13
+	mov	0x60(%rsp), %r12
+	mov	0x68(%rsp), %rbx
+	mov	0x70(%rsp), %rax
+	lea	0x78(%rsp), %rsp
+	mov	%rax, %rbp
+.Lxts_dec_epilogue:
+	ret
+.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+.type	_bsaes_const,\@object
+.align	64
+_bsaes_const:
+.LM0ISR:	# InvShiftRows constants
+	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:
+	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:
+	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0:		# bit-slice constants
+	.quad	0x5555555555555555, 0x5555555555555555
+.LBS1:
+	.quad	0x3333333333333333, 0x3333333333333333
+.LBS2:
+	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR:		# shiftrows constants
+	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:
+	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0SR:
+	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+.LSWPUP:	# byte-swap upper dword
+	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:
+	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1:		# counter increment constants
+	.quad	0x0000000000000000, 0x0000000100000000
+.LADD2:
+	.quad	0x0000000000000000, 0x0000000200000000
+.LADD3:
+	.quad	0x0000000000000000, 0x0000000300000000
+.LADD4:
+	.quad	0x0000000000000000, 0x0000000400000000
+.LADD5:
+	.quad	0x0000000000000000, 0x0000000500000000
+.LADD6:
+	.quad	0x0000000000000000, 0x0000000600000000
+.LADD7:
+	.quad	0x0000000000000000, 0x0000000700000000
+.LADD8:
+	.quad	0x0000000000000000, 0x0000000800000000
+.Lxts_magic:
+	.long	0x87,0,1,0
+.Lmasks:
+	.quad	0x0101010101010101, 0x0101010101010101
+	.quad	0x0202020202020202, 0x0202020202020202
+	.quad	0x0404040404040404, 0x0404040404040404
+	.quad	0x0808080808080808, 0x0808080808080808
+.LM0:
+	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+.L63:
+	.quad	0x6363636363636363, 0x6363636363636363
+.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
+.align	64
+.size	_bsaes_const,.-_bsaes_const
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+
+	mov	160($context),%rax	# pull context->Rbp
+
+	lea	0x40(%rax),%rsi		# %xmm save area
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0xa0(%rax),%rax		# adjust stack pointer
+
+	mov	0x70(%rax),%rbp
+	mov	0x68(%rax),%rbx
+	mov	0x60(%rax),%r12
+	mov	0x58(%rax),%r13
+	mov	0x50(%rax),%r14
+	mov	0x48(%rax),%r15
+	lea	0x78(%rax),%rax		# adjust stack pointer
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	%rax,152($context)	# restore context->Rsp
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+___
+$code.=<<___ if ($ecb);
+	.rva	.Lecb_enc_prologue
+	.rva	.Lecb_enc_epilogue
+	.rva	.Lecb_enc_info
+
+	.rva	.Lecb_dec_prologue
+	.rva	.Lecb_dec_epilogue
+	.rva	.Lecb_dec_info
+___
+$code.=<<___;
+	.rva	.Lcbc_dec_prologue
+	.rva	.Lcbc_dec_epilogue
+	.rva	.Lcbc_dec_info
+
+	.rva	.Lctr_enc_prologue
+	.rva	.Lctr_enc_epilogue
+	.rva	.Lctr_enc_info
+
+	.rva	.Lxts_enc_prologue
+	.rva	.Lxts_enc_epilogue
+	.rva	.Lxts_enc_info
+
+	.rva	.Lxts_dec_prologue
+	.rva	.Lxts_dec_epilogue
+	.rva	.Lxts_dec_info
+
+.section	.xdata
+.align	8
+___
+$code.=<<___ if ($ecb);
+.Lecb_enc_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
+.Lecb_dec_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
+___
+$code.=<<___;
+.Lcbc_dec_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
+.Lctr_enc_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
+.Lxts_enc_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
+.Lxts_dec_info:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl
new file mode 100644
index 0000000000..1533e2c304
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/vpaes-x86.pl
@@ -0,0 +1,903 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
+# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original nor does it make assumption
+# about its alignment...
+#
+# Performance summary. aes-586.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86.pl column - [also
+# large-block CBC] encrypt/decrypt.
+#
+#		aes-586.pl		vpaes-x86.pl
+#
+# Core 2(**)	29.1/42.3/18.3		22.0/25.6(***)
+# Nehalem	27.9/40.4/18.1		10.3/12.0
+# Atom		102./119./60.1		64.5/85.3(***)
+#
+# (*)	"Hyper-threading" in the context refers rather to cache shared
+#	among multiple cores, than to specifically Intel HTT. As vast
+#	majority of contemporary cores share cache, slower code path
+#	is common place. In other words "with-hyper-threading-off"
+#	results are presented mostly for reference purposes.
+#
+# (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***)	Less impressive improvement on Core 2 and Atom is due to slow
+#	pshufb,	yet it's respectable +32%/65%  improvement on Core 2
+#	and +58%/40% on Atom (as implied, over "hyper-threading-safe"
+#	code path).
+#
+#						<appro@openssl.org>
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$PREFIX="vpaes";
+
+my  ($round, $base, $magic, $key, $const, $inp, $out)=
+    ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");
+
+&static_label("_vpaes_consts");
+&static_label("_vpaes_schedule_low_round");
+
+&set_label("_vpaes_consts",64);
+$k_inv=-0x30;		# inv, inva
+	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
+	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
+
+$k_s0F=-0x10;		# s0F
+	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
+
+$k_ipt=0x00;		# input transform (lo, hi)
+	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
+	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
+
+$k_sb1=0x20;		# sb1u, sb1t
+	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
+	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
+$k_sb2=0x40;		# sb2u, sb2t
+	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
+	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
+$k_sbo=0x60;		# sbou, sbot
+	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
+	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
+
+$k_mc_forward=0x80;	# mc_forward
+	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
+	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
+	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
+	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
+
+$k_mc_backward=0xc0;	# mc_backward
+	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
+	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
+	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
+	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
+
+$k_sr=0x100;		# sr
+	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
+	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
+	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
+	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
+
+$k_rcon=0x140;		# rcon
+	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
+
+$k_s63=0x150;		# s63: all equal to 0x63 transformed
+	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
+
+$k_opt=0x160;		# output transform
+	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
+	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
+
+$k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
+	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
+	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
+##
+##  Decryption stuff
+##  Key schedule constants
+##
+$k_dksd=0x1a0;		# decryption key schedule: invskew x*D
+	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
+	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
+$k_dksb=0x1c0;		# decryption key schedule: invskew x*B
+	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
+	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
+$k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
+	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
+	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
+$k_dks9=0x200;		# decryption key schedule: invskew x*9
+	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
+	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
+
+##
+##  Decryption stuff
+##  Round function constants
+##
+$k_dipt=0x220;		# decryption input transform
+	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
+	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
+
+$k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
+	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
+	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
+$k_dsbd=0x260;		# decryption sbox output *D*u, *D*t
+	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
+	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
+$k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
+	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
+	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
+$k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
+	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
+	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
+$k_dsbo=0x2c0;		# decryption sbox final output
+	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
+	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
+&asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
+&align	(64);
+
+&function_begin_B("_vpaes_preheat");
+	&add	($const,&DWP(0,"esp"));
+	&movdqa	("xmm7",&QWP($k_inv,$const));
+	&movdqa	("xmm6",&QWP($k_s0F,$const));
+	&ret	();
+&function_end_B("_vpaes_preheat");
+
+##
+##  _aes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm6-%xmm7 as in _vpaes_preheat
+##    (%edx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
+##
+##
+&function_begin_B("_vpaes_encrypt_core");
+	&mov	($magic,16);
+	&mov	($round,&DWP(240,$key));
+	&movdqa	("xmm1","xmm6")
+	&movdqa	("xmm2",&QWP($k_ipt,$const));
+	&pandn	("xmm1","xmm0");
+	&movdqu	("xmm5",&QWP(0,$key));
+	&psrld	("xmm1",4);
+	&pand	("xmm0","xmm6");
+	&pshufb	("xmm2","xmm0");
+	&movdqa	("xmm0",&QWP($k_ipt+16,$const));
+	&pshufb	("xmm0","xmm1");
+	&pxor	("xmm2","xmm5");
+	&pxor	("xmm0","xmm2");
+	&add	($key,16);
+	&lea	($base,&DWP($k_mc_backward,$const));
+	&jmp	(&label("enc_entry"));
+
+
+&set_label("enc_loop",16);
+	# middle of middle round
+	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sb1u
+	&pshufb	("xmm4","xmm2");		# 4 = sb1u
+	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
+	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
+	&pshufb	("xmm0","xmm3");		# 0 = sb1t
+	&pxor	("xmm0","xmm4");		# 0 = A
+	&movdqa	("xmm5",&QWP($k_sb2,$const));	# 4 : sb2u
+	&pshufb	("xmm5","xmm2");		# 4 = sb2u
+	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
+	&movdqa	("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
+	&pshufb	("xmm2","xmm3");		# 2 = sb2t
+	&pxor	("xmm2","xmm5");		# 2 = 2A
+	&movdqa	("xmm4",&QWP(0,$base,$magic));	# .Lk_mc_backward[]
+	&movdqa	("xmm3","xmm0");		# 3 = A
+	&pshufb	("xmm0","xmm1");		# 0 = B
+	&add	($key,16);			# next key
+	&pxor	("xmm0","xmm2");		# 0 = 2A+B
+	&pshufb	("xmm3","xmm4");		# 3 = D
+	&add	($magic,16);			# next mc
+	&pxor	("xmm3","xmm0");		# 3 = 2A+B+D
+	&pshufb	("xmm0","xmm1");		# 0 = 2B+C
+	&and	($magic,0x30);			# ... mod 4
+	&pxor	("xmm0","xmm3");		# 0 = 2A+3B+C+D
+	&sub	($round,1);			# nr--
+
+&set_label("enc_entry");
+	# top of round
+	&movdqa	("xmm1","xmm6");		# 1 : i
+	&pandn	("xmm1","xmm0");		# 1 = i<<4
+	&psrld	("xmm1",4);			# 1 = i
+	&pand	("xmm0","xmm6");		# 0 = k
+	&movdqa	("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
+	&pshufb	("xmm5","xmm0");		# 2 = a/k
+	&pxor	("xmm0","xmm1");		# 0 = j
+	&movdqa	("xmm3","xmm7");		# 3 : 1/i
+	&pshufb	("xmm3","xmm1");		# 3 = 1/i
+	&pxor	("xmm3","xmm5");		# 3 = iak = 1/i + a/k
+	&movdqa	("xmm4","xmm7");		# 4 : 1/j
+	&pshufb	("xmm4","xmm0");		# 4 = 1/j
+	&pxor	("xmm4","xmm5");		# 4 = jak = 1/j + a/k
+	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
+	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
+	&pxor	("xmm2","xmm0");		# 2 = io
+	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
+	&movdqu	("xmm5",&QWP(0,$key));
+	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
+	&pxor	("xmm3","xmm1");		# 3 = jo
+	&jnz	(&label("enc_loop"));
+
+	# middle of last round
+	&movdqa	("xmm4",&QWP($k_sbo,$const));	# 3 : sbou      .Lk_sbo
+	&movdqa	("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot      .Lk_sbo+16
+	&pshufb	("xmm4","xmm2");		# 4 = sbou
+	&pxor	("xmm4","xmm5");		# 4 = sb1u + k
+	&pshufb	("xmm0","xmm3");		# 0 = sb1t
+	&movdqa	("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
+	&pxor	("xmm0","xmm4");		# 0 = A
+	&pshufb	("xmm0","xmm1");
+	&ret	();
+&function_end_B("_vpaes_encrypt_core");
+
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
+&function_begin_B("_vpaes_decrypt_core");
+	&mov	($round,&DWP(240,$key));
+	&lea	($base,&DWP($k_dsbd,$const));
+	&movdqa	("xmm1","xmm6");
+	&movdqa	("xmm2",&QWP($k_dipt-$k_dsbd,$base));
+	&pandn	("xmm1","xmm0");
+	&mov	($magic,$round);
+	&psrld	("xmm1",4)
+	&movdqu	("xmm5",&QWP(0,$key));
+	&shl	($magic,4);
+	&pand	("xmm0","xmm6");
+	&pshufb	("xmm2","xmm0");
+	&movdqa	("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
+	&xor	($magic,0x30);
+	&pshufb	("xmm0","xmm1");
+	&and	($magic,0x30);
+	&pxor	("xmm2","xmm5");
+	&movdqa	("xmm5",&QWP($k_mc_forward+48,$const));
+	&pxor	("xmm0","xmm2");
+	&add	($key,16);
+	&lea	($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
+	&jmp	(&label("dec_entry"));
+
+&set_label("dec_loop",16);
+##
+##  Inverse mix columns
+##
+	&movdqa	("xmm4",&QWP(-0x20,$base));	# 4 : sb9u
+	&pshufb	("xmm4","xmm2");		# 4 = sb9u
+	&pxor	("xmm4","xmm0");
+	&movdqa	("xmm0",&QWP(-0x10,$base));	# 0 : sb9t
+	&pshufb	("xmm0","xmm3");		# 0 = sb9t
+	&pxor	("xmm0","xmm4");		# 0 = ch
+	&add	($key,16);			# next round key
+
+	&pshufb	("xmm0","xmm5");		# MC ch
+	&movdqa	("xmm4",&QWP(0,$base));		# 4 : sbdu
+	&pshufb	("xmm4","xmm2");		# 4 = sbdu
+	&pxor	("xmm4","xmm0");		# 4 = ch
+	&movdqa	("xmm0",&QWP(0x10,$base));	# 0 : sbdt
+	&pshufb	("xmm0","xmm3");		# 0 = sbdt
+	&pxor	("xmm0","xmm4");		# 0 = ch
+	&sub	($round,1);			# nr--
+
+	&pshufb	("xmm0","xmm5");		# MC ch
+	&movdqa	("xmm4",&QWP(0x20,$base));	# 4 : sbbu
+	&pshufb	("xmm4","xmm2");		# 4 = sbbu
+	&pxor	("xmm4","xmm0");		# 4 = ch
+	&movdqa	("xmm0",&QWP(0x30,$base));	# 0 : sbbt
+	&pshufb	("xmm0","xmm3");		# 0 = sbbt
+	&pxor	("xmm0","xmm4");		# 0 = ch
+
+	&pshufb	("xmm0","xmm5");		# MC ch
+	&movdqa	("xmm4",&QWP(0x40,$base));	# 4 : sbeu
+	&pshufb	("xmm4","xmm2");		# 4 = sbeu
+	&pxor	("xmm4","xmm0");		# 4 = ch
+	&movdqa	("xmm0",&QWP(0x50,$base));	# 0 : sbet
+	&pshufb	("xmm0","xmm3");		# 0 = sbet
+	&pxor	("xmm0","xmm4");		# 0 = ch
+
+	&palignr("xmm5","xmm5",12);
+
+&set_label("dec_entry");
+	# top of round
+	&movdqa	("xmm1","xmm6");		# 1 : i
+	&pandn	("xmm1","xmm0");		# 1 = i<<4
+	&psrld	("xmm1",4);			# 1 = i
+	&pand	("xmm0","xmm6");		# 0 = k
+	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+	&pshufb	("xmm2","xmm0");		# 2 = a/k
+	&pxor	("xmm0","xmm1");		# 0 = j
+	&movdqa	("xmm3","xmm7");		# 3 : 1/i
+	&pshufb	("xmm3","xmm1");		# 3 = 1/i
+	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
+	&movdqa	("xmm4","xmm7");		# 4 : 1/j
+	&pshufb	("xmm4","xmm0");		# 4 = 1/j
+	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
+	&movdqa	("xmm2","xmm7");		# 2 : 1/iak
+	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
+	&pxor	("xmm2","xmm0");		# 2 = io
+	&movdqa	("xmm3","xmm7");		# 3 : 1/jak
+	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
+	&pxor	("xmm3","xmm1");		# 3 = jo
+	&movdqu	("xmm0",&QWP(0,$key));
+	&jnz	(&label("dec_loop"));
+
+	# middle of last round
+	&movdqa	("xmm4",&QWP(0x60,$base));	# 3 : sbou
+	&pshufb	("xmm4","xmm2");		# 4 = sbou
+	&pxor	("xmm4","xmm0");		# 4 = sb1u + k
+	&movdqa	("xmm0",&QWP(0x70,$base));	# 0 : sbot
+	&movdqa	("xmm2",&QWP(0,$magic));
+	&pshufb	("xmm0","xmm3");		# 0 = sb1t
+	&pxor	("xmm0","xmm4");		# 0 = A
+	&pshufb	("xmm0","xmm2");
+	&ret	();
+&function_end_B("_vpaes_decrypt_core");
+
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+&function_begin_B("_vpaes_schedule_core");
+	&add	($const,&DWP(0,"esp"));
+	&movdqu	("xmm0",&QWP(0,$inp));		# load key (unaligned)
+	&movdqa	("xmm2",&QWP($k_rcon,$const));	# load rcon
+
+	# input transform
+	&movdqa	("xmm3","xmm0");
+	&lea	($base,&DWP($k_ipt,$const));
+	&movdqa	(&QWP(4,"esp"),"xmm2");		# xmm8
+	&call	("_vpaes_schedule_transform");
+	&movdqa	("xmm7","xmm0");
+
+	&test	($out,$out);
+	&jnz	(&label("schedule_am_decrypting"));
+
+	# encrypting, output zeroth round key after transform
+	&movdqu	(&QWP(0,$key),"xmm0");
+	&jmp	(&label("schedule_go"));
+
+&set_label("schedule_am_decrypting");
+	# decrypting, output zeroth round key after shiftrows
+	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
+	&pshufb	("xmm3","xmm1");
+	&movdqu	(&QWP(0,$key),"xmm3");
+	&xor	($magic,0x30);
+
+&set_label("schedule_go");
+	&cmp	($round,192);
+	&ja	(&label("schedule_256"));
+	&je	(&label("schedule_192"));
+	# 128: fall though
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+&set_label("schedule_128");
+	&mov	($round,10);
+
+&set_label("loop_schedule_128");
+	&call	("_vpaes_schedule_round");
+	&dec	($round);
+	&jz	(&label("schedule_mangle_last"));
+	&call	("_vpaes_schedule_mangle");	# write output
+	&jmp	(&label("loop_schedule_128"));
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
+&set_label("schedule_192",16);
+	&movdqu	("xmm0",&QWP(8,$inp));		# load key part 2 (very unaligned)
+	&call	("_vpaes_schedule_transform");	# input transform	
+	&movdqa	("xmm6","xmm0");		# save short part
+	&pxor	("xmm4","xmm4");		# clear 4
+	&movhlps("xmm6","xmm4");		# clobber low side with zeros
+	&mov	($round,4);
+
+&set_label("loop_schedule_192");
+	&call	("_vpaes_schedule_round");
+	&palignr("xmm0","xmm6",8);
+	&call	("_vpaes_schedule_mangle");	# save key n
+	&call	("_vpaes_schedule_192_smear");
+	&call	("_vpaes_schedule_mangle");	# save key n+1
+	&call	("_vpaes_schedule_round");
+	&dec	($round);
+	&jz	(&label("schedule_mangle_last"));
+	&call	("_vpaes_schedule_mangle");	# save key n+2
+	&call	("_vpaes_schedule_192_smear");
+	&jmp	(&label("loop_schedule_192"));
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
+&set_label("schedule_256",16);
+	&movdqu	("xmm0",&QWP(16,$inp));		# load key part 2 (unaligned)
+	&call	("_vpaes_schedule_transform");	# input transform	
+	&mov	($round,7);
+
+&set_label("loop_schedule_256");
+	&call	("_vpaes_schedule_mangle");	# output low result
+	&movdqa	("xmm6","xmm0");		# save cur_lo in xmm6
+
+	# high round
+	&call	("_vpaes_schedule_round");
+	&dec	($round);
+	&jz	(&label("schedule_mangle_last"));
+	&call	("_vpaes_schedule_mangle");	
+
+	# low round. swap xmm7 and xmm6
+	&pshufd	("xmm0","xmm0",0xFF);
+	&movdqa	(&QWP(20,"esp"),"xmm7");
+	&movdqa	("xmm7","xmm6");
+	&call	("_vpaes_schedule_low_round");
+	&movdqa	("xmm7",&QWP(20,"esp"));
+
+	&jmp	(&label("loop_schedule_256"));
+
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+&set_label("schedule_mangle_last",16);
+	# schedule last round key from xmm0
+	&lea	($base,&DWP($k_deskew,$const));
+	&test	($out,$out);
+	&jnz	(&label("schedule_mangle_last_dec"));
+
+	# encrypting
+	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
+	&pshufb	("xmm0","xmm1");		# output permute
+	&lea	($base,&DWP($k_opt,$const));	# prepare to output transform
+	&add	($key,32);
+
+&set_label("schedule_mangle_last_dec");
+	&add	($key,-16);
+	&pxor	("xmm0",&QWP($k_s63,$const));
+	&call	("_vpaes_schedule_transform");	# output transform
+	&movdqu	(&QWP(0,$key),"xmm0");		# save last key
+
+	# cleanup
+	&pxor	("xmm0","xmm0");
+	&pxor	("xmm1","xmm1");
+	&pxor	("xmm2","xmm2");
+	&pxor	("xmm3","xmm3");
+	&pxor	("xmm4","xmm4");
+	&pxor	("xmm5","xmm5");
+	&pxor	("xmm6","xmm6");
+	&pxor	("xmm7","xmm7");
+	&ret	();
+&function_end_B("_vpaes_schedule_core");
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
+&function_begin_B("_vpaes_schedule_192_smear");
+	&pshufd	("xmm0","xmm6",0x80);		# d c 0 0 -> c 0 0 0
+	&pxor	("xmm6","xmm0");		# -> c+d c 0 0
+	&pshufd	("xmm0","xmm7",0xFE);		# b a _ _ -> b b b a
+	&pxor	("xmm6","xmm0");		# -> b+c+d b+c b a
+	&movdqa	("xmm0","xmm6");
+	&pxor	("xmm1","xmm1");
+	&movhlps("xmm6","xmm1");		# clobber low side with zeros
+	&ret	();
+&function_end_B("_vpaes_schedule_192_smear");
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm5.
+##
+&function_begin_B("_vpaes_schedule_round");
+	# extract rcon from xmm8
+	&movdqa	("xmm2",&QWP(8,"esp"));		# xmm8
+	&pxor	("xmm1","xmm1");
+	&palignr("xmm1","xmm2",15);
+	&palignr("xmm2","xmm2",15);
+	&pxor	("xmm7","xmm1");
+
+	# rotate
+	&pshufd	("xmm0","xmm0",0xFF);
+	&palignr("xmm0","xmm0",1);
+
+	# fall through...
+	&movdqa	(&QWP(8,"esp"),"xmm2");		# xmm8
+
+	# low round: same as high round, but no rotation and no rcon.
+&set_label("_vpaes_schedule_low_round");
+	# smear xmm7
+	&movdqa	("xmm1","xmm7");
+	&pslldq	("xmm7",4);
+	&pxor	("xmm7","xmm1");
+	&movdqa	("xmm1","xmm7");
+	&pslldq	("xmm7",8);
+	&pxor	("xmm7","xmm1");
+	&pxor	("xmm7",&QWP($k_s63,$const));
+
+	# subbyte
+	&movdqa	("xmm4",&QWP($k_s0F,$const));
+	&movdqa	("xmm5",&QWP($k_inv,$const));	# 4 : 1/j
+	&movdqa	("xmm1","xmm4");	
+	&pandn	("xmm1","xmm0");
+	&psrld	("xmm1",4);			# 1 = i
+	&pand	("xmm0","xmm4");		# 0 = k
+	&movdqa	("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
+	&pshufb	("xmm2","xmm0");		# 2 = a/k
+	&pxor	("xmm0","xmm1");		# 0 = j
+	&movdqa	("xmm3","xmm5");		# 3 : 1/i
+	&pshufb	("xmm3","xmm1");		# 3 = 1/i
+	&pxor	("xmm3","xmm2");		# 3 = iak = 1/i + a/k
+	&movdqa	("xmm4","xmm5");		# 4 : 1/j
+	&pshufb	("xmm4","xmm0");		# 4 = 1/j
+	&pxor	("xmm4","xmm2");		# 4 = jak = 1/j + a/k
+	&movdqa	("xmm2","xmm5");		# 2 : 1/iak
+	&pshufb	("xmm2","xmm3");		# 2 = 1/iak
+	&pxor	("xmm2","xmm0");		# 2 = io
+	&movdqa	("xmm3","xmm5");		# 3 : 1/jak
+	&pshufb	("xmm3","xmm4");		# 3 = 1/jak
+	&pxor	("xmm3","xmm1");		# 3 = jo
+	&movdqa	("xmm4",&QWP($k_sb1,$const));	# 4 : sbou
+	&pshufb	("xmm4","xmm2");		# 4 = sbou
+	&movdqa	("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
+	&pshufb	("xmm0","xmm3");		# 0 = sb1t
+	&pxor	("xmm0","xmm4");		# 0 = sbox output
+
+	# add in smeared stuff
+	&pxor	("xmm0","xmm7");
+	&movdqa	("xmm7","xmm0");
+	&ret	();
+&function_end_B("_vpaes_schedule_round");
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%ebx)
+##
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
+&function_begin_B("_vpaes_schedule_transform");
+	&movdqa	("xmm2",&QWP($k_s0F,$const));
+	&movdqa	("xmm1","xmm2");
+	&pandn	("xmm1","xmm0");
+	&psrld	("xmm1",4);
+	&pand	("xmm0","xmm2");
+	&movdqa	("xmm2",&QWP(0,$base));
+	&pshufb	("xmm2","xmm0");
+	&movdqa	("xmm0",&QWP(16,$base));
+	&pshufb	("xmm0","xmm1");
+	&pxor	("xmm0","xmm2");
+	&ret	();
+&function_end_B("_vpaes_schedule_transform");
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%edx), and increments or decrements it
+##  Keeps track of round number mod 4 in %ecx
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
+&function_begin_B("_vpaes_schedule_mangle");
+	&movdqa	("xmm4","xmm0");	# save xmm0 for later
+	&movdqa	("xmm5",&QWP($k_mc_forward,$const));
+	&test	($out,$out);
+	&jnz	(&label("schedule_mangle_dec"));
+
+	# encrypting
+	&add	($key,16);
+	&pxor	("xmm4",&QWP($k_s63,$const));
+	&pshufb	("xmm4","xmm5");
+	&movdqa	("xmm3","xmm4");
+	&pshufb	("xmm4","xmm5");
+	&pxor	("xmm3","xmm4");
+	&pshufb	("xmm4","xmm5");
+	&pxor	("xmm3","xmm4");
+
+	&jmp	(&label("schedule_mangle_both"));
+
+&set_label("schedule_mangle_dec",16);
+	# inverse mix columns
+	&movdqa	("xmm2",&QWP($k_s0F,$const));
+	&lea	($inp,&DWP($k_dksd,$const));
+	&movdqa	("xmm1","xmm2");
+	&pandn	("xmm1","xmm4");
+	&psrld	("xmm1",4);			# 1 = hi
+	&pand	("xmm4","xmm2");		# 4 = lo
+
+	&movdqa	("xmm2",&QWP(0,$inp));
+	&pshufb	("xmm2","xmm4");
+	&movdqa	("xmm3",&QWP(0x10,$inp));
+	&pshufb	("xmm3","xmm1");
+	&pxor	("xmm3","xmm2");
+	&pshufb	("xmm3","xmm5");
+
+	&movdqa	("xmm2",&QWP(0x20,$inp));
+	&pshufb	("xmm2","xmm4");
+	&pxor	("xmm2","xmm3");
+	&movdqa	("xmm3",&QWP(0x30,$inp));
+	&pshufb	("xmm3","xmm1");
+	&pxor	("xmm3","xmm2");
+	&pshufb	("xmm3","xmm5");
+
+	&movdqa	("xmm2",&QWP(0x40,$inp));
+	&pshufb	("xmm2","xmm4");
+	&pxor	("xmm2","xmm3");
+	&movdqa	("xmm3",&QWP(0x50,$inp));
+	&pshufb	("xmm3","xmm1");
+	&pxor	("xmm3","xmm2");
+	&pshufb	("xmm3","xmm5");
+
+	&movdqa	("xmm2",&QWP(0x60,$inp));
+	&pshufb	("xmm2","xmm4");
+	&pxor	("xmm2","xmm3");
+	&movdqa	("xmm3",&QWP(0x70,$inp));
+	&pshufb	("xmm3","xmm1");
+	&pxor	("xmm3","xmm2");
+
+	&add	($key,-16);
+
+&set_label("schedule_mangle_both");
+	&movdqa	("xmm1",&QWP($k_sr,$const,$magic));
+	&pshufb	("xmm3","xmm1");
+	&add	($magic,-16);
+	&and	($magic,0x30);
+	&movdqu	(&QWP(0,$key),"xmm3");
+	&ret	();
+&function_end_B("_vpaes_schedule_mangle");
+
+#
+# Interface to OpenSSL
+#
+&function_begin("${PREFIX}_set_encrypt_key");
+	&mov	($inp,&wparam(0));		# inp
+	&lea	($base,&DWP(-56,"esp"));
+	&mov	($round,&wparam(1));		# bits
+	&and	($base,-16);
+	&mov	($key,&wparam(2));		# key
+	&xchg	($base,"esp");			# alloca
+	&mov	(&DWP(48,"esp"),$base);
+
+	&mov	($base,$round);
+	&shr	($base,5);
+	&add	($base,5);
+	&mov	(&DWP(240,$key),$base);		# AES_KEY->rounds = nbits/32+5;
+	&mov	($magic,0x30);
+	&mov	($out,0);
+
+	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+	&call	("_vpaes_schedule_core");
+&set_label("pic_point");
+
+	&mov	("esp",&DWP(48,"esp"));
+	&xor	("eax","eax");
+&function_end("${PREFIX}_set_encrypt_key");
+
+&function_begin("${PREFIX}_set_decrypt_key");
+	&mov	($inp,&wparam(0));		# inp
+	&lea	($base,&DWP(-56,"esp"));
+	&mov	($round,&wparam(1));		# bits
+	&and	($base,-16);
+	&mov	($key,&wparam(2));		# key
+	&xchg	($base,"esp");			# alloca
+	&mov	(&DWP(48,"esp"),$base);
+
+	&mov	($base,$round);
+	&shr	($base,5);
+	&add	($base,5);
+	&mov	(&DWP(240,$key),$base);	# AES_KEY->rounds = nbits/32+5;
+	&shl	($base,4);
+	&lea	($key,&DWP(16,$key,$base));
+
+	&mov	($out,1);
+	&mov	($magic,$round);
+	&shr	($magic,1);
+	&and	($magic,32);
+	&xor	($magic,32);			# nbist==192?0:32;
+
+	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+	&call	("_vpaes_schedule_core");
+&set_label("pic_point");
+
+	&mov	("esp",&DWP(48,"esp"));
+	&xor	("eax","eax");
+&function_end("${PREFIX}_set_decrypt_key");
+
+&function_begin("${PREFIX}_encrypt");
+	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+	&call	("_vpaes_preheat");
+&set_label("pic_point");
+	&mov	($inp,&wparam(0));		# inp
+	&lea	($base,&DWP(-56,"esp"));
+	&mov	($out,&wparam(1));		# out
+	&and	($base,-16);
+	&mov	($key,&wparam(2));		# key
+	&xchg	($base,"esp");			# alloca
+	&mov	(&DWP(48,"esp"),$base);
+
+	&movdqu	("xmm0",&QWP(0,$inp));
+	&call	("_vpaes_encrypt_core");
+	&movdqu	(&QWP(0,$out),"xmm0");
+
+	&mov	("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_encrypt");
+
+&function_begin("${PREFIX}_decrypt");
+	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+	&call	("_vpaes_preheat");
+&set_label("pic_point");
+	&mov	($inp,&wparam(0));		# inp
+	&lea	($base,&DWP(-56,"esp"));
+	&mov	($out,&wparam(1));		# out
+	&and	($base,-16);
+	&mov	($key,&wparam(2));		# key
+	&xchg	($base,"esp");			# alloca
+	&mov	(&DWP(48,"esp"),$base);
+
+	&movdqu	("xmm0",&QWP(0,$inp));
+	&call	("_vpaes_decrypt_core");
+	&movdqu	(&QWP(0,$out),"xmm0");
+
+	&mov	("esp",&DWP(48,"esp"));
+&function_end("${PREFIX}_decrypt");
+
+&function_begin("${PREFIX}_cbc_encrypt");
+	&mov	($inp,&wparam(0));		# inp
+	&mov	($out,&wparam(1));		# out
+	&mov	($round,&wparam(2));		# len
+	&mov	($key,&wparam(3));		# key
+	&sub	($round,16);
+	&jc	(&label("cbc_abort"));
+	&lea	($base,&DWP(-56,"esp"));
+	&mov	($const,&wparam(4));		# ivp
+	&and	($base,-16);
+	&mov	($magic,&wparam(5));		# enc
+	&xchg	($base,"esp");			# alloca
+	&movdqu	("xmm1",&QWP(0,$const));	# load IV
+	&sub	($out,$inp);
+	&mov	(&DWP(48,"esp"),$base);
+
+	&mov	(&DWP(0,"esp"),$out);		# save out
+	&mov	(&DWP(4,"esp"),$key)		# save key
+	&mov	(&DWP(8,"esp"),$const);		# save ivp
+	&mov	($out,$round);			# $out works as $len
+
+	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
+	&call	("_vpaes_preheat");
+&set_label("pic_point");
+	&cmp	($magic,0);
+	&je	(&label("cbc_dec_loop"));
+	&jmp	(&label("cbc_enc_loop"));
+
+&set_label("cbc_enc_loop",16);
+	&movdqu	("xmm0",&QWP(0,$inp));		# load input
+	&pxor	("xmm0","xmm1");		# inp^=iv
+	&call	("_vpaes_encrypt_core");
+	&mov	($base,&DWP(0,"esp"));		# restore out
+	&mov	($key,&DWP(4,"esp"));		# restore key
+	&movdqa	("xmm1","xmm0");
+	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
+	&lea	($inp,&DWP(16,$inp));
+	&sub	($out,16);
+	&jnc	(&label("cbc_enc_loop"));
+	&jmp	(&label("cbc_done"));
+
+&set_label("cbc_dec_loop",16);
+	&movdqu	("xmm0",&QWP(0,$inp));		# load input
+	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
+	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
+	&call	("_vpaes_decrypt_core");
+	&mov	($base,&DWP(0,"esp"));		# restore out
+	&mov	($key,&DWP(4,"esp"));		# restore key
+	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
+	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
+	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
+	&lea	($inp,&DWP(16,$inp));
+	&sub	($out,16);
+	&jnc	(&label("cbc_dec_loop"));
+
+&set_label("cbc_done");
+	&mov	($base,&DWP(8,"esp"));		# restore ivp
+	&mov	("esp",&DWP(48,"esp"));
+	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
+&set_label("cbc_abort");
+&function_end("${PREFIX}_cbc_encrypt");
+
+&asm_finish();
diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
new file mode 100644
index 0000000000..37998db5e1
--- /dev/null
+++ b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl
@@ -0,0 +1,1206 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Interface to OpenSSL as "almost" drop-in replacement for
+# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original nor does it make assumption
+# about its alignment...
+#
+# Performance summary. aes-x86_64.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86_64.pl column -
+# [also large-block CBC] encrypt/decrypt.
+#
+#		aes-x86_64.pl		vpaes-x86_64.pl
+#
+# Core 2(**)	30.5/43.7/14.3		21.8/25.7(***)
+# Nehalem	30.5/42.2/14.6		 9.8/11.8
+# Atom		63.9/79.0/32.1		64.0/84.8(***)
+#
+# (*)	"Hyper-threading" in the context refers rather to cache shared
+#	among multiple cores, than to specifically Intel HTT. As vast
+#	majority of contemporary cores share cache, slower code path
+#	is common place. In other words "with-hyper-threading-off"
+#	results are presented mostly for reference purposes.
+#
+# (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***)	Less impressive improvement on Core 2 and Atom is due to slow
+#	pshufb,	yet it's respectable +40%/78% improvement on Core 2
+#	(as implied, over "hyper-threading-safe" code path).
+#
+#						<appro@openssl.org>
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+$PREFIX="vpaes";
+
+$code.=<<___;
+.text
+
+##
+##  _aes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm9-%xmm15 as in _vpaes_preheat
+##    (%rdx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
+##  Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type	_vpaes_encrypt_core,\@abi-omnipotent
+.align 16
+_vpaes_encrypt_core:
+	mov	%rdx,	%r9
+	mov	\$16,	%r11
+	mov	240(%rdx),%eax
+	movdqa	%xmm9,	%xmm1
+	movdqa	.Lk_ipt(%rip), %xmm2	# iptlo
+	pandn	%xmm0,	%xmm1
+	movdqu	(%r9),	%xmm5		# round0 key
+	psrld	\$4,	%xmm1
+	pand	%xmm9,	%xmm0
+	pshufb	%xmm0,	%xmm2
+	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
+	pshufb	%xmm1,	%xmm0
+	pxor	%xmm5,	%xmm2
+	pxor	%xmm2,	%xmm0
+	add	\$16,	%r9
+	lea	.Lk_mc_backward(%rip),%r10
+	jmp	.Lenc_entry
+
+.align 16
+.Lenc_loop:
+	# middle of middle round
+	movdqa  %xmm13,	%xmm4	# 4 : sb1u
+	pshufb  %xmm2,	%xmm4	# 4 = sb1u
+	pxor	%xmm5,	%xmm4	# 4 = sb1u + k
+	movdqa  %xmm12,	%xmm0	# 0 : sb1t
+	pshufb  %xmm3,	%xmm0	# 0 = sb1t
+	pxor	%xmm4,	%xmm0	# 0 = A
+	movdqa  %xmm15,	%xmm5	# 4 : sb2u
+	pshufb	%xmm2,	%xmm5	# 4 = sb2u
+	movdqa	-0x40(%r11,%r10), %xmm1		# .Lk_mc_forward[]
+	movdqa	%xmm14, %xmm2	# 2 : sb2t
+	pshufb	%xmm3,  %xmm2	# 2 = sb2t
+	pxor	%xmm5,	%xmm2	# 2 = 2A
+	movdqa	(%r11,%r10), %xmm4		# .Lk_mc_backward[]
+	movdqa	%xmm0,  %xmm3	# 3 = A
+	pshufb  %xmm1,  %xmm0	# 0 = B
+	add	\$16,	%r9	# next key
+	pxor	%xmm2,  %xmm0	# 0 = 2A+B
+	pshufb	%xmm4,	%xmm3	# 3 = D
+	add	\$16,	%r11	# next mc
+	pxor	%xmm0,	%xmm3	# 3 = 2A+B+D
+	pshufb  %xmm1,	%xmm0	# 0 = 2B+C
+	and	\$0x30,	%r11	# ... mod 4
+	pxor	%xmm3,	%xmm0	# 0 = 2A+3B+C+D
+	sub	\$1,%rax	# nr--
+
+.Lenc_entry:
+	# top of round
+	movdqa  %xmm9, 	%xmm1	# 1 : i
+	pandn	%xmm0, 	%xmm1	# 1 = i<<4
+	psrld	\$4,   	%xmm1   # 1 = i
+	pand	%xmm9, 	%xmm0   # 0 = k
+	movdqa	%xmm11, %xmm5	# 2 : a/k
+	pshufb  %xmm0,  %xmm5	# 2 = a/k
+	pxor	%xmm1,	%xmm0	# 0 = j
+	movdqa	%xmm10,	%xmm3  	# 3 : 1/i
+	pshufb  %xmm1, 	%xmm3  	# 3 = 1/i
+	pxor	%xmm5, 	%xmm3  	# 3 = iak = 1/i + a/k
+	movdqa	%xmm10,	%xmm4  	# 4 : 1/j
+	pshufb	%xmm0, 	%xmm4  	# 4 = 1/j
+	pxor	%xmm5, 	%xmm4  	# 4 = jak = 1/j + a/k
+	movdqa	%xmm10,	%xmm2  	# 2 : 1/iak
+	pshufb  %xmm3,	%xmm2  	# 2 = 1/iak
+	pxor	%xmm0, 	%xmm2  	# 2 = io
+	movdqa	%xmm10, %xmm3   # 3 : 1/jak
+	movdqu	(%r9),	%xmm5
+	pshufb  %xmm4,  %xmm3   # 3 = 1/jak
+	pxor	%xmm1,  %xmm3   # 3 = jo
+	jnz	.Lenc_loop
+
+	# middle of last round
+	movdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+	movdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	pshufb  %xmm2,  %xmm4	# 4 = sbou
+	pxor	%xmm5,  %xmm4	# 4 = sb1u + k
+	pshufb  %xmm3,	%xmm0	# 0 = sb1t
+	movdqa	0x40(%r11,%r10), %xmm1		# .Lk_sr[]
+	pxor	%xmm4,	%xmm0	# 0 = A
+	pshufb	%xmm1,	%xmm0
+	ret
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+	
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
+.type	_vpaes_decrypt_core,\@abi-omnipotent
+.align	16
+_vpaes_decrypt_core:
+	mov	%rdx,	%r9		# load key
+	mov	240(%rdx),%eax
+	movdqa	%xmm9,	%xmm1
+	movdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	pandn	%xmm0,	%xmm1
+	mov	%rax,	%r11
+	psrld	\$4,	%xmm1
+	movdqu	(%r9),	%xmm5		# round0 key
+	shl	\$4,	%r11
+	pand	%xmm9,	%xmm0
+	pshufb	%xmm0,	%xmm2
+	movdqa	.Lk_dipt+16(%rip), %xmm0 # ipthi
+	xor	\$0x30,	%r11
+	lea	.Lk_dsbd(%rip),%r10
+	pshufb	%xmm1,	%xmm0
+	and	\$0x30,	%r11
+	pxor	%xmm5,	%xmm2
+	movdqa	.Lk_mc_forward+48(%rip), %xmm5
+	pxor	%xmm2,	%xmm0
+	add	\$16,	%r9
+	add	%r10,	%r11
+	jmp	.Ldec_entry
+
+.align 16
+.Ldec_loop:
+##
+##  Inverse mix columns
+##
+	movdqa  -0x20(%r10),%xmm4	# 4 : sb9u
+	pshufb	%xmm2,	%xmm4		# 4 = sb9u
+	pxor	%xmm0,	%xmm4
+	movdqa  -0x10(%r10),%xmm0	# 0 : sb9t
+	pshufb	%xmm3,	%xmm0		# 0 = sb9t
+	pxor	%xmm4,	%xmm0		# 0 = ch
+	add	\$16, %r9		# next round key
+
+	pshufb	%xmm5,	%xmm0		# MC ch
+	movdqa  0x00(%r10),%xmm4	# 4 : sbdu
+	pshufb	%xmm2,	%xmm4		# 4 = sbdu
+	pxor	%xmm0,	%xmm4		# 4 = ch
+	movdqa  0x10(%r10),%xmm0	# 0 : sbdt
+	pshufb	%xmm3,	%xmm0		# 0 = sbdt
+	pxor	%xmm4,	%xmm0		# 0 = ch
+	sub	\$1,%rax		# nr--
+	
+	pshufb	%xmm5,	%xmm0		# MC ch
+	movdqa  0x20(%r10),%xmm4	# 4 : sbbu
+	pshufb	%xmm2,	%xmm4		# 4 = sbbu
+	pxor	%xmm0,	%xmm4		# 4 = ch
+	movdqa  0x30(%r10),%xmm0	# 0 : sbbt
+	pshufb	%xmm3,	%xmm0		# 0 = sbbt
+	pxor	%xmm4,	%xmm0		# 0 = ch
+	
+	pshufb	%xmm5,	%xmm0		# MC ch
+	movdqa  0x40(%r10),%xmm4	# 4 : sbeu
+	pshufb	%xmm2,	%xmm4		# 4 = sbeu
+	pxor	%xmm0,	%xmm4		# 4 = ch
+	movdqa  0x50(%r10),%xmm0	# 0 : sbet
+	pshufb	%xmm3,	%xmm0		# 0 = sbet
+	pxor	%xmm4,	%xmm0		# 0 = ch
+
+	palignr	\$12,	%xmm5,	%xmm5
+	
+.Ldec_entry:
+	# top of round
+	movdqa  %xmm9, 	%xmm1	# 1 : i
+	pandn	%xmm0, 	%xmm1	# 1 = i<<4
+	psrld	\$4,    %xmm1	# 1 = i
+	pand	%xmm9, 	%xmm0	# 0 = k
+	movdqa	%xmm11, %xmm2	# 2 : a/k
+	pshufb  %xmm0,  %xmm2	# 2 = a/k
+	pxor	%xmm1,	%xmm0	# 0 = j
+	movdqa	%xmm10,	%xmm3	# 3 : 1/i
+	pshufb  %xmm1, 	%xmm3	# 3 = 1/i
+	pxor	%xmm2, 	%xmm3	# 3 = iak = 1/i + a/k
+	movdqa	%xmm10,	%xmm4	# 4 : 1/j
+	pshufb	%xmm0, 	%xmm4	# 4 = 1/j
+	pxor	%xmm2, 	%xmm4	# 4 = jak = 1/j + a/k
+	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
+	pshufb  %xmm3,	%xmm2	# 2 = 1/iak
+	pxor	%xmm0, 	%xmm2	# 2 = io
+	movdqa	%xmm10, %xmm3	# 3 : 1/jak
+	pshufb  %xmm4,  %xmm3	# 3 = 1/jak
+	pxor	%xmm1,  %xmm3	# 3 = jo
+	movdqu	(%r9),	%xmm0
+	jnz	.Ldec_loop
+
+	# middle of last round
+	movdqa	0x60(%r10), %xmm4	# 3 : sbou
+	pshufb  %xmm2,  %xmm4	# 4 = sbou
+	pxor	%xmm0,  %xmm4	# 4 = sb1u + k
+	movdqa	0x70(%r10), %xmm0	# 0 : sbot
+	movdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
+	pshufb  %xmm3,	%xmm0	# 0 = sb1t
+	pxor	%xmm4,	%xmm0	# 0 = A
+	pshufb	%xmm2,	%xmm0
+	ret
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+.type	_vpaes_schedule_core,\@abi-omnipotent
+.align	16
+_vpaes_schedule_core:
+	# rdi = key
+	# rsi = size in bits
+	# rdx = buffer
+	# rcx = direction.  0=encrypt, 1=decrypt
+
+	call	_vpaes_preheat		# load the tables
+	movdqa	.Lk_rcon(%rip), %xmm8	# load rcon
+	movdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	# input transform
+	movdqa	%xmm0,	%xmm3
+	lea	.Lk_ipt(%rip), %r11
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,	%xmm7
+
+	lea	.Lk_sr(%rip),%r10
+	test	%rcx,	%rcx
+	jnz	.Lschedule_am_decrypting
+
+	# encrypting, output zeroth round key after transform
+	movdqu	%xmm0,	(%rdx)
+	jmp	.Lschedule_go
+
+.Lschedule_am_decrypting:
+	# decrypting, output zeroth round key after shiftrows
+	movdqa	(%r8,%r10),%xmm1
+	pshufb  %xmm1,	%xmm3
+	movdqu	%xmm3,	(%rdx)
+	xor	\$0x30, %r8
+
+.Lschedule_go:
+	cmp	\$192,	%esi
+	ja	.Lschedule_256
+	je	.Lschedule_192
+	# 128: fall though
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+.Lschedule_128:
+	mov	\$10, %esi
+	
+.Loop_schedule_128:
+	call 	_vpaes_schedule_round
+	dec	%rsi
+	jz 	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	# write output
+	jmp 	.Loop_schedule_128
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
+.align	16
+.Lschedule_192:
+	movdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	call	_vpaes_schedule_transform	# input transform
+	movdqa	%xmm0,	%xmm6		# save short part
+	pxor	%xmm4,	%xmm4		# clear 4
+	movhlps	%xmm4,	%xmm6		# clobber low side with zeros
+	mov	\$4,	%esi
+
+.Loop_schedule_192:
+	call	_vpaes_schedule_round
+	palignr	\$8,%xmm6,%xmm0	
+	call	_vpaes_schedule_mangle	# save key n
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle	# save key n+1
+	call	_vpaes_schedule_round
+	dec	%rsi
+	jz 	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	# save key n+2
+	call	_vpaes_schedule_192_smear
+	jmp	.Loop_schedule_192
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
+.align	16
+.Lschedule_256:
+	movdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	call	_vpaes_schedule_transform	# input transform
+	mov	\$7, %esi
+	
+.Loop_schedule_256:
+	call	_vpaes_schedule_mangle	# output low result
+	movdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	# high round
+	call	_vpaes_schedule_round
+	dec	%rsi
+	jz 	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	
+
+	# low round. swap xmm7 and xmm6
+	pshufd	\$0xFF,	%xmm0,	%xmm0
+	movdqa	%xmm7,	%xmm5
+	movdqa	%xmm6,	%xmm7
+	call	_vpaes_schedule_low_round
+	movdqa	%xmm5,	%xmm7
+	
+	jmp	.Loop_schedule_256
+
+	
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+.align	16
+.Lschedule_mangle_last:
+	# schedule last round key from xmm0
+	lea	.Lk_deskew(%rip),%r11	# prepare to deskew
+	test	%rcx, 	%rcx
+	jnz	.Lschedule_mangle_last_dec
+
+	# encrypting
+	movdqa	(%r8,%r10),%xmm1
+	pshufb	%xmm1,	%xmm0		# output permute
+	lea	.Lk_opt(%rip),	%r11	# prepare to output transform
+	add	\$32,	%rdx
+
+.Lschedule_mangle_last_dec:
+	add	\$-16,	%rdx
+	pxor	.Lk_s63(%rip),	%xmm0
+	call	_vpaes_schedule_transform # output transform
+	movdqu	%xmm0,	(%rdx)		# save last key
+
+	# cleanup
+	pxor	%xmm0,  %xmm0
+	pxor	%xmm1,  %xmm1
+	pxor	%xmm2,  %xmm2
+	pxor	%xmm3,  %xmm3
+	pxor	%xmm4,  %xmm4
+	pxor	%xmm5,  %xmm5
+	pxor	%xmm6,  %xmm6
+	pxor	%xmm7,  %xmm7
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
+.type	_vpaes_schedule_192_smear,\@abi-omnipotent
+.align	16
+_vpaes_schedule_192_smear:
+	pshufd	\$0x80,	%xmm6,	%xmm0	# d c 0 0 -> c 0 0 0
+	pxor	%xmm0,	%xmm6		# -> c+d c 0 0
+	pshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	pxor	%xmm0,	%xmm6		# -> b+c+d b+c b a
+	movdqa	%xmm6,	%xmm0
+	pxor	%xmm1,	%xmm1
+	movhlps	%xmm1,	%xmm6		# clobber low side with zeros
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm4, %r11.
+##
+.type	_vpaes_schedule_round,\@abi-omnipotent
+.align	16
+_vpaes_schedule_round:
+	# extract rcon from xmm8
+	pxor	%xmm1,	%xmm1
+	palignr	\$15,	%xmm8,	%xmm1
+	palignr	\$15,	%xmm8,	%xmm8
+	pxor	%xmm1,	%xmm7
+
+	# rotate
+	pshufd	\$0xFF,	%xmm0,	%xmm0
+	palignr	\$1,	%xmm0,	%xmm0
+	
+	# fall through...
+	
+	# low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:
+	# smear xmm7
+	movdqa	%xmm7,	%xmm1
+	pslldq	\$4,	%xmm7
+	pxor	%xmm1,	%xmm7
+	movdqa	%xmm7,	%xmm1
+	pslldq	\$8,	%xmm7
+	pxor	%xmm1,	%xmm7
+	pxor	.Lk_s63(%rip), %xmm7
+
+	# subbytes
+	movdqa  %xmm9, 	%xmm1
+	pandn	%xmm0, 	%xmm1
+	psrld	\$4,    %xmm1		# 1 = i
+	pand	%xmm9, 	%xmm0		# 0 = k
+	movdqa	%xmm11, %xmm2		# 2 : a/k
+	pshufb  %xmm0,  %xmm2		# 2 = a/k
+	pxor	%xmm1,	%xmm0		# 0 = j
+	movdqa	%xmm10,	%xmm3		# 3 : 1/i
+	pshufb  %xmm1, 	%xmm3		# 3 = 1/i
+	pxor	%xmm2, 	%xmm3		# 3 = iak = 1/i + a/k
+	movdqa	%xmm10,	%xmm4		# 4 : 1/j
+	pshufb	%xmm0, 	%xmm4		# 4 = 1/j
+	pxor	%xmm2, 	%xmm4		# 4 = jak = 1/j + a/k
+	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
+	pshufb  %xmm3,	%xmm2		# 2 = 1/iak
+	pxor	%xmm0, 	%xmm2		# 2 = io
+	movdqa	%xmm10, %xmm3		# 3 : 1/jak
+	pshufb  %xmm4,  %xmm3		# 3 = 1/jak
+	pxor	%xmm1,  %xmm3		# 3 = jo
+	movdqa	%xmm13, %xmm4		# 4 : sbou
+	pshufb  %xmm2,  %xmm4		# 4 = sbou
+	movdqa	%xmm12, %xmm0		# 0 : sbot
+	pshufb  %xmm3,	%xmm0		# 0 = sb1t
+	pxor	%xmm4, 	%xmm0		# 0 = sbox output
+
+	# add in smeared stuff
+	pxor	%xmm7,	%xmm0	
+	movdqa	%xmm0,	%xmm7
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%r11)
+##
+##  Requires that %xmm9 = 0x0F0F... as in preheat
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
+.type	_vpaes_schedule_transform,\@abi-omnipotent
+.align	16
+_vpaes_schedule_transform:
+	movdqa	%xmm9,	%xmm1
+	pandn	%xmm0,	%xmm1
+	psrld	\$4,	%xmm1
+	pand	%xmm9,	%xmm0
+	movdqa	(%r11), %xmm2 	# lo
+	pshufb	%xmm0,	%xmm2
+	movdqa	16(%r11), %xmm0 # hi
+	pshufb	%xmm1,	%xmm0
+	pxor	%xmm2,	%xmm0
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%rdx), and increments or decrements it
+##  Keeps track of round number mod 4 in %r8
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
+.type	_vpaes_schedule_mangle,\@abi-omnipotent
+.align	16
+_vpaes_schedule_mangle:
+	movdqa	%xmm0,	%xmm4	# save xmm0 for later
+	movdqa	.Lk_mc_forward(%rip),%xmm5
+	test	%rcx, 	%rcx
+	jnz	.Lschedule_mangle_dec
+
+	# encrypting
+	add	\$16,	%rdx
+	pxor	.Lk_s63(%rip),%xmm4
+	pshufb	%xmm5,	%xmm4
+	movdqa	%xmm4,	%xmm3
+	pshufb	%xmm5,	%xmm4
+	pxor	%xmm4,	%xmm3
+	pshufb	%xmm5,	%xmm4
+	pxor	%xmm4,	%xmm3
+
+	jmp	.Lschedule_mangle_both
+.align	16
+.Lschedule_mangle_dec:
+	# inverse mix columns
+	lea	.Lk_dksd(%rip),%r11
+	movdqa	%xmm9,	%xmm1
+	pandn	%xmm4,	%xmm1
+	psrld	\$4,	%xmm1	# 1 = hi
+	pand	%xmm9,	%xmm4	# 4 = lo
+
+	movdqa	0x00(%r11), %xmm2
+	pshufb	%xmm4,	%xmm2
+	movdqa	0x10(%r11), %xmm3
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+	pshufb	%xmm5,	%xmm3
+
+	movdqa	0x20(%r11), %xmm2
+	pshufb	%xmm4,	%xmm2
+	pxor	%xmm3,	%xmm2
+	movdqa	0x30(%r11), %xmm3
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+	pshufb	%xmm5,	%xmm3
+
+	movdqa	0x40(%r11), %xmm2
+	pshufb	%xmm4,	%xmm2
+	pxor	%xmm3,	%xmm2
+	movdqa	0x50(%r11), %xmm3
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+	pshufb	%xmm5,	%xmm3
+
+	movdqa	0x60(%r11), %xmm2
+	pshufb	%xmm4,	%xmm2
+	pxor	%xmm3,	%xmm2
+	movdqa	0x70(%r11), %xmm3
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+
+	add	\$-16,	%rdx
+
+.Lschedule_mangle_both:
+	movdqa	(%r8,%r10),%xmm1
+	pshufb	%xmm1,%xmm3
+	add	\$-16,	%r8
+	and	\$0x30,	%r8
+	movdqu	%xmm3,	(%rdx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+#
+# Interface to OpenSSL
+#
+.globl	${PREFIX}_set_encrypt_key
+.type	${PREFIX}_set_encrypt_key,\@function,3
+.align	16
+${PREFIX}_set_encrypt_key:
+___
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Lenc_key_body:
+___
+$code.=<<___;
+	mov	%esi,%eax
+	shr	\$5,%eax
+	add	\$5,%eax
+	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	\$0,%ecx
+	mov	\$0x30,%r8d
+	call	_vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Lenc_key_epilogue:
+___
+$code.=<<___;
+	xor	%eax,%eax
+	ret
+.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+
+.globl	${PREFIX}_set_decrypt_key
+.type	${PREFIX}_set_decrypt_key,\@function,3
+.align	16
+${PREFIX}_set_decrypt_key:
+___
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Ldec_key_body:
+___
+$code.=<<___;
+	mov	%esi,%eax
+	shr	\$5,%eax
+	add	\$5,%eax
+	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	shl	\$4,%eax
+	lea	16(%rdx,%rax),%rdx
+
+	mov	\$1,%ecx
+	mov	%esi,%r8d
+	shr	\$1,%r8d
+	and	\$32,%r8d
+	xor	\$32,%r8d	# nbits==192?0:32
+	call	_vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Ldec_key_epilogue:
+___
+$code.=<<___;
+	xor	%eax,%eax
+	ret
+.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+
+.globl	${PREFIX}_encrypt
+.type	${PREFIX}_encrypt,\@function,3
+.align	16
+${PREFIX}_encrypt:
+___
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Lenc_body:
+___
+$code.=<<___;
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_encrypt_core
+	movdqu	%xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Lenc_epilogue:
+___
+$code.=<<___;
+	ret
+.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl	${PREFIX}_decrypt
+.type	${PREFIX}_decrypt,\@function,3
+.align	16
+${PREFIX}_decrypt:
+___
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Ldec_body:
+___
+$code.=<<___;
+	movdqu	(%rdi),%xmm0
+	call	_vpaes_preheat
+	call	_vpaes_decrypt_core
+	movdqu	%xmm0,(%rsi)
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Ldec_epilogue:
+___
+$code.=<<___;
+	ret
+.size	${PREFIX}_decrypt,.-${PREFIX}_decrypt
+___
+{
+my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# void AES_cbc_encrypt (const void char *inp, unsigned char *out,
+#                       size_t length, const AES_KEY *key,
+#                       unsigned char *ivp,const int enc);
+$code.=<<___;
+.globl	${PREFIX}_cbc_encrypt
+.type	${PREFIX}_cbc_encrypt,\@function,6
+.align	16
+${PREFIX}_cbc_encrypt:
+	xchg	$key,$len
+___
+($len,$key)=($key,$len);
+$code.=<<___;
+	sub	\$16,$len
+	jc	.Lcbc_abort
+___
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Lcbc_body:
+___
+$code.=<<___;
+	movdqu	($ivp),%xmm6		# load IV
+	sub	$inp,$out
+	call	_vpaes_preheat
+	cmp	\$0,${enc}d
+	je	.Lcbc_dec_loop
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movdqu	($inp),%xmm0
+	pxor	%xmm6,%xmm0
+	call	_vpaes_encrypt_core
+	movdqa	%xmm0,%xmm6
+	movdqu	%xmm0,($out,$inp)
+	lea	16($inp),$inp
+	sub	\$16,$len
+	jnc	.Lcbc_enc_loop
+	jmp	.Lcbc_done
+.align	16
+.Lcbc_dec_loop:
+	movdqu	($inp),%xmm0
+	movdqa	%xmm0,%xmm7
+	call	_vpaes_decrypt_core
+	pxor	%xmm6,%xmm0
+	movdqa	%xmm7,%xmm6
+	movdqu	%xmm0,($out,$inp)
+	lea	16($inp),$inp
+	sub	\$16,$len
+	jnc	.Lcbc_dec_loop
+.Lcbc_done:
+	movdqu	%xmm6,($ivp)		# save IV
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Lcbc_epilogue:
+___
+$code.=<<___;
+.Lcbc_abort:
+	ret
+.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
+$code.=<<___;
+##
+##  _aes_preheat
+##
+##  Fills register %r10 -> .aes_consts (so you can -fPIC)
+##  and %xmm9-%xmm15 as specified below.
+##
+.type	_vpaes_preheat,\@abi-omnipotent
+.align	16
+_vpaes_preheat:
+	lea	.Lk_s0F(%rip), %r10
+	movdqa	-0x20(%r10), %xmm10	# .Lk_inv
+	movdqa	-0x10(%r10), %xmm11	# .Lk_inv+16
+	movdqa	0x00(%r10), %xmm9	# .Lk_s0F
+	movdqa	0x30(%r10), %xmm13	# .Lk_sb1
+	movdqa	0x40(%r10), %xmm12	# .Lk_sb1+16
+	movdqa	0x50(%r10), %xmm15	# .Lk_sb2
+	movdqa	0x60(%r10), %xmm14	# .Lk_sb2+16
+	ret
+.size	_vpaes_preheat,.-_vpaes_preheat
+########################################################
+##                                                    ##
+##                     Constants                      ##
+##                                                    ##
+########################################################
+.type	_vpaes_consts,\@object
+.align	64
+_vpaes_consts:
+.Lk_inv:	# inv, inva
+	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
+	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:	# s0F
+	.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:	# input transform (lo, hi)
+	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
+	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:	# sb1u, sb1t
+	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:	# sb2u, sb2t
+	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
+	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:	# sbou, sbot
+	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
+	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:	# mc_forward
+	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
+	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
+	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
+	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:# mc_backward
+	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
+	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
+	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
+	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:		# sr
+	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
+	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
+	.quad	0x0F060D040B020900, 0x070E050C030A0108
+	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:	# rcon
+	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:	# s63: all equal to 0x63 transformed
+	.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:	# output transform
+	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
+	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:	# deskew tables: inverts the sbox's "skew"
+	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+##
+##  Decryption stuff
+##  Key schedule constants
+##
+.Lk_dksd:	# decryption key schedule: invskew x*D
+	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:	# decryption key schedule: invskew x*B
+	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
+	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
+	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
+	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:	# decryption key schedule: invskew x*9
+	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
+	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+##
+##  Decryption stuff
+##  Round function constants
+##
+.Lk_dipt:	# decryption input transform
+	.quad	0x0F505B040B545F00, 0x154A411E114E451A
+	.quad	0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:	# decryption sbox output *9*u, *9*t
+	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
+	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:	# decryption sbox output *D*u, *D*t
+	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:	# decryption sbox output *B*u, *B*t
+	.quad	0xD022649296B44200, 0x602646F6B0F2D404
+	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:	# decryption sbox output *E*u, *E*t
+	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
+	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:	# decryption sbox final output
+	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.asciz	"Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.align	64
+.size	_vpaes_consts,.-_vpaes_consts
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+
+	lea	16(%rax),%rsi		# %xmm save area
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0xb8(%rax),%rax		# adjust stack pointer
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_${PREFIX}_set_encrypt_key
+	.rva	.LSEH_end_${PREFIX}_set_encrypt_key
+	.rva	.LSEH_info_${PREFIX}_set_encrypt_key
+
+	.rva	.LSEH_begin_${PREFIX}_set_decrypt_key
+	.rva	.LSEH_end_${PREFIX}_set_decrypt_key
+	.rva	.LSEH_info_${PREFIX}_set_decrypt_key
+
+	.rva	.LSEH_begin_${PREFIX}_encrypt
+	.rva	.LSEH_end_${PREFIX}_encrypt
+	.rva	.LSEH_info_${PREFIX}_encrypt
+
+	.rva	.LSEH_begin_${PREFIX}_decrypt
+	.rva	.LSEH_end_${PREFIX}_decrypt
+	.rva	.LSEH_info_${PREFIX}_decrypt
+
+	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
+	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
+	.rva	.LSEH_info_${PREFIX}_cbc_encrypt
+
+.section	.xdata
+.align	8
+.LSEH_info_${PREFIX}_set_encrypt_key:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lenc_key_body,.Lenc_key_epilogue	# HandlerData[]
+.LSEH_info_${PREFIX}_set_decrypt_key:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Ldec_key_body,.Ldec_key_epilogue	# HandlerData[]
+.LSEH_info_${PREFIX}_encrypt:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lenc_body,.Lenc_epilogue		# HandlerData[]
+.LSEH_info_${PREFIX}_decrypt:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Ldec_body,.Ldec_epilogue		# HandlerData[]
+.LSEH_info_${PREFIX}_cbc_encrypt:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lcbc_body,.Lcbc_epilogue		# HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/src/lib/libcrypto/arm_arch.h b/src/lib/libcrypto/arm_arch.h
new file mode 100644
index 0000000000..5a83107680
--- /dev/null
+++ b/src/lib/libcrypto/arm_arch.h
@@ -0,0 +1,51 @@
+#ifndef __ARM_ARCH_H__
+#define __ARM_ARCH_H__
+
+#if !defined(__ARM_ARCH__)
+# if defined(__CC_ARM)
+#  define __ARM_ARCH__ __TARGET_ARCH_ARM
+#  if defined(__BIG_ENDIAN)
+#   define __ARMEB__
+#  else
+#   define __ARMEL__
+#  endif
+# elif defined(__GNUC__)
+  /*
+   * Why doesn't gcc define __ARM_ARCH__? Instead it defines
+   * bunch of below macros. See all_architectires[] table in
+   * gcc/config/arm/arm.c. On a side note it defines
+   * __ARMEL__/__ARMEB__ for little-/big-endian.
+   */
+#  if	defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__)	|| \
+	defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__)	|| \
+	defined(__ARM_ARCH_7EM__)
+#   define __ARM_ARCH__ 7
+#  elif	defined(__ARM_ARCH_6__)	|| defined(__ARM_ARCH_6J__)	|| \
+	defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__)	|| \
+	defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__)	|| \
+	defined(__ARM_ARCH_6T2__)
+#   define __ARM_ARCH__ 6
+#  elif	defined(__ARM_ARCH_5__)	|| defined(__ARM_ARCH_5T__)	|| \
+	defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__)	|| \
+	defined(__ARM_ARCH_5TEJ__)
+#   define __ARM_ARCH__ 5
+#  elif	defined(__ARM_ARCH_4__)	|| defined(__ARM_ARCH_4T__)
+#   define __ARM_ARCH__ 4
+#  else
+#   error "unsupported ARM architecture"
+#  endif
+# endif
+#endif
+
+#ifdef OPENSSL_FIPSCANISTER
+#include <openssl/fipssyms.h>
+#endif
+
+#if !__ASSEMBLER__
+extern unsigned int OPENSSL_armcap_P;
+                                     
+#define ARMV7_NEON      (1<<0)
+#define ARMV7_TICK      (1<<1)
+#endif
+
+#endif
diff --git a/src/lib/libcrypto/armcap.c b/src/lib/libcrypto/armcap.c
new file mode 100644
index 0000000000..5258d2fbdd
--- /dev/null
+++ b/src/lib/libcrypto/armcap.c
@@ -0,0 +1,80 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <crypto.h>
+
+#include "arm_arch.h"
+
+unsigned int OPENSSL_armcap_P;
+
+static sigset_t all_masked;
+
+static sigjmp_buf ill_jmp;
+static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
+
+/*
+ * Following subroutines could have been inlined, but it's not all
+ * ARM compilers support inline assembler...
+ */
+void _armv7_neon_probe(void);
+unsigned int _armv7_tick(void);
+
+unsigned int OPENSSL_rdtsc(void)
+	{
+	if (OPENSSL_armcap_P|ARMV7_TICK)
+		return _armv7_tick();
+	else
+		return 0;
+	}
+
+#if defined(__GNUC__) && __GNUC__>=2
+void OPENSSL_cpuid_setup(void) __attribute__((constructor));
+#endif
+void OPENSSL_cpuid_setup(void)
+	{
+	char *e;
+	struct sigaction	ill_oact,ill_act;
+	sigset_t		oset;
+	static int trigger=0;
+
+	if (trigger) return;
+	trigger=1;
+ 
+	if ((e=getenv("OPENSSL_armcap")))
+		{
+		OPENSSL_armcap_P=strtoul(e,NULL,0);
+		return;
+		}
+
+	sigfillset(&all_masked);
+	sigdelset(&all_masked,SIGILL);
+	sigdelset(&all_masked,SIGTRAP);
+	sigdelset(&all_masked,SIGFPE);
+	sigdelset(&all_masked,SIGBUS);
+	sigdelset(&all_masked,SIGSEGV);
+
+	OPENSSL_armcap_P = 0;
+
+	memset(&ill_act,0,sizeof(ill_act));
+	ill_act.sa_handler = ill_handler;
+	ill_act.sa_mask    = all_masked;
+
+	sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
+	sigaction(SIGILL,&ill_act,&ill_oact);
+
+	if (sigsetjmp(ill_jmp,1) == 0)
+		{
+		_armv7_neon_probe();
+		OPENSSL_armcap_P |= ARMV7_NEON;
+		}
+	if (sigsetjmp(ill_jmp,1) == 0)
+		{
+		_armv7_tick();
+		OPENSSL_armcap_P |= ARMV7_TICK;
+		}
+
+	sigaction (SIGILL,&ill_oact,NULL);
+	sigprocmask(SIG_SETMASK,&oset,NULL);
+	}
diff --git a/src/lib/libcrypto/armv4cpuid.S b/src/lib/libcrypto/armv4cpuid.S
new file mode 100644
index 0000000000..2d618deaa4
--- /dev/null
+++ b/src/lib/libcrypto/armv4cpuid.S
@@ -0,0 +1,154 @@
+#include "arm_arch.h"
+
+.text
+.code	32
+
+.align	5
+.global	_armv7_neon_probe
+.type	_armv7_neon_probe,%function
+_armv7_neon_probe:
+	.word	0xf26ee1fe	@ vorr	q15,q15,q15
+	.word	0xe12fff1e	@ bx	lr
+.size	_armv7_neon_probe,.-_armv7_neon_probe
+
+.global	_armv7_tick
+.type	_armv7_tick,%function
+_armv7_tick:
+	mrc	p15,0,r0,c9,c13,0
+	.word	0xe12fff1e	@ bx	lr
+.size	_armv7_tick,.-_armv7_tick
+
+.global	OPENSSL_atomic_add
+.type	OPENSSL_atomic_add,%function
+OPENSSL_atomic_add:
+#if __ARM_ARCH__>=6
+.Ladd:	ldrex	r2,[r0]
+	add	r3,r2,r1
+	strex	r2,r3,[r0]
+	cmp	r2,#0
+	bne	.Ladd
+	mov	r0,r3
+	.word	0xe12fff1e	@ bx	lr
+#else
+	stmdb	sp!,{r4-r6,lr}
+	ldr	r2,.Lspinlock
+	adr	r3,.Lspinlock
+	mov	r4,r0
+	mov	r5,r1
+	add	r6,r3,r2	@ &spinlock
+	b	.+8
+.Lspin:	bl	sched_yield
+	mov	r0,#-1
+	swp	r0,r0,[r6]
+	cmp	r0,#0
+	bne	.Lspin
+
+	ldr	r2,[r4]
+	add	r2,r2,r5
+	str	r2,[r4]
+	str	r0,[r6]		@ release spinlock
+	ldmia	sp!,{r4-r6,lr}
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+#endif
+.size	OPENSSL_atomic_add,.-OPENSSL_atomic_add
+
+.global	OPENSSL_cleanse
+.type	OPENSSL_cleanse,%function
+OPENSSL_cleanse:
+	eor	ip,ip,ip
+	cmp	r1,#7
+	subhs	r1,r1,#4
+	bhs	.Lot
+	cmp	r1,#0
+	beq	.Lcleanse_done
+.Little:
+	strb	ip,[r0],#1
+	subs	r1,r1,#1
+	bhi	.Little
+	b	.Lcleanse_done
+
+.Lot:	tst	r0,#3
+	beq	.Laligned
+	strb	ip,[r0],#1
+	sub	r1,r1,#1
+	b	.Lot
+.Laligned:
+	str	ip,[r0],#4
+	subs	r1,r1,#4
+	bhs	.Laligned
+	adds	r1,r1,#4
+	bne	.Little
+.Lcleanse_done:
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_cleanse,.-OPENSSL_cleanse
+
+.global	OPENSSL_wipe_cpu
+.type	OPENSSL_wipe_cpu,%function
+OPENSSL_wipe_cpu:
+	ldr	r0,.LOPENSSL_armcap
+	adr	r1,.LOPENSSL_armcap
+	ldr	r0,[r1,r0]
+	eor	r2,r2,r2
+	eor	r3,r3,r3
+	eor	ip,ip,ip
+	tst	r0,#1
+	beq	.Lwipe_done
+	.word	0xf3000150	@ veor    q0, q0, q0
+	.word	0xf3022152	@ veor    q1, q1, q1
+	.word	0xf3044154	@ veor    q2, q2, q2
+	.word	0xf3066156	@ veor    q3, q3, q3
+	.word	0xf34001f0	@ veor    q8, q8, q8
+	.word	0xf34221f2	@ veor    q9, q9, q9
+	.word	0xf34441f4	@ veor    q10, q10, q10
+	.word	0xf34661f6	@ veor    q11, q11, q11
+	.word	0xf34881f8	@ veor    q12, q12, q12
+	.word	0xf34aa1fa	@ veor    q13, q13, q13
+	.word	0xf34cc1fc	@ veor    q14, q14, q14
+	.word	0xf34ee1fe	@ veor    q15, q15, q15
+.Lwipe_done:
+	mov	r0,sp
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
+
+.global	OPENSSL_instrument_bus
+.type	OPENSSL_instrument_bus,%function
+OPENSSL_instrument_bus:
+	eor	r0,r0,r0
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
+
+.global	OPENSSL_instrument_bus2
+.type	OPENSSL_instrument_bus2,%function
+OPENSSL_instrument_bus2:
+	eor	r0,r0,r0
+	tst	lr,#1
+	moveq	pc,lr
+	.word	0xe12fff1e	@ bx	lr
+.size	OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
+
+.align	5
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-.LOPENSSL_armcap
+#if __ARM_ARCH__>=6
+.align	5
+#else
+.Lspinlock:
+.word	atomic_add_spinlock-.Lspinlock
+.align	5
+
+.data
+.align	2
+atomic_add_spinlock:
+.word	0
+#endif
+
+.comm	OPENSSL_armcap_P,4,4
+.hidden	OPENSSL_armcap_P
diff --git a/src/lib/libcrypto/asn1/a_d2i_fp.c b/src/lib/libcrypto/asn1/a_d2i_fp.c
index ece40bc4c0..52b2ebdb63 100644
--- a/src/lib/libcrypto/asn1/a_d2i_fp.c
+++ b/src/lib/libcrypto/asn1/a_d2i_fp.c
@@ -57,6 +57,7 @@
  */
 
 #include <stdio.h>
+#include <limits.h>
 #include "cryptlib.h"
 #include <openssl/buffer.h>
 #include <openssl/asn1_mac.h>
@@ -143,17 +144,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
 	BUF_MEM *b;
 	unsigned char *p;
 	int i;
-	int ret=-1;
 	ASN1_const_CTX c;
-	int want=HEADER_SIZE;
+	size_t want=HEADER_SIZE;
 	int eos=0;
-#if defined(__GNUC__) && defined(__ia64)
-	/* pathetic compiler bug in all known versions as of Nov. 2002 */
-	long off=0;
-#else
-	int off=0;
-#endif
-	int len=0;
+	size_t off=0;
+	size_t len=0;
 
 	b=BUF_MEM_new();
 	if (b == NULL)
@@ -169,7 +164,7 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
 			{
 			want-=(len-off);
 
-			if (!BUF_MEM_grow_clean(b,len+want))
+			if (len + want < len || !BUF_MEM_grow_clean(b,len+want))
 				{
 				ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE);
 				goto err;
@@ -181,7 +176,14 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
 				goto err;
 				}
 			if (i > 0)
+				{
+				if (len+i < len)
+					{
+					ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+					goto err;
+					}
 				len+=i;
+				}
 			}
 		/* else data already loaded */
 
@@ -206,6 +208,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
 			{
 			/* no data body so go round again */
 			eos++;
+			if (eos < 0)
+				{
+				ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_HEADER_TOO_LONG);
+				goto err;
+				}
 			want=HEADER_SIZE;
 			}
 		else if (eos && (c.slen == 0) && (c.tag == V_ASN1_EOC))
@@ -220,10 +227,16 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
 		else 
 			{
 			/* suck in c.slen bytes of data */
-			want=(int)c.slen;
+			want=c.slen;
 			if (want > (len-off))
 				{
 				want-=(len-off);
+				if (want > INT_MAX /* BIO_read takes an int length */ ||
+					len+want < len)
+						{
+						ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+						goto err;
+						}
 				if (!BUF_MEM_grow_clean(b,len+want))
 					{
 					ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE);
@@ -238,11 +251,18 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
 						    ASN1_R_NOT_ENOUGH_DATA);
 						goto err;
 						}
+					/* This can't overflow because
+					 * |len+want| didn't overflow. */
 					len+=i;
-					want -= i;
+					want-=i;
 					}
 				}
-			off+=(int)c.slen;
+			if (off + c.slen < off)
+				{
+				ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+				goto err;
+				}
+			off+=c.slen;
 			if (eos <= 0)
 				{
 				break;
@@ -252,9 +272,15 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb)
 			}
 		}
 
+	if (off > INT_MAX)
+		{
+		ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG);
+		goto err;
+		}
+
 	*pb = b;
 	return off;
 err:
 	if (b != NULL) BUF_MEM_free(b);
-	return(ret);
+	return -1;
 	}
diff --git a/src/lib/libcrypto/asn1/a_digest.c b/src/lib/libcrypto/asn1/a_digest.c
index d00d9e22b1..cbdeea6ac0 100644
--- a/src/lib/libcrypto/asn1/a_digest.c
+++ b/src/lib/libcrypto/asn1/a_digest.c
@@ -87,7 +87,8 @@ int ASN1_digest(i2d_of_void *i2d, const EVP_MD *type, char *data,
 	p=str;
 	i2d(data,&p);
 
-	EVP_Digest(str, i, md, len, type, NULL);
+	if (!EVP_Digest(str, i, md, len, type, NULL))
+		return 0;
 	OPENSSL_free(str);
 	return(1);
 	}
@@ -104,7 +105,8 @@ int ASN1_item_digest(const ASN1_ITEM *it, const EVP_MD *type, void *asn,
 	i=ASN1_item_i2d(asn,&str, it);
 	if (!str) return(0);
 
-	EVP_Digest(str, i, md, len, type, NULL);
+	if (!EVP_Digest(str, i, md, len, type, NULL))
+		return 0;
 	OPENSSL_free(str);
 	return(1);
 	}
diff --git a/src/lib/libcrypto/asn1/a_int.c b/src/lib/libcrypto/asn1/a_int.c
index 3348b8762c..ad0d2506f6 100644
--- a/src/lib/libcrypto/asn1/a_int.c
+++ b/src/lib/libcrypto/asn1/a_int.c
@@ -386,8 +386,8 @@ long ASN1_INTEGER_get(const ASN1_INTEGER *a)
 	
 	if (a->length > (int)sizeof(long))
 		{
-		/* hmm... a bit ugly */
-		return(0xffffffffL);
+		/* hmm... a bit ugly, return all ones */
+		return -1;
 		}
 	if (a->data == NULL)
 		return 0;
diff --git a/src/lib/libcrypto/asn1/a_sign.c b/src/lib/libcrypto/asn1/a_sign.c
index ff63bfc7be..7b4a193d6b 100644
--- a/src/lib/libcrypto/asn1/a_sign.c
+++ b/src/lib/libcrypto/asn1/a_sign.c
@@ -184,9 +184,9 @@ int ASN1_sign(i2d_of_void *i2d, X509_ALGOR *algor1, X509_ALGOR *algor2,
 	p=buf_in;
 
 	i2d(data,&p);
-	EVP_SignInit_ex(&ctx,type, NULL);
-	EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
-	if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
+	if (!EVP_SignInit_ex(&ctx,type, NULL)
+		|| !EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl)
+		|| !EVP_SignFinal(&ctx,(unsigned char *)buf_out,
 			(unsigned int *)&outl,pkey))
 		{
 		outl=0;
@@ -218,65 +218,100 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
 	     const EVP_MD *type)
 	{
 	EVP_MD_CTX ctx;
+	EVP_MD_CTX_init(&ctx);
+	if (!EVP_DigestSignInit(&ctx, NULL, type, NULL, pkey))
+		{
+		EVP_MD_CTX_cleanup(&ctx);
+		return 0;
+		}
+	return ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, &ctx);
+	}
+		
+
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+		X509_ALGOR *algor1, X509_ALGOR *algor2,
+	     	ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx)
+	{
+	const EVP_MD *type;
+	EVP_PKEY *pkey;
 	unsigned char *buf_in=NULL,*buf_out=NULL;
-	int inl=0,outl=0,outll=0;
+	size_t inl=0,outl=0,outll=0;
 	int signid, paramtype;
+	int rv;
+
+	type = EVP_MD_CTX_md(ctx);
+	pkey = EVP_PKEY_CTX_get0_pkey(ctx->pctx);
 
-	if (type == NULL)
+	if (!type || !pkey)
 		{
-		int def_nid;
-		if (EVP_PKEY_get_default_digest_nid(pkey, &def_nid) > 0)
-			type = EVP_get_digestbynid(def_nid);
+		ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ASN1_R_CONTEXT_NOT_INITIALISED);
+		return 0;
 		}
 
-	if (type == NULL)
+	if (pkey->ameth->item_sign)
 		{
-		ASN1err(ASN1_F_ASN1_ITEM_SIGN, ASN1_R_NO_DEFAULT_DIGEST);
-		return 0;
+		rv = pkey->ameth->item_sign(ctx, it, asn, algor1, algor2,
+						signature);
+		if (rv == 1)
+			outl = signature->length;
+		/* Return value meanings:
+		 * <=0: error.
+		 *   1: method does everything.
+		 *   2: carry on as normal.
+		 *   3: ASN1 method sets algorithm identifiers: just sign.
+		 */
+		if (rv <= 0)
+			ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ERR_R_EVP_LIB);
+		if (rv <= 1)
+			goto err;
 		}
+	else
+		rv = 2;
 
-	if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
+	if (rv == 2)
 		{
-		if (!pkey->ameth ||
-			!OBJ_find_sigid_by_algs(&signid, EVP_MD_nid(type),
-						pkey->ameth->pkey_id))
+		if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
 			{
-			ASN1err(ASN1_F_ASN1_ITEM_SIGN,
-				ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
-			return 0;
+			if (!pkey->ameth ||
+				!OBJ_find_sigid_by_algs(&signid,
+							EVP_MD_nid(type),
+							pkey->ameth->pkey_id))
+				{
+				ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,
+					ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED);
+				return 0;
+				}
 			}
-		}
-	else
-		signid = type->pkey_type;
+		else
+			signid = type->pkey_type;
 
-	if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
-		paramtype = V_ASN1_NULL;
-	else
-		paramtype = V_ASN1_UNDEF;
+		if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL)
+			paramtype = V_ASN1_NULL;
+		else
+			paramtype = V_ASN1_UNDEF;
 
-	if (algor1)
-		X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
-	if (algor2)
-		X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+		if (algor1)
+			X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL);
+		if (algor2)
+			X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL);
+
+		}
 
-	EVP_MD_CTX_init(&ctx);
 	inl=ASN1_item_i2d(asn,&buf_in, it);
 	outll=outl=EVP_PKEY_size(pkey);
-	buf_out=(unsigned char *)OPENSSL_malloc((unsigned int)outl);
+	buf_out=OPENSSL_malloc((unsigned int)outl);
 	if ((buf_in == NULL) || (buf_out == NULL))
 		{
 		outl=0;
-		ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_MALLOC_FAILURE);
+		ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_MALLOC_FAILURE);
 		goto err;
 		}
 
-	EVP_SignInit_ex(&ctx,type, NULL);
-	EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl);
-	if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out,
-			(unsigned int *)&outl,pkey))
+	if (!EVP_DigestSignUpdate(ctx, buf_in, inl)
+		|| !EVP_DigestSignFinal(ctx, buf_out, &outl))
 		{
 		outl=0;
-		ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_EVP_LIB);
+		ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_EVP_LIB);
 		goto err;
 		}
 	if (signature->data != NULL) OPENSSL_free(signature->data);
@@ -289,7 +324,7 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
 	signature->flags&= ~(ASN1_STRING_FLAG_BITS_LEFT|0x07);
 	signature->flags|=ASN1_STRING_FLAG_BITS_LEFT;
 err:
-	EVP_MD_CTX_cleanup(&ctx);
+	EVP_MD_CTX_cleanup(ctx);
 	if (buf_in != NULL)
 		{ OPENSSL_cleanse((char *)buf_in,(unsigned int)inl); OPENSSL_free(buf_in); }
 	if (buf_out != NULL)
diff --git a/src/lib/libcrypto/asn1/a_verify.c b/src/lib/libcrypto/asn1/a_verify.c
index cecdb13c70..432722e409 100644
--- a/src/lib/libcrypto/asn1/a_verify.c
+++ b/src/lib/libcrypto/asn1/a_verify.c
@@ -101,8 +101,13 @@ int ASN1_verify(i2d_of_void *i2d, X509_ALGOR *a, ASN1_BIT_STRING *signature,
 	p=buf_in;
 
 	i2d(data,&p);
-	EVP_VerifyInit_ex(&ctx,type, NULL);
-	EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+	if (!EVP_VerifyInit_ex(&ctx,type, NULL)
+		|| !EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl))
+		{
+		ASN1err(ASN1_F_ASN1_VERIFY,ERR_R_EVP_LIB);
+		ret=0;
+		goto err;
+		}
 
 	OPENSSL_cleanse(buf_in,(unsigned int)inl);
 	OPENSSL_free(buf_in);
@@ -126,11 +131,10 @@ err:
 #endif
 
 
-int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signature,
-	     void *asn, EVP_PKEY *pkey)
+int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a,
+		ASN1_BIT_STRING *signature, void *asn, EVP_PKEY *pkey)
 	{
 	EVP_MD_CTX ctx;
-	const EVP_MD *type = NULL;
 	unsigned char *buf_in=NULL;
 	int ret= -1,inl;
 
@@ -144,25 +148,47 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat
 		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
 		goto err;
 		}
-	type=EVP_get_digestbynid(mdnid);
-	if (type == NULL)
+	if (mdnid == NID_undef)
 		{
-		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
-		goto err;
+		if (!pkey->ameth || !pkey->ameth->item_verify)
+			{
+			ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM);
+			goto err;
+			}
+		ret = pkey->ameth->item_verify(&ctx, it, asn, a,
+							signature, pkey);
+		/* Return value of 2 means carry on, anything else means we
+		 * exit straight away: either a fatal error of the underlying
+		 * verification routine handles all verification.
+		 */
+		if (ret != 2)
+			goto err;
+		ret = -1;
 		}
-
-	/* Check public key OID matches public key type */
-	if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+	else
 		{
-		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
-		goto err;
-		}
+		const EVP_MD *type;
+		type=EVP_get_digestbynid(mdnid);
+		if (type == NULL)
+			{
+			ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM);
+			goto err;
+			}
+
+		/* Check public key OID matches public key type */
+		if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id)
+			{
+			ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE);
+			goto err;
+			}
+
+		if (!EVP_DigestVerifyInit(&ctx, NULL, type, NULL, pkey))
+			{
+			ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+			ret=0;
+			goto err;
+			}
 
-	if (!EVP_VerifyInit_ex(&ctx,type, NULL))
-		{
-		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
-		ret=0;
-		goto err;
 		}
 
 	inl = ASN1_item_i2d(asn, &buf_in, it);
@@ -173,13 +199,18 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat
 		goto err;
 		}
 
-	EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl);
+	if (!EVP_DigestVerifyUpdate(&ctx,buf_in,inl))
+		{
+		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
+		ret=0;
+		goto err;
+		}
 
 	OPENSSL_cleanse(buf_in,(unsigned int)inl);
 	OPENSSL_free(buf_in);
 
-	if (EVP_VerifyFinal(&ctx,(unsigned char *)signature->data,
-			(unsigned int)signature->length,pkey) <= 0)
+	if (EVP_DigestVerifyFinal(&ctx,signature->data,
+			(size_t)signature->length) <= 0)
 		{
 		ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB);
 		ret=0;
diff --git a/src/lib/libcrypto/asn1/ameth_lib.c b/src/lib/libcrypto/asn1/ameth_lib.c
index 5a581b90ea..a19e058fca 100644
--- a/src/lib/libcrypto/asn1/ameth_lib.c
+++ b/src/lib/libcrypto/asn1/ameth_lib.c
@@ -69,6 +69,7 @@ extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[];
 extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth;
 extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth;
 extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth;
+extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth;
 
 /* Keep this sorted in type order !! */
 static const EVP_PKEY_ASN1_METHOD *standard_methods[] = 
@@ -90,7 +91,8 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] =
 #ifndef OPENSSL_NO_EC
 	&eckey_asn1_meth,
 #endif
-	&hmac_asn1_meth
+	&hmac_asn1_meth,
+	&cmac_asn1_meth
 	};
 
 typedef int sk_cmp_fn_type(const char * const *a, const char * const *b);
@@ -291,6 +293,8 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
 	if (!ameth)
 		return NULL;
 
+	memset(ameth, 0, sizeof(EVP_PKEY_ASN1_METHOD));
+
 	ameth->pkey_id = id;
 	ameth->pkey_base_id = id;
 	ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC;
@@ -325,6 +329,9 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags,
 	ameth->old_priv_encode = 0;
 	ameth->old_priv_decode = 0;
 
+	ameth->item_verify = 0;
+	ameth->item_sign = 0;
+
 	ameth->pkey_size = 0;
 	ameth->pkey_bits = 0;
 
@@ -376,6 +383,9 @@ void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst,
 	dst->pkey_free = src->pkey_free;
 	dst->pkey_ctrl = src->pkey_ctrl;
 
+	dst->item_sign = src->item_sign;
+	dst->item_verify = src->item_verify;
+
 	}
 
 void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth)
diff --git a/src/lib/libcrypto/asn1/asn1.h b/src/lib/libcrypto/asn1/asn1.h
index 59540e4e79..220a0c8c63 100644
--- a/src/lib/libcrypto/asn1/asn1.h
+++ b/src/lib/libcrypto/asn1/asn1.h
@@ -235,7 +235,7 @@ typedef struct asn1_object_st
  */
 #define ASN1_STRING_FLAG_MSTRING 0x040 
 /* This is the base type that holds just about everything :-) */
-typedef struct asn1_string_st
+struct asn1_string_st
 	{
 	int length;
 	int type;
@@ -245,7 +245,7 @@ typedef struct asn1_string_st
 	 * input data has a non-zero 'unused bits' value, it will be
 	 * handled correctly */
 	long flags;
-	} ASN1_STRING;
+	};
 
 /* ASN1_ENCODING structure: this is used to save the received
  * encoding of an ASN1 type. This is useful to get round
@@ -293,7 +293,6 @@ DECLARE_STACK_OF(ASN1_STRING_TABLE)
  * see asn1t.h
  */
 typedef struct ASN1_TEMPLATE_st ASN1_TEMPLATE;
-typedef struct ASN1_ITEM_st ASN1_ITEM;
 typedef struct ASN1_TLC_st ASN1_TLC;
 /* This is just an opaque pointer */
 typedef struct ASN1_VALUE_st ASN1_VALUE;
@@ -1194,6 +1193,7 @@ void ERR_load_ASN1_strings(void);
 #define ASN1_F_ASN1_ITEM_I2D_FP				 193
 #define ASN1_F_ASN1_ITEM_PACK				 198
 #define ASN1_F_ASN1_ITEM_SIGN				 195
+#define ASN1_F_ASN1_ITEM_SIGN_CTX			 220
 #define ASN1_F_ASN1_ITEM_UNPACK				 199
 #define ASN1_F_ASN1_ITEM_VERIFY				 197
 #define ASN1_F_ASN1_MBSTRING_NCOPY			 122
@@ -1266,6 +1266,7 @@ void ERR_load_ASN1_strings(void);
 #define ASN1_F_PKCS5_PBE2_SET_IV			 167
 #define ASN1_F_PKCS5_PBE_SET				 202
 #define ASN1_F_PKCS5_PBE_SET0_ALGOR			 215
+#define ASN1_F_PKCS5_PBKDF2_SET				 219
 #define ASN1_F_SMIME_READ_ASN1				 212
 #define ASN1_F_SMIME_TEXT				 213
 #define ASN1_F_X509_CINF_NEW				 168
@@ -1291,6 +1292,7 @@ void ERR_load_ASN1_strings(void);
 #define ASN1_R_BOOLEAN_IS_WRONG_LENGTH			 106
 #define ASN1_R_BUFFER_TOO_SMALL				 107
 #define ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER		 108
+#define ASN1_R_CONTEXT_NOT_INITIALISED			 217
 #define ASN1_R_DATA_IS_WRONG				 109
 #define ASN1_R_DECODE_ERROR				 110
 #define ASN1_R_DECODING_ERROR				 111
diff --git a/src/lib/libcrypto/asn1/asn1_err.c b/src/lib/libcrypto/asn1/asn1_err.c
index 6e04d08f31..1a30bf119b 100644
--- a/src/lib/libcrypto/asn1/asn1_err.c
+++ b/src/lib/libcrypto/asn1/asn1_err.c
@@ -1,6 +1,6 @@
 /* crypto/asn1/asn1_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2009 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -107,6 +107,7 @@ static ERR_STRING_DATA ASN1_str_functs[]=
 {ERR_FUNC(ASN1_F_ASN1_ITEM_I2D_FP),	"ASN1_item_i2d_fp"},
 {ERR_FUNC(ASN1_F_ASN1_ITEM_PACK),	"ASN1_item_pack"},
 {ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN),	"ASN1_item_sign"},
+{ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN_CTX),	"ASN1_item_sign_ctx"},
 {ERR_FUNC(ASN1_F_ASN1_ITEM_UNPACK),	"ASN1_item_unpack"},
 {ERR_FUNC(ASN1_F_ASN1_ITEM_VERIFY),	"ASN1_item_verify"},
 {ERR_FUNC(ASN1_F_ASN1_MBSTRING_NCOPY),	"ASN1_mbstring_ncopy"},
@@ -179,6 +180,7 @@ static ERR_STRING_DATA ASN1_str_functs[]=
 {ERR_FUNC(ASN1_F_PKCS5_PBE2_SET_IV),	"PKCS5_pbe2_set_iv"},
 {ERR_FUNC(ASN1_F_PKCS5_PBE_SET),	"PKCS5_pbe_set"},
 {ERR_FUNC(ASN1_F_PKCS5_PBE_SET0_ALGOR),	"PKCS5_pbe_set0_algor"},
+{ERR_FUNC(ASN1_F_PKCS5_PBKDF2_SET),	"PKCS5_pbkdf2_set"},
 {ERR_FUNC(ASN1_F_SMIME_READ_ASN1),	"SMIME_read_ASN1"},
 {ERR_FUNC(ASN1_F_SMIME_TEXT),	"SMIME_text"},
 {ERR_FUNC(ASN1_F_X509_CINF_NEW),	"X509_CINF_NEW"},
@@ -207,6 +209,7 @@ static ERR_STRING_DATA ASN1_str_reasons[]=
 {ERR_REASON(ASN1_R_BOOLEAN_IS_WRONG_LENGTH),"boolean is wrong length"},
 {ERR_REASON(ASN1_R_BUFFER_TOO_SMALL)     ,"buffer too small"},
 {ERR_REASON(ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER),"cipher has no object identifier"},
+{ERR_REASON(ASN1_R_CONTEXT_NOT_INITIALISED),"context not initialised"},
 {ERR_REASON(ASN1_R_DATA_IS_WRONG)        ,"data is wrong"},
 {ERR_REASON(ASN1_R_DECODE_ERROR)         ,"decode error"},
 {ERR_REASON(ASN1_R_DECODING_ERROR)       ,"decoding error"},
diff --git a/src/lib/libcrypto/asn1/asn1_locl.h b/src/lib/libcrypto/asn1/asn1_locl.h
index 5aa65e28f5..9fcf0d9530 100644
--- a/src/lib/libcrypto/asn1/asn1_locl.h
+++ b/src/lib/libcrypto/asn1/asn1_locl.h
@@ -102,6 +102,10 @@ struct evp_pkey_asn1_method_st
 	int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b);
 	int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent,
 							ASN1_PCTX *pctx);
+	int (*sig_print)(BIO *out,
+			 const X509_ALGOR *sigalg, const ASN1_STRING *sig,
+					 int indent, ASN1_PCTX *pctx);
+
 
 	void (*pkey_free)(EVP_PKEY *pkey);
 	int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2);
@@ -111,6 +115,13 @@ struct evp_pkey_asn1_method_st
 	int (*old_priv_decode)(EVP_PKEY *pkey,
 				const unsigned char **pder, int derlen);
 	int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder);
+	/* Custom ASN1 signature verification */
+	int (*item_verify)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+				X509_ALGOR *a, ASN1_BIT_STRING *sig,
+				EVP_PKEY *pkey);
+	int (*item_sign)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+				X509_ALGOR *alg1, X509_ALGOR *alg2, 
+				ASN1_BIT_STRING *sig);
 
 	} /* EVP_PKEY_ASN1_METHOD */;
 
diff --git a/src/lib/libcrypto/asn1/asn_mime.c b/src/lib/libcrypto/asn1/asn_mime.c
index c1d1b12291..54a704a969 100644
--- a/src/lib/libcrypto/asn1/asn_mime.c
+++ b/src/lib/libcrypto/asn1/asn_mime.c
@@ -377,8 +377,12 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
 	BIO *tmpbio;
 	const ASN1_AUX *aux = it->funcs;
 	ASN1_STREAM_ARG sarg;
+	int rv = 1;
 
-	if (!(flags & SMIME_DETACHED))
+	/* If data is not deteched or resigning then the output BIO is
+	 * already set up to finalise when it is written through.
+	 */
+	if (!(flags & SMIME_DETACHED) || (flags & PKCS7_REUSE_DIGEST))
 		{
 		SMIME_crlf_copy(data, out, flags);
 		return 1;
@@ -405,7 +409,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
 
 	/* Finalize structure */
 	if (aux->asn1_cb(ASN1_OP_DETACHED_POST, &val, it, &sarg) <= 0)
-		return 0;
+		rv = 0;
 
 	/* Now remove any digests prepended to the BIO */
 
@@ -416,7 +420,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags,
 		sarg.ndef_bio = tmpbio;
 		}
 
-	return 1;
+	return rv;
 
 	}
 
@@ -486,9 +490,9 @@ ASN1_VALUE *SMIME_read_ASN1(BIO *bio, BIO **bcont, const ASN1_ITEM *it)
 
 		if(strcmp(hdr->value, "application/x-pkcs7-signature") &&
 			strcmp(hdr->value, "application/pkcs7-signature")) {
-			sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
 			ASN1err(ASN1_F_SMIME_READ_ASN1,ASN1_R_SIG_INVALID_MIME_TYPE);
 			ERR_add_error_data(2, "type: ", hdr->value);
+			sk_MIME_HEADER_pop_free(headers, mime_hdr_free);
 			sk_BIO_pop_free(parts, BIO_vfree);
 			return NULL;
 		}
@@ -801,7 +805,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value)
 	if(name) {
 		if(!(tmpname = BUF_strdup(name))) return NULL;
 		for(p = tmpname ; *p; p++) {
-			c = *p;
+			c = (unsigned char)*p;
 			if(isupper(c)) {
 				c = tolower(c);
 				*p = c;
@@ -811,7 +815,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value)
 	if(value) {
 		if(!(tmpval = BUF_strdup(value))) return NULL;
 		for(p = tmpval ; *p; p++) {
-			c = *p;
+			c = (unsigned char)*p;
 			if(isupper(c)) {
 				c = tolower(c);
 				*p = c;
@@ -835,7 +839,7 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value)
 		tmpname = BUF_strdup(name);
 		if(!tmpname) return 0;
 		for(p = tmpname ; *p; p++) {
-			c = *p;
+			c = (unsigned char)*p;
 			if(isupper(c)) {
 				c = tolower(c);
 				*p = c;
@@ -858,12 +862,17 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value)
 static int mime_hdr_cmp(const MIME_HEADER * const *a,
 			const MIME_HEADER * const *b)
 {
+	if (!(*a)->name || !(*b)->name)
+		return !!(*a)->name - !!(*b)->name;
+
 	return(strcmp((*a)->name, (*b)->name));
 }
 
 static int mime_param_cmp(const MIME_PARAM * const *a,
 			const MIME_PARAM * const *b)
 {
+	if (!(*a)->param_name || !(*b)->param_name)
+		return !!(*a)->param_name - !!(*b)->param_name;
 	return(strcmp((*a)->param_name, (*b)->param_name));
 }
 
diff --git a/src/lib/libcrypto/asn1/n_pkey.c b/src/lib/libcrypto/asn1/n_pkey.c
index e7d0439062..e251739933 100644
--- a/src/lib/libcrypto/asn1/n_pkey.c
+++ b/src/lib/libcrypto/asn1/n_pkey.c
@@ -129,6 +129,7 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp,
 	unsigned char buf[256],*zz;
 	unsigned char key[EVP_MAX_KEY_LENGTH];
 	EVP_CIPHER_CTX ctx;
+	EVP_CIPHER_CTX_init(&ctx);
 
 	if (a == NULL) return(0);
 
@@ -206,24 +207,28 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp,
 	i = strlen((char *)buf);
 	/* If the key is used for SGC the algorithm is modified a little. */
 	if(sgckey) {
-		EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+		if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+			goto err;
 		memcpy(buf + 16, "SGCKEYSALT", 10);
 		i = 26;
 	}
 
-	EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+	if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+		goto err;
 	OPENSSL_cleanse(buf,256);
 
 	/* Encrypt private key in place */
 	zz = enckey->enckey->digest->data;
-	EVP_CIPHER_CTX_init(&ctx);
-	EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL);
-	EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen);
-	EVP_EncryptFinal_ex(&ctx,zz + i,&j);
-	EVP_CIPHER_CTX_cleanup(&ctx);
+	if (!EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL))
+		goto err;
+	if (!EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen))
+		goto err;
+	if (!EVP_EncryptFinal_ex(&ctx,zz + i,&j))
+		goto err;
 
 	ret = i2d_NETSCAPE_ENCRYPTED_PKEY(enckey, pp);
 err:
+	EVP_CIPHER_CTX_cleanup(&ctx);
 	NETSCAPE_ENCRYPTED_PKEY_free(enckey);
 	NETSCAPE_PKEY_free(pkey);
 	return(ret);
@@ -288,6 +293,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
 	const unsigned char *zz;
 	unsigned char key[EVP_MAX_KEY_LENGTH];
 	EVP_CIPHER_CTX ctx;
+	EVP_CIPHER_CTX_init(&ctx);
 
 	i=cb((char *)buf,256,"Enter Private Key password:",0);
 	if (i != 0)
@@ -298,19 +304,22 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
 
 	i = strlen((char *)buf);
 	if(sgckey){
-		EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL);
+		if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL))
+			goto err;
 		memcpy(buf + 16, "SGCKEYSALT", 10);
 		i = 26;
 	}
 		
-	EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL);
+	if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL))
+		goto err;
 	OPENSSL_cleanse(buf,256);
 
-	EVP_CIPHER_CTX_init(&ctx);
-	EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL);
-	EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length);
-	EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j);
-	EVP_CIPHER_CTX_cleanup(&ctx);
+	if (!EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL))
+		goto err;
+	if (!EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length))
+		goto err;
+	if (!EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j))
+		goto err;
 	os->length=i+j;
 
 	zz=os->data;
@@ -328,6 +337,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os,
 		goto err;
 		}
 err:
+	EVP_CIPHER_CTX_cleanup(&ctx);
 	NETSCAPE_PKEY_free(pkey);
 	return(ret);
 	}
diff --git a/src/lib/libcrypto/asn1/p5_pbev2.c b/src/lib/libcrypto/asn1/p5_pbev2.c
index cb49b6651d..4ea683036b 100644
--- a/src/lib/libcrypto/asn1/p5_pbev2.c
+++ b/src/lib/libcrypto/asn1/p5_pbev2.c
@@ -91,12 +91,10 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
 				 unsigned char *aiv, int prf_nid)
 {
 	X509_ALGOR *scheme = NULL, *kalg = NULL, *ret = NULL;
-	int alg_nid;
+	int alg_nid, keylen;
 	EVP_CIPHER_CTX ctx;
 	unsigned char iv[EVP_MAX_IV_LENGTH];
-	PBKDF2PARAM *kdf = NULL;
 	PBE2PARAM *pbe2 = NULL;
-	ASN1_OCTET_STRING *osalt = NULL;
 	ASN1_OBJECT *obj;
 
 	alg_nid = EVP_CIPHER_type(cipher);
@@ -127,7 +125,8 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
 	EVP_CIPHER_CTX_init(&ctx);
 
 	/* Dummy cipherinit to just setup the IV, and PRF */
-	EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0);
+	if (!EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0))
+		goto err;
 	if(EVP_CIPHER_param_to_asn1(&ctx, scheme->parameter) < 0) {
 		ASN1err(ASN1_F_PKCS5_PBE2_SET_IV,
 					ASN1_R_ERROR_SETTING_CIPHER_PARAMS);
@@ -145,55 +144,21 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
 		}
 	EVP_CIPHER_CTX_cleanup(&ctx);
 
-	if(!(kdf = PBKDF2PARAM_new())) goto merr;
-	if(!(osalt = M_ASN1_OCTET_STRING_new())) goto merr;
-
-	if (!saltlen) saltlen = PKCS5_SALT_LEN;
-	if (!(osalt->data = OPENSSL_malloc (saltlen))) goto merr;
-	osalt->length = saltlen;
-	if (salt) memcpy (osalt->data, salt, saltlen);
-	else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0) goto merr;
-
-	if(iter <= 0) iter = PKCS5_DEFAULT_ITER;
-	if(!ASN1_INTEGER_set(kdf->iter, iter)) goto merr;
-
-	/* Now include salt in kdf structure */
-	kdf->salt->value.octet_string = osalt;
-	kdf->salt->type = V_ASN1_OCTET_STRING;
-	osalt = NULL;
-
 	/* If its RC2 then we'd better setup the key length */
 
-	if(alg_nid == NID_rc2_cbc) {
-		if(!(kdf->keylength = M_ASN1_INTEGER_new())) goto merr;
-		if(!ASN1_INTEGER_set (kdf->keylength,
-				 EVP_CIPHER_key_length(cipher))) goto merr;
-	}
-
-	/* prf can stay NULL if we are using hmacWithSHA1 */
-	if (prf_nid != NID_hmacWithSHA1)
-		{
-		kdf->prf = X509_ALGOR_new();
-		if (!kdf->prf)
-			goto merr;
-		X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
-					V_ASN1_NULL, NULL);
-		}
-
-	/* Now setup the PBE2PARAM keyfunc structure */
+	if(alg_nid == NID_rc2_cbc)
+		keylen = EVP_CIPHER_key_length(cipher);
+	else
+		keylen = -1;
 
-	pbe2->keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+	/* Setup keyfunc */
 
-	/* Encode PBKDF2PARAM into parameter of pbe2 */
+	X509_ALGOR_free(pbe2->keyfunc);
 
-	if(!(pbe2->keyfunc->parameter = ASN1_TYPE_new())) goto merr;
+	pbe2->keyfunc = PKCS5_pbkdf2_set(iter, salt, saltlen, prf_nid, keylen);
 
-	if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
-			 &pbe2->keyfunc->parameter->value.sequence)) goto merr;
-	pbe2->keyfunc->parameter->type = V_ASN1_SEQUENCE;
-
-	PBKDF2PARAM_free(kdf);
-	kdf = NULL;
+	if (!pbe2->keyfunc)
+		goto merr;
 
 	/* Now set up top level AlgorithmIdentifier */
 
@@ -219,8 +184,6 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
 	err:
 	PBE2PARAM_free(pbe2);
 	/* Note 'scheme' is freed as part of pbe2 */
-	M_ASN1_OCTET_STRING_free(osalt);
-	PBKDF2PARAM_free(kdf);
 	X509_ALGOR_free(kalg);
 	X509_ALGOR_free(ret);
 
@@ -233,3 +196,85 @@ X509_ALGOR *PKCS5_pbe2_set(const EVP_CIPHER *cipher, int iter,
 	{
 	return PKCS5_pbe2_set_iv(cipher, iter, salt, saltlen, NULL, -1);
 	}
+
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+				int prf_nid, int keylen)
+	{
+	X509_ALGOR *keyfunc = NULL;
+	PBKDF2PARAM *kdf = NULL;
+	ASN1_OCTET_STRING *osalt = NULL;
+
+	if(!(kdf = PBKDF2PARAM_new()))
+		goto merr;
+	if(!(osalt = M_ASN1_OCTET_STRING_new()))
+		goto merr;
+
+	kdf->salt->value.octet_string = osalt;
+	kdf->salt->type = V_ASN1_OCTET_STRING;
+
+	if (!saltlen)
+		saltlen = PKCS5_SALT_LEN;
+	if (!(osalt->data = OPENSSL_malloc (saltlen)))
+		goto merr;
+
+	osalt->length = saltlen;
+
+	if (salt)
+		memcpy (osalt->data, salt, saltlen);
+	else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0)
+		goto merr;
+
+	if(iter <= 0)
+		iter = PKCS5_DEFAULT_ITER;
+
+	if(!ASN1_INTEGER_set(kdf->iter, iter))
+		goto merr;
+
+	/* If have a key len set it up */
+
+	if(keylen > 0) 
+		{
+		if(!(kdf->keylength = M_ASN1_INTEGER_new()))
+			goto merr;
+		if(!ASN1_INTEGER_set (kdf->keylength, keylen))
+			goto merr;
+		}
+
+	/* prf can stay NULL if we are using hmacWithSHA1 */
+	if (prf_nid > 0 && prf_nid != NID_hmacWithSHA1)
+		{
+		kdf->prf = X509_ALGOR_new();
+		if (!kdf->prf)
+			goto merr;
+		X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid),
+					V_ASN1_NULL, NULL);
+		}
+
+	/* Finally setup the keyfunc structure */
+
+	keyfunc = X509_ALGOR_new();
+	if (!keyfunc)
+		goto merr;
+
+	keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2);
+
+	/* Encode PBKDF2PARAM into parameter of pbe2 */
+
+	if(!(keyfunc->parameter = ASN1_TYPE_new()))
+		goto merr;
+
+	if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM),
+			 &keyfunc->parameter->value.sequence))
+		goto merr;
+	keyfunc->parameter->type = V_ASN1_SEQUENCE;
+
+	PBKDF2PARAM_free(kdf);
+	return keyfunc;
+
+	merr:
+	ASN1err(ASN1_F_PKCS5_PBKDF2_SET,ERR_R_MALLOC_FAILURE);
+	PBKDF2PARAM_free(kdf);
+	X509_ALGOR_free(keyfunc);
+	return NULL;
+	}
+
diff --git a/src/lib/libcrypto/asn1/t_crl.c b/src/lib/libcrypto/asn1/t_crl.c
index ee5a687ce8..c61169208a 100644
--- a/src/lib/libcrypto/asn1/t_crl.c
+++ b/src/lib/libcrypto/asn1/t_crl.c
@@ -94,8 +94,7 @@ int X509_CRL_print(BIO *out, X509_CRL *x)
 	l = X509_CRL_get_version(x);
 	BIO_printf(out, "%8sVersion %lu (0x%lx)\n", "", l+1, l);
 	i = OBJ_obj2nid(x->sig_alg->algorithm);
-	BIO_printf(out, "%8sSignature Algorithm: %s\n", "",
-				 (i == NID_undef) ? "NONE" : OBJ_nid2ln(i));
+	X509_signature_print(out, x->sig_alg, NULL);
 	p=X509_NAME_oneline(X509_CRL_get_issuer(x),NULL,0);
 	BIO_printf(out,"%8sIssuer: %s\n","",p);
 	OPENSSL_free(p);
diff --git a/src/lib/libcrypto/asn1/t_x509.c b/src/lib/libcrypto/asn1/t_x509.c
index e061f2ffad..edbb39a02f 100644
--- a/src/lib/libcrypto/asn1/t_x509.c
+++ b/src/lib/libcrypto/asn1/t_x509.c
@@ -72,6 +72,7 @@
 #include <openssl/objects.h>
 #include <openssl/x509.h>
 #include <openssl/x509v3.h>
+#include "asn1_locl.h"
 
 #ifndef OPENSSL_NO_FP_API
 int X509_print_fp(FILE *fp, X509 *x)
@@ -137,10 +138,10 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag)
 		if (BIO_write(bp,"        Serial Number:",22) <= 0) goto err;
 
 		bs=X509_get_serialNumber(x);
-		if (bs->length <= 4)
+		if (bs->length <= (int)sizeof(long))
 			{
 			l=ASN1_INTEGER_get(bs);
-			if (l < 0)
+			if (bs->type == V_ASN1_NEG_INTEGER)
 				{
 				l= -l;
 				neg="-";
@@ -167,12 +168,16 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag)
 
 	if(!(cflag & X509_FLAG_NO_SIGNAME))
 		{
+		if(X509_signature_print(bp, x->sig_alg, NULL) <= 0)
+			goto err;
+#if 0
 		if (BIO_printf(bp,"%8sSignature Algorithm: ","") <= 0) 
 			goto err;
 		if (i2a_ASN1_OBJECT(bp, ci->signature->algorithm) <= 0)
 			goto err;
 		if (BIO_puts(bp, "\n") <= 0)
 			goto err;
+#endif
 		}
 
 	if(!(cflag & X509_FLAG_NO_ISSUER))
@@ -255,7 +260,8 @@ int X509_ocspid_print (BIO *bp, X509 *x)
 		goto err;
 	i2d_X509_NAME(x->cert_info->subject, &dertmp);
 
-	EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL);
+	if (!EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL))
+		goto err;
 	for (i=0; i < SHA_DIGEST_LENGTH; i++)
 		{
 		if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0) goto err;
@@ -268,8 +274,10 @@ int X509_ocspid_print (BIO *bp, X509 *x)
 	if (BIO_printf(bp,"\n        Public key OCSP hash: ") <= 0)
 		goto err;
 
-	EVP_Digest(x->cert_info->key->public_key->data,
-		x->cert_info->key->public_key->length, SHA1md, NULL, EVP_sha1(), NULL);
+	if (!EVP_Digest(x->cert_info->key->public_key->data,
+			x->cert_info->key->public_key->length,
+			SHA1md, NULL, EVP_sha1(), NULL))
+		goto err;
 	for (i=0; i < SHA_DIGEST_LENGTH; i++)
 		{
 		if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0)
@@ -283,23 +291,50 @@ err:
 	return(0);
 	}
 
-int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+int X509_signature_dump(BIO *bp, const ASN1_STRING *sig, int indent)
 {
-	unsigned char *s;
+	const unsigned char *s;
 	int i, n;
-	if (BIO_puts(bp,"    Signature Algorithm: ") <= 0) return 0;
-	if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
 
 	n=sig->length;
 	s=sig->data;
 	for (i=0; i<n; i++)
 		{
 		if ((i%18) == 0)
-			if (BIO_write(bp,"\n        ",9) <= 0) return 0;
+			{
+			if (BIO_write(bp,"\n",1) <= 0) return 0;
+			if (BIO_indent(bp, indent, indent) <= 0) return 0;
+			}
 			if (BIO_printf(bp,"%02x%s",s[i],
 				((i+1) == n)?"":":") <= 0) return 0;
 		}
 	if (BIO_write(bp,"\n",1) != 1) return 0;
+
+	return 1;
+}
+
+int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig)
+{
+	int sig_nid;
+	if (BIO_puts(bp,"    Signature Algorithm: ") <= 0) return 0;
+	if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0;
+
+	sig_nid = OBJ_obj2nid(sigalg->algorithm);
+	if (sig_nid != NID_undef)
+		{
+		int pkey_nid, dig_nid;
+		const EVP_PKEY_ASN1_METHOD *ameth;
+		if (OBJ_find_sigid_algs(sig_nid, &dig_nid, &pkey_nid))
+			{
+			ameth = EVP_PKEY_asn1_find(NULL, pkey_nid);
+			if (ameth && ameth->sig_print)
+				return ameth->sig_print(bp, sigalg, sig, 9, 0);
+			}
+		}
+	if (sig)
+		return X509_signature_dump(bp, sig, 9);
+	else if (BIO_puts(bp, "\n") <= 0)
+		return 0;
 	return 1;
 }
 
diff --git a/src/lib/libcrypto/asn1/tasn_prn.c b/src/lib/libcrypto/asn1/tasn_prn.c
index 453698012d..542a091a66 100644
--- a/src/lib/libcrypto/asn1/tasn_prn.c
+++ b/src/lib/libcrypto/asn1/tasn_prn.c
@@ -446,11 +446,11 @@ static int asn1_print_fsname(BIO *out, int indent,
 	return 1;
 	}
 
-static int asn1_print_boolean_ctx(BIO *out, const int bool,
+static int asn1_print_boolean_ctx(BIO *out, int boolval,
 							const ASN1_PCTX *pctx)
 	{
 	const char *str;
-	switch (bool)
+	switch (boolval)
 		{
 		case -1:
 		str = "BOOL ABSENT";
@@ -574,10 +574,10 @@ static int asn1_primitive_print(BIO *out, ASN1_VALUE **fld,
 		{
 		case V_ASN1_BOOLEAN:
 			{
-			int bool = *(int *)fld;
-			if (bool == -1)
-				bool = it->size;
-			ret = asn1_print_boolean_ctx(out, bool, pctx);
+			int boolval = *(int *)fld;
+			if (boolval == -1)
+				boolval = it->size;
+			ret = asn1_print_boolean_ctx(out, boolval, pctx);
 			}
 		break;
 
diff --git a/src/lib/libcrypto/asn1/x_algor.c b/src/lib/libcrypto/asn1/x_algor.c
index 99e53429b7..274e456c73 100644
--- a/src/lib/libcrypto/asn1/x_algor.c
+++ b/src/lib/libcrypto/asn1/x_algor.c
@@ -128,3 +128,17 @@ void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval,
 		}
 	}
 
+/* Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD */
+
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md)
+	{
+	int param_type;
+
+	if (md->flags & EVP_MD_FLAG_DIGALGID_ABSENT)
+		param_type = V_ASN1_UNDEF;
+	else
+		param_type = V_ASN1_NULL;
+
+	X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL);
+
+	}
diff --git a/src/lib/libcrypto/asn1/x_name.c b/src/lib/libcrypto/asn1/x_name.c
index 49be08b4da..d7c2318693 100644
--- a/src/lib/libcrypto/asn1/x_name.c
+++ b/src/lib/libcrypto/asn1/x_name.c
@@ -399,8 +399,7 @@ static int asn1_string_canon(ASN1_STRING *out, ASN1_STRING *in)
 	/* If type not in bitmask just copy string across */
 	if (!(ASN1_tag2bit(in->type) & ASN1_MASK_CANON))
 		{
-		out->type = in->type;
-		if (!ASN1_STRING_set(out, in->data, in->length))
+		if (!ASN1_STRING_copy(out, in))
 			return 0;
 		return 1;
 		}
diff --git a/src/lib/libcrypto/asn1/x_pubkey.c b/src/lib/libcrypto/asn1/x_pubkey.c
index d42b6a2c54..627ec87f9f 100644
--- a/src/lib/libcrypto/asn1/x_pubkey.c
+++ b/src/lib/libcrypto/asn1/x_pubkey.c
@@ -171,7 +171,16 @@ EVP_PKEY *X509_PUBKEY_get(X509_PUBKEY *key)
 		goto error;
 		}
 
-	key->pkey = ret;
+	/* Check to see if another thread set key->pkey first */
+	CRYPTO_w_lock(CRYPTO_LOCK_EVP_PKEY);
+	if (key->pkey)
+		{
+		EVP_PKEY_free(ret);
+		ret = key->pkey;
+		}
+	else
+		key->pkey = ret;
+	CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY);
 	CRYPTO_add(&ret->references, 1, CRYPTO_LOCK_EVP_PKEY);
 
 	return ret;
diff --git a/src/lib/libcrypto/bf/bf_skey.c b/src/lib/libcrypto/bf/bf_skey.c
index 3673cdee6e..3b0bca41ae 100644
--- a/src/lib/libcrypto/bf/bf_skey.c
+++ b/src/lib/libcrypto/bf/bf_skey.c
@@ -58,11 +58,19 @@
 
 #include <stdio.h>
 #include <string.h>
+#include <openssl/crypto.h>
 #include <openssl/blowfish.h>
 #include "bf_locl.h"
 #include "bf_pi.h"
 
 void BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#ifdef OPENSSL_FIPS
+	{
+	fips_cipher_abort(BLOWFISH);
+	private_BF_set_key(key, len, data);
+	}
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data)
+#endif
 	{
 	int i;
 	BF_LONG *p,ri,in[2];
diff --git a/src/lib/libcrypto/bf/blowfish.h b/src/lib/libcrypto/bf/blowfish.h
index b97e76f9a3..4b6c8920a4 100644
--- a/src/lib/libcrypto/bf/blowfish.h
+++ b/src/lib/libcrypto/bf/blowfish.h
@@ -104,7 +104,9 @@ typedef struct bf_key_st
 	BF_LONG S[4*256];
 	} BF_KEY;
 
- 
+#ifdef OPENSSL_FIPS 
+void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data);
+#endif
 void BF_set_key(BF_KEY *key, int len, const unsigned char *data);
 
 void BF_encrypt(BF_LONG *data,const BF_KEY *key);
diff --git a/src/lib/libcrypto/bio/b_sock.c b/src/lib/libcrypto/bio/b_sock.c
index d47310d650..41f958be71 100644
--- a/src/lib/libcrypto/bio/b_sock.c
+++ b/src/lib/libcrypto/bio/b_sock.c
@@ -960,7 +960,6 @@ int BIO_set_tcp_ndelay(int s, int on)
 #endif
 	return(ret == 0);
 	}
-#endif
 
 int BIO_socket_nbio(int s, int mode)
 	{
@@ -973,3 +972,4 @@ int BIO_socket_nbio(int s, int mode)
 #endif
 	return(ret == 0);
 	}
+#endif
diff --git a/src/lib/libcrypto/bio/bio.h b/src/lib/libcrypto/bio/bio.h
index ab47abcf14..05699ab212 100644
--- a/src/lib/libcrypto/bio/bio.h
+++ b/src/lib/libcrypto/bio/bio.h
@@ -68,6 +68,14 @@
 
 #include <openssl/crypto.h>
 
+#ifndef OPENSSL_NO_SCTP
+# ifndef OPENSSL_SYS_VMS
+# include <stdint.h>
+# else
+# include <inttypes.h>
+# endif
+#endif
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -95,6 +103,9 @@ extern "C" {
 #define BIO_TYPE_BIO		(19|0x0400)		/* (half a) BIO pair */
 #define BIO_TYPE_LINEBUFFER	(20|0x0200)		/* filter */
 #define BIO_TYPE_DGRAM		(21|0x0400|0x0100)
+#ifndef OPENSSL_NO_SCTP
+#define BIO_TYPE_DGRAM_SCTP	(24|0x0400|0x0100)
+#endif
 #define BIO_TYPE_ASN1 		(22|0x0200)		/* filter */
 #define BIO_TYPE_COMP 		(23|0x0200)		/* filter */
 
@@ -146,6 +157,7 @@ extern "C" {
 /* #endif */
 
 #define BIO_CTRL_DGRAM_QUERY_MTU          40 /* as kernel for current MTU */
+#define BIO_CTRL_DGRAM_GET_FALLBACK_MTU   47
 #define BIO_CTRL_DGRAM_GET_MTU            41 /* get cached value for MTU */
 #define BIO_CTRL_DGRAM_SET_MTU            42 /* set cached value for
 					      * MTU. want to use this
@@ -161,7 +173,22 @@ extern "C" {
 #define BIO_CTRL_DGRAM_SET_PEER           44 /* Destination for the data */
 
 #define BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT   45 /* Next DTLS handshake timeout to
-											  * adjust socket timeouts */
+                                              * adjust socket timeouts */
+
+#ifndef OPENSSL_NO_SCTP
+/* SCTP stuff */
+#define BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE	50
+#define BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY		51
+#define BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY		52
+#define BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD		53
+#define BIO_CTRL_DGRAM_SCTP_GET_SNDINFO		60
+#define BIO_CTRL_DGRAM_SCTP_SET_SNDINFO		61
+#define BIO_CTRL_DGRAM_SCTP_GET_RCVINFO		62
+#define BIO_CTRL_DGRAM_SCTP_SET_RCVINFO		63
+#define BIO_CTRL_DGRAM_SCTP_GET_PRINFO			64
+#define BIO_CTRL_DGRAM_SCTP_SET_PRINFO			65
+#define BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN		70
+#endif
 
 /* modifiers */
 #define BIO_FP_READ		0x02
@@ -331,6 +358,34 @@ typedef struct bio_f_buffer_ctx_struct
 /* Prefix and suffix callback in ASN1 BIO */
 typedef int asn1_ps_func(BIO *b, unsigned char **pbuf, int *plen, void *parg);
 
+#ifndef OPENSSL_NO_SCTP
+/* SCTP parameter structs */
+struct bio_dgram_sctp_sndinfo
+	{
+	uint16_t snd_sid;
+	uint16_t snd_flags;
+	uint32_t snd_ppid;
+	uint32_t snd_context;
+	};
+
+struct bio_dgram_sctp_rcvinfo
+	{
+	uint16_t rcv_sid;
+	uint16_t rcv_ssn;
+	uint16_t rcv_flags;
+	uint32_t rcv_ppid;
+	uint32_t rcv_tsn;
+	uint32_t rcv_cumtsn;
+	uint32_t rcv_context;
+	};
+
+struct bio_dgram_sctp_prinfo
+	{
+	uint16_t pr_policy;
+	uint32_t pr_value;
+	};
+#endif
+
 /* connect BIO stuff */
 #define BIO_CONN_S_BEFORE		1
 #define BIO_CONN_S_GET_IP		2
@@ -628,6 +683,9 @@ BIO_METHOD *BIO_f_linebuffer(void);
 BIO_METHOD *BIO_f_nbio_test(void);
 #ifndef OPENSSL_NO_DGRAM
 BIO_METHOD *BIO_s_datagram(void);
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void);
+#endif
 #endif
 
 /* BIO_METHOD *BIO_f_ber(void); */
@@ -670,6 +728,15 @@ int BIO_set_tcp_ndelay(int sock,int turn_on);
 
 BIO *BIO_new_socket(int sock, int close_flag);
 BIO *BIO_new_dgram(int fd, int close_flag);
+#ifndef OPENSSL_NO_SCTP
+BIO *BIO_new_dgram_sctp(int fd, int close_flag);
+int BIO_dgram_is_sctp(BIO *bio);
+int BIO_dgram_sctp_notification_cb(BIO *b,
+                                   void (*handle_notifications)(BIO *bio, void *context, void *buf),
+                                   void *context);
+int BIO_dgram_sctp_wait_for_dry(BIO *b);
+int BIO_dgram_sctp_msg_waiting(BIO *b);
+#endif
 BIO *BIO_new_fd(int fd, int close_flag);
 BIO *BIO_new_connect(char *host_port);
 BIO *BIO_new_accept(char *host_port);
@@ -734,6 +801,7 @@ void ERR_load_BIO_strings(void);
 #define BIO_F_BUFFER_CTRL				 114
 #define BIO_F_CONN_CTRL					 127
 #define BIO_F_CONN_STATE				 115
+#define BIO_F_DGRAM_SCTP_READ				 132
 #define BIO_F_FILE_CTRL					 116
 #define BIO_F_FILE_READ					 130
 #define BIO_F_LINEBUFFER_CTRL				 129
diff --git a/src/lib/libcrypto/bio/bio_err.c b/src/lib/libcrypto/bio/bio_err.c
index a224edd5a0..0dbfbd80d3 100644
--- a/src/lib/libcrypto/bio/bio_err.c
+++ b/src/lib/libcrypto/bio/bio_err.c
@@ -1,6 +1,6 @@
 /* crypto/bio/bio_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -95,6 +95,7 @@ static ERR_STRING_DATA BIO_str_functs[]=
 {ERR_FUNC(BIO_F_BUFFER_CTRL),	"BUFFER_CTRL"},
 {ERR_FUNC(BIO_F_CONN_CTRL),	"CONN_CTRL"},
 {ERR_FUNC(BIO_F_CONN_STATE),	"CONN_STATE"},
+{ERR_FUNC(BIO_F_DGRAM_SCTP_READ),	"DGRAM_SCTP_READ"},
 {ERR_FUNC(BIO_F_FILE_CTRL),	"FILE_CTRL"},
 {ERR_FUNC(BIO_F_FILE_READ),	"FILE_READ"},
 {ERR_FUNC(BIO_F_LINEBUFFER_CTRL),	"LINEBUFFER_CTRL"},
diff --git a/src/lib/libcrypto/bio/bio_lib.c b/src/lib/libcrypto/bio/bio_lib.c
index e12bc3a2ca..9c9646afa8 100644
--- a/src/lib/libcrypto/bio/bio_lib.c
+++ b/src/lib/libcrypto/bio/bio_lib.c
@@ -521,40 +521,40 @@ void BIO_free_all(BIO *bio)
 
 BIO *BIO_dup_chain(BIO *in)
 	{
-	BIO *ret=NULL,*eoc=NULL,*bio,*new;
+	BIO *ret=NULL,*eoc=NULL,*bio,*new_bio;
 
 	for (bio=in; bio != NULL; bio=bio->next_bio)
 		{
-		if ((new=BIO_new(bio->method)) == NULL) goto err;
-		new->callback=bio->callback;
-		new->cb_arg=bio->cb_arg;
-		new->init=bio->init;
-		new->shutdown=bio->shutdown;
-		new->flags=bio->flags;
+		if ((new_bio=BIO_new(bio->method)) == NULL) goto err;
+		new_bio->callback=bio->callback;
+		new_bio->cb_arg=bio->cb_arg;
+		new_bio->init=bio->init;
+		new_bio->shutdown=bio->shutdown;
+		new_bio->flags=bio->flags;
 
 		/* This will let SSL_s_sock() work with stdin/stdout */
-		new->num=bio->num;
+		new_bio->num=bio->num;
 
-		if (!BIO_dup_state(bio,(char *)new))
+		if (!BIO_dup_state(bio,(char *)new_bio))
 			{
-			BIO_free(new);
+			BIO_free(new_bio);
 			goto err;
 			}
 
 		/* copy app data */
-		if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new->ex_data,
+		if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new_bio->ex_data,
 					&bio->ex_data))
 			goto err;
 
 		if (ret == NULL)
 			{
-			eoc=new;
+			eoc=new_bio;
 			ret=eoc;
 			}
 		else
 			{
-			BIO_push(eoc,new);
-			eoc=new;
+			BIO_push(eoc,new_bio);
+			eoc=new_bio;
 			}
 		}
 	return(ret);
diff --git a/src/lib/libcrypto/bio/bss_bio.c b/src/lib/libcrypto/bio/bss_bio.c
index 76bd48e767..52ef0ebcb3 100644
--- a/src/lib/libcrypto/bio/bss_bio.c
+++ b/src/lib/libcrypto/bio/bss_bio.c
@@ -277,10 +277,10 @@ static int bio_read(BIO *bio, char *buf, int size_)
  */
 /* WARNING: The non-copying interface is largely untested as of yet
  * and may contain bugs. */
-static ssize_t bio_nread0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nread0(BIO *bio, char **buf)
 	{
 	struct bio_bio_st *b, *peer_b;
-	ssize_t num;
+	ossl_ssize_t num;
 	
 	BIO_clear_retry_flags(bio);
 
@@ -315,15 +315,15 @@ static ssize_t bio_nread0(BIO *bio, char **buf)
 	return num;
 	}
 
-static ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nread(BIO *bio, char **buf, size_t num_)
 	{
 	struct bio_bio_st *b, *peer_b;
-	ssize_t num, available;
+	ossl_ssize_t num, available;
 
 	if (num_ > SSIZE_MAX)
 		num = SSIZE_MAX;
 	else
-		num = (ssize_t)num_;
+		num = (ossl_ssize_t)num_;
 
 	available = bio_nread0(bio, buf);
 	if (num > available)
@@ -428,7 +428,7 @@ static int bio_write(BIO *bio, const char *buf, int num_)
  * (example usage:  bio_nwrite0(), write to buffer, bio_nwrite()
  *  or just         bio_nwrite(), write to buffer)
  */
-static ssize_t bio_nwrite0(BIO *bio, char **buf)
+static ossl_ssize_t bio_nwrite0(BIO *bio, char **buf)
 	{
 	struct bio_bio_st *b;
 	size_t num;
@@ -476,15 +476,15 @@ static ssize_t bio_nwrite0(BIO *bio, char **buf)
 	return num;
 	}
 
-static ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
+static ossl_ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_)
 	{
 	struct bio_bio_st *b;
-	ssize_t num, space;
+	ossl_ssize_t num, space;
 
 	if (num_ > SSIZE_MAX)
 		num = SSIZE_MAX;
 	else
-		num = (ssize_t)num_;
+		num = (ossl_ssize_t)num_;
 
 	space = bio_nwrite0(bio, buf);
 	if (num > space)
diff --git a/src/lib/libcrypto/bio/bss_dgram.c b/src/lib/libcrypto/bio/bss_dgram.c
index 71ebe987b6..1b1e4bec81 100644
--- a/src/lib/libcrypto/bio/bss_dgram.c
+++ b/src/lib/libcrypto/bio/bss_dgram.c
@@ -70,6 +70,13 @@
 #include <sys/timeb.h>
 #endif
 
+#ifndef OPENSSL_NO_SCTP
+#include <netinet/sctp.h>
+#include <fcntl.h>
+#define OPENSSL_SCTP_DATA_CHUNK_TYPE            0x00
+#define OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE 0xc0
+#endif
+
 #ifdef OPENSSL_SYS_LINUX
 #define IP_MTU      14 /* linux is lame */
 #endif
@@ -88,6 +95,18 @@ static int dgram_new(BIO *h);
 static int dgram_free(BIO *data);
 static int dgram_clear(BIO *bio);
 
+#ifndef OPENSSL_NO_SCTP
+static int dgram_sctp_write(BIO *h, const char *buf, int num);
+static int dgram_sctp_read(BIO *h, char *buf, int size);
+static int dgram_sctp_puts(BIO *h, const char *str);
+static long dgram_sctp_ctrl(BIO *h, int cmd, long arg1, void *arg2);
+static int dgram_sctp_new(BIO *h);
+static int dgram_sctp_free(BIO *data);
+#ifdef SCTP_AUTHENTICATION_EVENT
+static void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp);
+#endif
+#endif
+
 static int BIO_dgram_should_retry(int s);
 
 static void get_current_time(struct timeval *t);
@@ -106,6 +125,22 @@ static BIO_METHOD methods_dgramp=
 	NULL,
 	};
 
+#ifndef OPENSSL_NO_SCTP
+static BIO_METHOD methods_dgramp_sctp=
+	{
+	BIO_TYPE_DGRAM_SCTP,
+	"datagram sctp socket",
+	dgram_sctp_write,
+	dgram_sctp_read,
+	dgram_sctp_puts,
+	NULL, /* dgram_gets, */
+	dgram_sctp_ctrl,
+	dgram_sctp_new,
+	dgram_sctp_free,
+	NULL,
+	};
+#endif
+
 typedef struct bio_dgram_data_st
 	{
 	union {
@@ -122,6 +157,40 @@ typedef struct bio_dgram_data_st
 	struct timeval socket_timeout;
 	} bio_dgram_data;
 
+#ifndef OPENSSL_NO_SCTP
+typedef struct bio_dgram_sctp_save_message_st
+	{
+        BIO *bio;
+        char *data;
+        int length;
+	} bio_dgram_sctp_save_message;
+
+typedef struct bio_dgram_sctp_data_st
+	{
+	union {
+		struct sockaddr sa;
+		struct sockaddr_in sa_in;
+#if OPENSSL_USE_IPV6
+		struct sockaddr_in6 sa_in6;
+#endif
+	} peer;
+	unsigned int connected;
+	unsigned int _errno;
+	unsigned int mtu;
+	struct bio_dgram_sctp_sndinfo sndinfo;
+	struct bio_dgram_sctp_rcvinfo rcvinfo;
+	struct bio_dgram_sctp_prinfo prinfo;
+	void (*handle_notifications)(BIO *bio, void *context, void *buf);
+	void* notification_context;
+	int in_handshake;
+	int ccs_rcvd;
+	int ccs_sent;
+	int save_shutdown;
+	int peer_auth_tested;
+	bio_dgram_sctp_save_message saved_message;
+	} bio_dgram_sctp_data;
+#endif
+
 BIO_METHOD *BIO_s_datagram(void)
 	{
 	return(&methods_dgramp);
@@ -547,6 +616,27 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr)
 		ret = 0;
 #endif
 		break;
+	case BIO_CTRL_DGRAM_GET_FALLBACK_MTU:
+		switch (data->peer.sa.sa_family)
+			{
+			case AF_INET:
+				ret = 576 - 20 - 8;
+				break;
+#if OPENSSL_USE_IPV6
+			case AF_INET6:
+#ifdef IN6_IS_ADDR_V4MAPPED
+				if (IN6_IS_ADDR_V4MAPPED(&data->peer.sa_in6.sin6_addr))
+					ret = 576 - 20 - 8;
+				else
+#endif
+					ret = 1280 - 40 - 8;
+				break;
+#endif
+			default:
+				ret = 576 - 20 - 8;
+				break;
+			}
+		break;
 	case BIO_CTRL_DGRAM_GET_MTU:
 		return data->mtu;
 		break;
@@ -738,6 +828,912 @@ static int dgram_puts(BIO *bp, const char *str)
 	return(ret);
 	}
 
+#ifndef OPENSSL_NO_SCTP
+BIO_METHOD *BIO_s_datagram_sctp(void)
+	{
+	return(&methods_dgramp_sctp);
+	}
+
+BIO *BIO_new_dgram_sctp(int fd, int close_flag)
+	{
+	BIO *bio;
+	int ret, optval = 20000;
+	int auth_data = 0, auth_forward = 0;
+	unsigned char *p;
+	struct sctp_authchunk auth;
+	struct sctp_authchunks *authchunks;
+	socklen_t sockopt_len;
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+	struct sctp_event event;
+#else
+	struct sctp_event_subscribe event;
+#endif
+#endif
+
+	bio=BIO_new(BIO_s_datagram_sctp());
+	if (bio == NULL) return(NULL);
+	BIO_set_fd(bio,fd,close_flag);
+
+	/* Activate SCTP-AUTH for DATA and FORWARD-TSN chunks */
+	auth.sauth_chunk = OPENSSL_SCTP_DATA_CHUNK_TYPE;
+	ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+	OPENSSL_assert(ret >= 0);
+	auth.sauth_chunk = OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE;
+	ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk));
+	OPENSSL_assert(ret >= 0);
+
+	/* Test if activation was successful. When using accept(),
+	 * SCTP-AUTH has to be activated for the listening socket
+	 * already, otherwise the connected socket won't use it. */
+	sockopt_len = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+	authchunks = OPENSSL_malloc(sockopt_len);
+	memset(authchunks, 0, sizeof(sockopt_len));
+	ret = getsockopt(fd, IPPROTO_SCTP, SCTP_LOCAL_AUTH_CHUNKS, authchunks, &sockopt_len);
+	OPENSSL_assert(ret >= 0);
+	
+	for (p = (unsigned char*) authchunks + sizeof(sctp_assoc_t);
+	     p < (unsigned char*) authchunks + sockopt_len;
+	     p += sizeof(uint8_t))
+		{
+		if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+		if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+		}
+		
+	OPENSSL_free(authchunks);
+
+	OPENSSL_assert(auth_data);
+	OPENSSL_assert(auth_forward);
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+#ifdef SCTP_EVENT
+	memset(&event, 0, sizeof(struct sctp_event));
+	event.se_assoc_id = 0;
+	event.se_type = SCTP_AUTHENTICATION_EVENT;
+	event.se_on = 1;
+	ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+	OPENSSL_assert(ret >= 0);
+#else
+	sockopt_len = (socklen_t) sizeof(struct sctp_event_subscribe);
+	ret = getsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, &sockopt_len);
+	OPENSSL_assert(ret >= 0);
+
+	event.sctp_authentication_event = 1;
+
+	ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+	OPENSSL_assert(ret >= 0);
+#endif
+#endif
+
+	/* Disable partial delivery by setting the min size
+	 * larger than the max record size of 2^14 + 2048 + 13
+	 */
+	ret = setsockopt(fd, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT, &optval, sizeof(optval));
+	OPENSSL_assert(ret >= 0);
+
+	return(bio);
+	}
+
+int BIO_dgram_is_sctp(BIO *bio)
+	{
+	return (BIO_method_type(bio) == BIO_TYPE_DGRAM_SCTP);
+	}
+
+static int dgram_sctp_new(BIO *bi)
+	{
+	bio_dgram_sctp_data *data = NULL;
+
+	bi->init=0;
+	bi->num=0;
+	data = OPENSSL_malloc(sizeof(bio_dgram_sctp_data));
+	if (data == NULL)
+		return 0;
+	memset(data, 0x00, sizeof(bio_dgram_sctp_data));
+#ifdef SCTP_PR_SCTP_NONE
+	data->prinfo.pr_policy = SCTP_PR_SCTP_NONE;
+#endif
+    bi->ptr = data;
+
+	bi->flags=0;
+	return(1);
+	}
+
+static int dgram_sctp_free(BIO *a)
+	{
+	bio_dgram_sctp_data *data;
+
+	if (a == NULL) return(0);
+	if ( ! dgram_clear(a))
+		return 0;
+
+	data = (bio_dgram_sctp_data *)a->ptr;
+	if(data != NULL) OPENSSL_free(data);
+
+	return(1);
+	}
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp)
+	{
+	unsigned int sockopt_len = 0;
+	int ret;
+	struct sctp_authkey_event* authkeyevent = &snp->sn_auth_event;
+
+	if (authkeyevent->auth_indication == SCTP_AUTH_FREE_KEY)
+		{
+		struct sctp_authkeyid authkeyid;
+
+		/* delete key */
+		authkeyid.scact_keynumber = authkeyevent->auth_keynumber;
+		sockopt_len = sizeof(struct sctp_authkeyid);
+		ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+		      &authkeyid, sockopt_len);
+		}
+	}
+#endif
+
+static int dgram_sctp_read(BIO *b, char *out, int outl)
+	{
+	int ret = 0, n = 0, i, optval;
+	socklen_t optlen;
+	bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+	union sctp_notification *snp;
+	struct msghdr msg;
+	struct iovec iov;
+	struct cmsghdr *cmsg;
+	char cmsgbuf[512];
+
+	if (out != NULL)
+		{
+		clear_socket_error();
+
+		do
+			{
+			memset(&data->rcvinfo, 0x00, sizeof(struct bio_dgram_sctp_rcvinfo));
+			iov.iov_base = out;
+			iov.iov_len = outl;
+			msg.msg_name = NULL;
+			msg.msg_namelen = 0;
+			msg.msg_iov = &iov;
+			msg.msg_iovlen = 1;
+			msg.msg_control = cmsgbuf;
+			msg.msg_controllen = 512;
+			msg.msg_flags = 0;
+			n = recvmsg(b->num, &msg, 0);
+
+			if (msg.msg_controllen > 0)
+				{
+				for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
+					{
+					if (cmsg->cmsg_level != IPPROTO_SCTP)
+						continue;
+#ifdef SCTP_RCVINFO
+					if (cmsg->cmsg_type == SCTP_RCVINFO)
+						{
+						struct sctp_rcvinfo *rcvinfo;
+
+						rcvinfo = (struct sctp_rcvinfo *)CMSG_DATA(cmsg);
+						data->rcvinfo.rcv_sid = rcvinfo->rcv_sid;
+						data->rcvinfo.rcv_ssn = rcvinfo->rcv_ssn;
+						data->rcvinfo.rcv_flags = rcvinfo->rcv_flags;
+						data->rcvinfo.rcv_ppid = rcvinfo->rcv_ppid;
+						data->rcvinfo.rcv_tsn = rcvinfo->rcv_tsn;
+						data->rcvinfo.rcv_cumtsn = rcvinfo->rcv_cumtsn;
+						data->rcvinfo.rcv_context = rcvinfo->rcv_context;
+						}
+#endif
+#ifdef SCTP_SNDRCV
+					if (cmsg->cmsg_type == SCTP_SNDRCV)
+						{
+						struct sctp_sndrcvinfo *sndrcvinfo;
+
+						sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+						data->rcvinfo.rcv_sid = sndrcvinfo->sinfo_stream;
+						data->rcvinfo.rcv_ssn = sndrcvinfo->sinfo_ssn;
+						data->rcvinfo.rcv_flags = sndrcvinfo->sinfo_flags;
+						data->rcvinfo.rcv_ppid = sndrcvinfo->sinfo_ppid;
+						data->rcvinfo.rcv_tsn = sndrcvinfo->sinfo_tsn;
+						data->rcvinfo.rcv_cumtsn = sndrcvinfo->sinfo_cumtsn;
+						data->rcvinfo.rcv_context = sndrcvinfo->sinfo_context;
+						}
+#endif
+					}
+				}
+
+			if (n <= 0)
+				{
+				if (n < 0)
+					ret = n;
+				break;
+				}
+
+			if (msg.msg_flags & MSG_NOTIFICATION)
+				{
+				snp = (union sctp_notification*) out;
+				if (snp->sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+					{
+#ifdef SCTP_EVENT
+					struct sctp_event event;
+#else
+					struct sctp_event_subscribe event;
+					socklen_t eventsize;
+#endif
+					/* If a message has been delayed until the socket
+					 * is dry, it can be sent now.
+					 */
+					if (data->saved_message.length > 0)
+						{
+						dgram_sctp_write(data->saved_message.bio, data->saved_message.data,
+						                 data->saved_message.length);
+						OPENSSL_free(data->saved_message.data);
+						data->saved_message.length = 0;
+						}
+
+					/* disable sender dry event */
+#ifdef SCTP_EVENT
+					memset(&event, 0, sizeof(struct sctp_event));
+					event.se_assoc_id = 0;
+					event.se_type = SCTP_SENDER_DRY_EVENT;
+					event.se_on = 0;
+					i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+					OPENSSL_assert(i >= 0);
+#else
+					eventsize = sizeof(struct sctp_event_subscribe);
+					i = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+					OPENSSL_assert(i >= 0);
+
+					event.sctp_sender_dry_event = 0;
+
+					i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+					OPENSSL_assert(i >= 0);
+#endif
+					}
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+				if (snp->sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+					dgram_sctp_handle_auth_free_key_event(b, snp);
+#endif
+
+				if (data->handle_notifications != NULL)
+					data->handle_notifications(b, data->notification_context, (void*) out);
+
+				memset(out, 0, outl);
+				}
+			else
+				ret += n;
+			}
+		while ((msg.msg_flags & MSG_NOTIFICATION) && (msg.msg_flags & MSG_EOR) && (ret < outl));
+
+		if (ret > 0 && !(msg.msg_flags & MSG_EOR))
+			{
+			/* Partial message read, this should never happen! */
+
+			/* The buffer was too small, this means the peer sent
+			 * a message that was larger than allowed. */
+			if (ret == outl)
+				return -1;
+
+			/* Test if socket buffer can handle max record
+			 * size (2^14 + 2048 + 13)
+			 */
+			optlen = (socklen_t) sizeof(int);
+			ret = getsockopt(b->num, SOL_SOCKET, SO_RCVBUF, &optval, &optlen);
+			OPENSSL_assert(ret >= 0);
+			OPENSSL_assert(optval >= 18445);
+
+			/* Test if SCTP doesn't partially deliver below
+			 * max record size (2^14 + 2048 + 13)
+			 */
+			optlen = (socklen_t) sizeof(int);
+			ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT,
+			                 &optval, &optlen);
+			OPENSSL_assert(ret >= 0);
+			OPENSSL_assert(optval >= 18445);
+
+			/* Partially delivered notification??? Probably a bug.... */
+			OPENSSL_assert(!(msg.msg_flags & MSG_NOTIFICATION));
+
+			/* Everything seems ok till now, so it's most likely
+			 * a message dropped by PR-SCTP.
+			 */
+			memset(out, 0, outl);
+			BIO_set_retry_read(b);
+			return -1;
+			}
+
+		BIO_clear_retry_flags(b);
+		if (ret < 0)
+			{
+			if (BIO_dgram_should_retry(ret))
+				{
+				BIO_set_retry_read(b);
+				data->_errno = get_last_socket_error();
+				}
+			}
+
+		/* Test if peer uses SCTP-AUTH before continuing */
+		if (!data->peer_auth_tested)
+			{
+			int ii, auth_data = 0, auth_forward = 0;
+			unsigned char *p;
+			struct sctp_authchunks *authchunks;
+
+			optlen = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t));
+			authchunks = OPENSSL_malloc(optlen);
+			memset(authchunks, 0, sizeof(optlen));
+			ii = getsockopt(b->num, IPPROTO_SCTP, SCTP_PEER_AUTH_CHUNKS, authchunks, &optlen);
+			OPENSSL_assert(ii >= 0);
+
+			for (p = (unsigned char*) authchunks + sizeof(sctp_assoc_t);
+				 p < (unsigned char*) authchunks + optlen;
+				 p += sizeof(uint8_t))
+				{
+				if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1;
+				if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1;
+				}
+
+			OPENSSL_free(authchunks);
+
+			if (!auth_data || !auth_forward)
+				{
+				BIOerr(BIO_F_DGRAM_SCTP_READ,BIO_R_CONNECT_ERROR);
+				return -1;
+				}
+
+			data->peer_auth_tested = 1;
+			}
+		}
+	return(ret);
+	}
+
+static int dgram_sctp_write(BIO *b, const char *in, int inl)
+	{
+	int ret;
+	bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+	struct bio_dgram_sctp_sndinfo *sinfo = &(data->sndinfo);
+	struct bio_dgram_sctp_prinfo *pinfo = &(data->prinfo);
+	struct bio_dgram_sctp_sndinfo handshake_sinfo;
+	struct iovec iov[1];
+	struct msghdr msg;
+	struct cmsghdr *cmsg;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+	char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo)) + CMSG_SPACE(sizeof(struct sctp_prinfo))];
+	struct sctp_sndinfo *sndinfo;
+	struct sctp_prinfo *prinfo;
+#else
+	char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
+	struct sctp_sndrcvinfo *sndrcvinfo;
+#endif
+
+	clear_socket_error();
+
+	/* If we're send anything else than application data,
+	 * disable all user parameters and flags.
+	 */
+	if (in[0] != 23) {
+		memset(&handshake_sinfo, 0x00, sizeof(struct bio_dgram_sctp_sndinfo));
+#ifdef SCTP_SACK_IMMEDIATELY
+		handshake_sinfo.snd_flags = SCTP_SACK_IMMEDIATELY;
+#endif
+		sinfo = &handshake_sinfo;
+	}
+
+	/* If we have to send a shutdown alert message and the
+	 * socket is not dry yet, we have to save it and send it
+	 * as soon as the socket gets dry.
+	 */
+	if (data->save_shutdown && !BIO_dgram_sctp_wait_for_dry(b))
+	{
+		data->saved_message.bio = b;
+		data->saved_message.length = inl;
+		data->saved_message.data = OPENSSL_malloc(inl);
+		memcpy(data->saved_message.data, in, inl);
+		return inl;
+	}
+
+	iov[0].iov_base = (char *)in;
+	iov[0].iov_len = inl;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = (caddr_t)cmsgbuf;
+	msg.msg_controllen = 0;
+	msg.msg_flags = 0;
+#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO)
+	cmsg = (struct cmsghdr *)cmsgbuf;
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDINFO;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndinfo));
+	sndinfo = (struct sctp_sndinfo *)CMSG_DATA(cmsg);
+	memset(sndinfo, 0, sizeof(struct sctp_sndinfo));
+	sndinfo->snd_sid = sinfo->snd_sid;
+	sndinfo->snd_flags = sinfo->snd_flags;
+	sndinfo->snd_ppid = sinfo->snd_ppid;
+	sndinfo->snd_context = sinfo->snd_context;
+	msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndinfo));
+
+	cmsg = (struct cmsghdr *)&cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo))];
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_PRINFO;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_prinfo));
+	prinfo = (struct sctp_prinfo *)CMSG_DATA(cmsg);
+	memset(prinfo, 0, sizeof(struct sctp_prinfo));
+	prinfo->pr_policy = pinfo->pr_policy;
+	prinfo->pr_value = pinfo->pr_value;
+	msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_prinfo));
+#else
+	cmsg = (struct cmsghdr *)cmsgbuf;
+	cmsg->cmsg_level = IPPROTO_SCTP;
+	cmsg->cmsg_type = SCTP_SNDRCV;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo));
+	sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
+	memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo));
+	sndrcvinfo->sinfo_stream = sinfo->snd_sid;
+	sndrcvinfo->sinfo_flags = sinfo->snd_flags;
+#ifdef __FreeBSD__
+	sndrcvinfo->sinfo_flags |= pinfo->pr_policy;
+#endif
+	sndrcvinfo->sinfo_ppid = sinfo->snd_ppid;
+	sndrcvinfo->sinfo_context = sinfo->snd_context;
+	sndrcvinfo->sinfo_timetolive = pinfo->pr_value;
+	msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo));
+#endif
+
+	ret = sendmsg(b->num, &msg, 0);
+
+	BIO_clear_retry_flags(b);
+	if (ret <= 0)
+		{
+		if (BIO_dgram_should_retry(ret))
+			{
+			BIO_set_retry_write(b);  
+			data->_errno = get_last_socket_error();
+			}
+		}
+	return(ret);
+	}
+
+static long dgram_sctp_ctrl(BIO *b, int cmd, long num, void *ptr)
+	{
+	long ret=1;
+	bio_dgram_sctp_data *data = NULL;
+	unsigned int sockopt_len = 0;
+	struct sctp_authkeyid authkeyid;
+	struct sctp_authkey *authkey;
+
+	data = (bio_dgram_sctp_data *)b->ptr;
+
+	switch (cmd)
+		{
+	case BIO_CTRL_DGRAM_QUERY_MTU:
+		/* Set to maximum (2^14)
+		 * and ignore user input to enable transport
+		 * protocol fragmentation.
+		 * Returns always 2^14.
+		 */
+		data->mtu = 16384;
+		ret = data->mtu;
+		break;
+	case BIO_CTRL_DGRAM_SET_MTU:
+		/* Set to maximum (2^14)
+		 * and ignore input to enable transport
+		 * protocol fragmentation.
+		 * Returns always 2^14.
+		 */
+		data->mtu = 16384;
+		ret = data->mtu;
+		break;
+	case BIO_CTRL_DGRAM_SET_CONNECTED:
+	case BIO_CTRL_DGRAM_CONNECT:
+		/* Returns always -1. */
+		ret = -1;
+		break;
+	case BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT:
+		/* SCTP doesn't need the DTLS timer
+		 * Returns always 1.
+		 */
+		break;
+	case BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE:
+		if (num > 0)
+			data->in_handshake = 1;
+		else
+			data->in_handshake = 0;
+
+		ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_NODELAY, &data->in_handshake, sizeof(int));
+		break;
+	case BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY:
+		/* New shared key for SCTP AUTH.
+		 * Returns 0 on success, -1 otherwise.
+		 */
+
+		/* Get active key */
+		sockopt_len = sizeof(struct sctp_authkeyid);
+		ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+		if (ret < 0) break;
+
+		/* Add new key */
+		sockopt_len = sizeof(struct sctp_authkey) + 64 * sizeof(uint8_t);
+		authkey = OPENSSL_malloc(sockopt_len);
+		memset(authkey, 0x00, sockopt_len);
+		authkey->sca_keynumber = authkeyid.scact_keynumber + 1;
+#ifndef __FreeBSD__
+		/* This field is missing in FreeBSD 8.2 and earlier,
+		 * and FreeBSD 8.3 and higher work without it.
+		 */
+		authkey->sca_keylength = 64;
+#endif
+		memcpy(&authkey->sca_key[0], ptr, 64 * sizeof(uint8_t));
+
+		ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_KEY, authkey, sockopt_len);
+		if (ret < 0) break;
+
+		/* Reset active key */
+		ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+		      &authkeyid, sizeof(struct sctp_authkeyid));
+		if (ret < 0) break;
+
+		break;
+	case BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY:
+		/* Returns 0 on success, -1 otherwise. */
+
+		/* Get active key */
+		sockopt_len = sizeof(struct sctp_authkeyid);
+		ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+		if (ret < 0) break;
+
+		/* Set active key */
+		authkeyid.scact_keynumber = authkeyid.scact_keynumber + 1;
+		ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY,
+		      &authkeyid, sizeof(struct sctp_authkeyid));
+		if (ret < 0) break;
+
+		/* CCS has been sent, so remember that and fall through
+		 * to check if we need to deactivate an old key
+		 */
+		data->ccs_sent = 1;
+
+	case BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD:
+		/* Returns 0 on success, -1 otherwise. */
+
+		/* Has this command really been called or is this just a fall-through? */
+		if (cmd == BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD)
+			data->ccs_rcvd = 1;
+
+		/* CSS has been both, received and sent, so deactivate an old key */
+		if (data->ccs_rcvd == 1 && data->ccs_sent == 1)
+			{
+			/* Get active key */
+			sockopt_len = sizeof(struct sctp_authkeyid);
+			ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len);
+			if (ret < 0) break;
+
+			/* Deactivate key or delete second last key if
+			 * SCTP_AUTHENTICATION_EVENT is not available.
+			 */
+			authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+#ifdef SCTP_AUTH_DEACTIVATE_KEY
+			sockopt_len = sizeof(struct sctp_authkeyid);
+			ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DEACTIVATE_KEY,
+			      &authkeyid, sockopt_len);
+			if (ret < 0) break;
+#endif
+#ifndef SCTP_AUTHENTICATION_EVENT
+			if (authkeyid.scact_keynumber > 0)
+				{
+				authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1;
+				ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY,
+					  &authkeyid, sizeof(struct sctp_authkeyid));
+				if (ret < 0) break;
+				}
+#endif
+
+			data->ccs_rcvd = 0;
+			data->ccs_sent = 0;
+			}
+		break;
+	case BIO_CTRL_DGRAM_SCTP_GET_SNDINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+			num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+		memcpy(ptr, &(data->sndinfo), num);
+		ret = num;
+		break;
+	case BIO_CTRL_DGRAM_SCTP_SET_SNDINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo))
+			num = sizeof(struct bio_dgram_sctp_sndinfo);
+
+		memcpy(&(data->sndinfo), ptr, num);
+		break;
+	case BIO_CTRL_DGRAM_SCTP_GET_RCVINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+			num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+		memcpy(ptr, &data->rcvinfo, num);
+
+		ret = num;
+		break;
+	case BIO_CTRL_DGRAM_SCTP_SET_RCVINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo))
+			num = sizeof(struct bio_dgram_sctp_rcvinfo);
+
+		memcpy(&(data->rcvinfo), ptr, num);
+		break;
+	case BIO_CTRL_DGRAM_SCTP_GET_PRINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+			num = sizeof(struct bio_dgram_sctp_prinfo);
+
+		memcpy(ptr, &(data->prinfo), num);
+		ret = num;
+		break;
+	case BIO_CTRL_DGRAM_SCTP_SET_PRINFO:
+		/* Returns the size of the copied struct. */
+		if (num > (long) sizeof(struct bio_dgram_sctp_prinfo))
+			num = sizeof(struct bio_dgram_sctp_prinfo);
+
+		memcpy(&(data->prinfo), ptr, num);
+		break;
+	case BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN:
+		/* Returns always 1. */
+		if (num > 0)
+			data->save_shutdown = 1;
+		else
+			data->save_shutdown = 0;
+		break;
+
+	default:
+		/* Pass to default ctrl function to
+		 * process SCTP unspecific commands
+		 */
+		ret=dgram_ctrl(b, cmd, num, ptr);
+		break;
+		}
+	return(ret);
+	}
+
+int BIO_dgram_sctp_notification_cb(BIO *b,
+                                   void (*handle_notifications)(BIO *bio, void *context, void *buf),
+                                   void *context)
+	{
+	bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr;
+
+	if (handle_notifications != NULL)
+		{
+		data->handle_notifications = handle_notifications;
+		data->notification_context = context;
+		}
+	else
+		return -1;
+
+	return 0;
+	}
+
+int BIO_dgram_sctp_wait_for_dry(BIO *b)
+{
+	int is_dry = 0;
+	int n, sockflags, ret;
+	union sctp_notification snp;
+	struct msghdr msg;
+	struct iovec iov;
+#ifdef SCTP_EVENT
+	struct sctp_event event;
+#else
+	struct sctp_event_subscribe event;
+	socklen_t eventsize;
+#endif
+	bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+	/* set sender dry event */
+#ifdef SCTP_EVENT
+	memset(&event, 0, sizeof(struct sctp_event));
+	event.se_assoc_id = 0;
+	event.se_type = SCTP_SENDER_DRY_EVENT;
+	event.se_on = 1;
+	ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+	eventsize = sizeof(struct sctp_event_subscribe);
+	ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+	if (ret < 0)
+		return -1;
+	
+	event.sctp_sender_dry_event = 1;
+	
+	ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+	if (ret < 0)
+		return -1;
+
+	/* peek for notification */
+	memset(&snp, 0x00, sizeof(union sctp_notification));
+	iov.iov_base = (char *)&snp;
+	iov.iov_len = sizeof(union sctp_notification);
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = 0;
+
+	n = recvmsg(b->num, &msg, MSG_PEEK);
+	if (n <= 0)
+		{
+		if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+			return -1;
+		else
+			return 0;
+		}
+
+	/* if we find a notification, process it and try again if necessary */
+	while (msg.msg_flags & MSG_NOTIFICATION)
+		{
+		memset(&snp, 0x00, sizeof(union sctp_notification));
+		iov.iov_base = (char *)&snp;
+		iov.iov_len = sizeof(union sctp_notification);
+		msg.msg_name = NULL;
+		msg.msg_namelen = 0;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = NULL;
+		msg.msg_controllen = 0;
+		msg.msg_flags = 0;
+
+		n = recvmsg(b->num, &msg, 0);
+		if (n <= 0)
+			{
+			if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+				return -1;
+			else
+				return is_dry;
+			}
+		
+		if (snp.sn_header.sn_type == SCTP_SENDER_DRY_EVENT)
+			{
+			is_dry = 1;
+
+			/* disable sender dry event */
+#ifdef SCTP_EVENT
+			memset(&event, 0, sizeof(struct sctp_event));
+			event.se_assoc_id = 0;
+			event.se_type = SCTP_SENDER_DRY_EVENT;
+			event.se_on = 0;
+			ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event));
+#else
+			eventsize = (socklen_t) sizeof(struct sctp_event_subscribe);
+			ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize);
+			if (ret < 0)
+				return -1;
+
+			event.sctp_sender_dry_event = 0;
+
+			ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe));
+#endif
+			if (ret < 0)
+				return -1;
+			}
+
+#ifdef SCTP_AUTHENTICATION_EVENT
+		if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+			dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+		if (data->handle_notifications != NULL)
+			data->handle_notifications(b, data->notification_context, (void*) &snp);
+
+		/* found notification, peek again */
+		memset(&snp, 0x00, sizeof(union sctp_notification));
+		iov.iov_base = (char *)&snp;
+		iov.iov_len = sizeof(union sctp_notification);
+		msg.msg_name = NULL;
+		msg.msg_namelen = 0;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = NULL;
+		msg.msg_controllen = 0;
+		msg.msg_flags = 0;
+
+		/* if we have seen the dry already, don't wait */
+		if (is_dry)
+			{
+			sockflags = fcntl(b->num, F_GETFL, 0);
+			fcntl(b->num, F_SETFL, O_NONBLOCK);
+			}
+
+		n = recvmsg(b->num, &msg, MSG_PEEK);
+
+		if (is_dry)
+			{
+			fcntl(b->num, F_SETFL, sockflags);
+			}
+
+		if (n <= 0)
+			{
+			if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK))
+				return -1;
+			else
+				return is_dry;
+			}
+		}
+
+	/* read anything else */
+	return is_dry;
+}
+
+int BIO_dgram_sctp_msg_waiting(BIO *b)
+	{
+	int n, sockflags;
+	union sctp_notification snp;
+	struct msghdr msg;
+	struct iovec iov;
+	bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr;
+
+	/* Check if there are any messages waiting to be read */
+	do
+		{
+		memset(&snp, 0x00, sizeof(union sctp_notification));
+		iov.iov_base = (char *)&snp;
+		iov.iov_len = sizeof(union sctp_notification);
+		msg.msg_name = NULL;
+		msg.msg_namelen = 0;
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+		msg.msg_control = NULL;
+		msg.msg_controllen = 0;
+		msg.msg_flags = 0;
+
+		sockflags = fcntl(b->num, F_GETFL, 0);
+		fcntl(b->num, F_SETFL, O_NONBLOCK);
+		n = recvmsg(b->num, &msg, MSG_PEEK);
+		fcntl(b->num, F_SETFL, sockflags);
+
+		/* if notification, process and try again */
+		if (n > 0 && (msg.msg_flags & MSG_NOTIFICATION))
+			{
+#ifdef SCTP_AUTHENTICATION_EVENT
+			if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT)
+				dgram_sctp_handle_auth_free_key_event(b, &snp);
+#endif
+
+			memset(&snp, 0x00, sizeof(union sctp_notification));
+			iov.iov_base = (char *)&snp;
+			iov.iov_len = sizeof(union sctp_notification);
+			msg.msg_name = NULL;
+			msg.msg_namelen = 0;
+			msg.msg_iov = &iov;
+			msg.msg_iovlen = 1;
+			msg.msg_control = NULL;
+			msg.msg_controllen = 0;
+			msg.msg_flags = 0;
+			n = recvmsg(b->num, &msg, 0);
+
+			if (data->handle_notifications != NULL)
+				data->handle_notifications(b, data->notification_context, (void*) &snp);
+			}
+
+		} while (n > 0 && (msg.msg_flags & MSG_NOTIFICATION));
+
+	/* Return 1 if there is a message to be read, return 0 otherwise. */
+	if (n > 0)
+		return 1;
+	else
+		return 0;
+	}
+
+static int dgram_sctp_puts(BIO *bp, const char *str)
+	{
+	int n,ret;
+
+	n=strlen(str);
+	ret=dgram_sctp_write(bp,str,n);
+	return(ret);
+	}
+#endif
+
 static int BIO_dgram_should_retry(int i)
 	{
 	int err;
diff --git a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl
new file mode 100644
index 0000000000..c52e0b75b5
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl
@@ -0,0 +1,278 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
+# C for the time being... Except that it has two code paths: pure
+# integer code suitable for any ARMv4 and later CPU and NEON code
+# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs
+# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50%
+# faster than compiler-generated code. For ECDH and ECDSA verify (but
+# not for ECDSA sign) it means 25%-45% improvement depending on key
+# length, more for longer keys. Even though NEON 1x1 multiplication
+# runs in even less cycles, ~30, improvement is measurable only on
+# longer keys. One has to optimize code elsewhere to get NEON glow...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code	32
+
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.type	mul_1x1_neon,%function
+.align	5
+mul_1x1_neon:
+	vshl.u64	`&Dlo("q1")`,d16,#8	@ q1-q3 are slided $a
+	vmull.p8	`&Q("d0")`,d16,d17	@ a�bb
+	vshl.u64	`&Dlo("q2")`,d16,#16
+	vmull.p8	q1,`&Dlo("q1")`,d17	@ a<<8�bb
+	vshl.u64	`&Dlo("q3")`,d16,#24
+	vmull.p8	q2,`&Dlo("q2")`,d17	@ a<<16�bb
+	vshr.u64	`&Dlo("q1")`,#8
+	vmull.p8	q3,`&Dlo("q3")`,d17	@ a<<24�bb
+	vshl.u64	`&Dhi("q1")`,#24
+	veor		d0,`&Dlo("q1")`
+	vshr.u64	`&Dlo("q2")`,#16
+	veor		d0,`&Dhi("q1")`
+	vshl.u64	`&Dhi("q2")`,#16
+	veor		d0,`&Dlo("q2")`
+	vshr.u64	`&Dlo("q3")`,#24
+	veor		d0,`&Dhi("q2")`
+	vshl.u64	`&Dhi("q3")`,#8
+	veor		d0,`&Dlo("q3")`
+	veor		d0,`&Dhi("q3")`
+	bx	lr
+.size	mul_1x1_neon,.-mul_1x1_neon
+#endif
+___
+################
+# private interface to mul_1x1_ialu
+#
+$a="r1";
+$b="r0";
+
+($a0,$a1,$a2,$a12,$a4,$a14)=
+($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
+
+$mask="r12";
+
+$code.=<<___;
+.type	mul_1x1_ialu,%function
+.align	5
+mul_1x1_ialu:
+	mov	$a0,#0
+	bic	$a1,$a,#3<<30		@ a1=a&0x3fffffff
+	str	$a0,[sp,#0]		@ tab[0]=0
+	add	$a2,$a1,$a1		@ a2=a1<<1
+	str	$a1,[sp,#4]		@ tab[1]=a1
+	eor	$a12,$a1,$a2		@ a1^a2
+	str	$a2,[sp,#8]		@ tab[2]=a2
+	mov	$a4,$a1,lsl#2		@ a4=a1<<2
+	str	$a12,[sp,#12]		@ tab[3]=a1^a2
+	eor	$a14,$a1,$a4		@ a1^a4
+	str	$a4,[sp,#16]		@ tab[4]=a4
+	eor	$a0,$a2,$a4		@ a2^a4
+	str	$a14,[sp,#20]		@ tab[5]=a1^a4
+	eor	$a12,$a12,$a4		@ a1^a2^a4
+	str	$a0,[sp,#24]		@ tab[6]=a2^a4
+	and	$i0,$mask,$b,lsl#2
+	str	$a12,[sp,#28]		@ tab[7]=a1^a2^a4
+
+	and	$i1,$mask,$b,lsr#1
+	ldr	$lo,[sp,$i0]		@ tab[b       & 0x7]
+	and	$i0,$mask,$b,lsr#4
+	ldr	$t1,[sp,$i1]		@ tab[b >>  3 & 0x7]
+	and	$i1,$mask,$b,lsr#7
+	ldr	$t0,[sp,$i0]		@ tab[b >>  6 & 0x7]
+	eor	$lo,$lo,$t1,lsl#3	@ stall
+	mov	$hi,$t1,lsr#29
+	ldr	$t1,[sp,$i1]		@ tab[b >>  9 & 0x7]
+
+	and	$i0,$mask,$b,lsr#10
+	eor	$lo,$lo,$t0,lsl#6
+	eor	$hi,$hi,$t0,lsr#26
+	ldr	$t0,[sp,$i0]		@ tab[b >> 12 & 0x7]
+
+	and	$i1,$mask,$b,lsr#13
+	eor	$lo,$lo,$t1,lsl#9
+	eor	$hi,$hi,$t1,lsr#23
+	ldr	$t1,[sp,$i1]		@ tab[b >> 15 & 0x7]
+
+	and	$i0,$mask,$b,lsr#16
+	eor	$lo,$lo,$t0,lsl#12
+	eor	$hi,$hi,$t0,lsr#20
+	ldr	$t0,[sp,$i0]		@ tab[b >> 18 & 0x7]
+
+	and	$i1,$mask,$b,lsr#19
+	eor	$lo,$lo,$t1,lsl#15
+	eor	$hi,$hi,$t1,lsr#17
+	ldr	$t1,[sp,$i1]		@ tab[b >> 21 & 0x7]
+
+	and	$i0,$mask,$b,lsr#22
+	eor	$lo,$lo,$t0,lsl#18
+	eor	$hi,$hi,$t0,lsr#14
+	ldr	$t0,[sp,$i0]		@ tab[b >> 24 & 0x7]
+
+	and	$i1,$mask,$b,lsr#25
+	eor	$lo,$lo,$t1,lsl#21
+	eor	$hi,$hi,$t1,lsr#11
+	ldr	$t1,[sp,$i1]		@ tab[b >> 27 & 0x7]
+
+	tst	$a,#1<<30
+	and	$i0,$mask,$b,lsr#28
+	eor	$lo,$lo,$t0,lsl#24
+	eor	$hi,$hi,$t0,lsr#8
+	ldr	$t0,[sp,$i0]		@ tab[b >> 30      ]
+
+	eorne	$lo,$lo,$b,lsl#30
+	eorne	$hi,$hi,$b,lsr#2
+	tst	$a,#1<<31
+	eor	$lo,$lo,$t1,lsl#27
+	eor	$hi,$hi,$t1,lsr#5
+	eorne	$lo,$lo,$b,lsl#31
+	eorne	$hi,$hi,$b,lsr#1
+	eor	$lo,$lo,$t0,lsl#30
+	eor	$hi,$hi,$t0,lsr#2
+
+	mov	pc,lr
+.size	mul_1x1_ialu,.-mul_1x1_ialu
+___
+################
+# void	bn_GF2m_mul_2x2(BN_ULONG *r,
+#	BN_ULONG a1,BN_ULONG a0,
+#	BN_ULONG b1,BN_ULONG b0);	# r[3..0]=a1a0�b1b0
+
+($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
+
+$code.=<<___;
+.global	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,%function
+.align	5
+bn_GF2m_mul_2x2:
+#if __ARM_ARCH__>=7
+	ldr	r12,.LOPENSSL_armcap
+.Lpic:	ldr	r12,[pc,r12]
+	tst	r12,#1
+	beq	.Lialu
+
+	veor	$A1,$A1
+	vmov.32	$B1,r3,r3		@ two copies of b1
+	vmov.32	${A1}[0],r1		@ a1
+
+	veor	$A0,$A0
+	vld1.32	${B0}[],[sp,:32]	@ two copies of b0
+	vmov.32	${A0}[0],r2		@ a0
+	mov	r12,lr
+
+	vmov	d16,$A1
+	vmov	d17,$B1
+	bl	mul_1x1_neon		@ a1�b1
+	vmov	$A1B1,d0
+
+	vmov	d16,$A0
+	vmov	d17,$B0
+	bl	mul_1x1_neon		@ a0�b0
+	vmov	$A0B0,d0
+
+	veor	d16,$A0,$A1
+	veor	d17,$B0,$B1
+	veor	$A0,$A0B0,$A1B1
+	bl	mul_1x1_neon		@ (a0+a1)�(b0+b1)
+
+	veor	d0,$A0			@ (a0+a1)�(b0+b1)-a0�b0-a1�b1
+	vshl.u64 d1,d0,#32
+	vshr.u64 d0,d0,#32
+	veor	$A0B0,d1
+	veor	$A1B1,d0
+	vst1.32	{${A0B0}[0]},[r0,:32]!
+	vst1.32	{${A0B0}[1]},[r0,:32]!
+	vst1.32	{${A1B1}[0]},[r0,:32]!
+	vst1.32	{${A1B1}[1]},[r0,:32]
+	bx	r12
+.align	4
+.Lialu:
+#endif
+___
+$ret="r10";	# reassigned 1st argument
+$code.=<<___;
+	stmdb	sp!,{r4-r10,lr}
+	mov	$ret,r0			@ reassign 1st argument
+	mov	$b,r3			@ $b=b1
+	ldr	r3,[sp,#32]		@ load b0
+	mov	$mask,#7<<2
+	sub	sp,sp,#32		@ allocate tab[8]
+
+	bl	mul_1x1_ialu		@ a1�b1
+	str	$lo,[$ret,#8]
+	str	$hi,[$ret,#12]
+
+	eor	$b,$b,r3		@ flip b0 and b1
+	 eor	$a,$a,r2		@ flip a0 and a1
+	eor	r3,r3,$b
+	 eor	r2,r2,$a
+	eor	$b,$b,r3
+	 eor	$a,$a,r2
+	bl	mul_1x1_ialu		@ a0�b0
+	str	$lo,[$ret]
+	str	$hi,[$ret,#4]
+
+	eor	$a,$a,r2
+	eor	$b,$b,r3
+	bl	mul_1x1_ialu		@ (a1+a0)�(b1+b0)
+___
+@r=map("r$_",(6..9));
+$code.=<<___;
+	ldmia	$ret,{@r[0]-@r[3]}
+	eor	$lo,$lo,$hi
+	eor	$hi,$hi,@r[1]
+	eor	$lo,$lo,@r[0]
+	eor	$hi,$hi,@r[2]
+	eor	$lo,$lo,@r[3]
+	eor	$hi,$hi,@r[3]
+	str	$hi,[$ret,#8]
+	eor	$lo,$lo,$hi
+	add	sp,sp,#32		@ destroy tab[8]
+	str	$lo,[$ret,#4]
+
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r10,pc}
+#else
+	ldmia	sp!,{r4-r10,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+#if __ARM_ARCH__>=7
+.align	5
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-(.Lpic+8)
+#endif
+.asciz	"GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align	5
+
+.comm	OPENSSL_armcap_P,4,4
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
+print $code;
+close STDOUT;   # enforce flush
diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl
index 14e0d2d1dd..f78a8b5f0f 100644
--- a/src/lib/libcrypto/bn/asm/armv4-mont.pl
+++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl
@@ -23,6 +23,9 @@
 # than 1/2KB. Windows CE port would be trivial, as it's exclusively
 # about decorations, ABI and instruction syntax are identical.
 
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
 $num="r0";	# starts as num argument, but holds &tp[num-1]
 $ap="r1";
 $bp="r2"; $bi="r2"; $rp="r2";
@@ -89,9 +92,9 @@ bn_mul_mont:
 .L1st:
 	ldr	$aj,[$ap],#4		@ ap[j],ap++
 	mov	$alo,$ahi
+	ldr	$nj,[$np],#4		@ np[j],np++
 	mov	$ahi,#0
 	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
-	ldr	$nj,[$np],#4		@ np[j],np++
 	mov	$nhi,#0
 	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
 	adds	$nlo,$nlo,$alo
@@ -101,21 +104,21 @@ bn_mul_mont:
 	bne	.L1st
 
 	adds	$nlo,$nlo,$ahi
+	ldr	$tp,[$_bp]		@ restore bp
 	mov	$nhi,#0
+	ldr	$n0,[$_n0]		@ restore n0
 	adc	$nhi,$nhi,#0
-	ldr	$tp,[$_bp]		@ restore bp
 	str	$nlo,[$num]		@ tp[num-1]=
-	ldr	$n0,[$_n0]		@ restore n0
 	str	$nhi,[$num,#4]		@ tp[num]=
 
 .Louter:
 	sub	$tj,$num,sp		@ "original" $num-1 value
 	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
-	sub	$np,$np,$tj		@ "rewind" np to &np[1]
 	ldr	$bi,[$tp,#4]!		@ *(++bp)
+	sub	$np,$np,$tj		@ "rewind" np to &np[1]
 	ldr	$aj,[$ap,#-4]		@ ap[0]
-	ldr	$nj,[$np,#-4]		@ np[0]
 	ldr	$alo,[sp]		@ tp[0]
+	ldr	$nj,[$np,#-4]		@ np[0]
 	ldr	$tj,[sp,#4]		@ tp[1]
 
 	mov	$ahi,#0
@@ -129,13 +132,13 @@ bn_mul_mont:
 .Linner:
 	ldr	$aj,[$ap],#4		@ ap[j],ap++
 	adds	$alo,$ahi,$tj		@ +=tp[j]
+	ldr	$nj,[$np],#4		@ np[j],np++
 	mov	$ahi,#0
 	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
-	ldr	$nj,[$np],#4		@ np[j],np++
 	mov	$nhi,#0
 	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
-	ldr	$tj,[$tp,#8]		@ tp[j+1]
 	adc	$ahi,$ahi,#0
+	ldr	$tj,[$tp,#8]		@ tp[j+1]
 	adds	$nlo,$nlo,$alo
 	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
 	adc	$nlo,$nhi,#0
@@ -144,13 +147,13 @@ bn_mul_mont:
 
 	adds	$nlo,$nlo,$ahi
 	mov	$nhi,#0
+	ldr	$tp,[$_bp]		@ restore bp
 	adc	$nhi,$nhi,#0
+	ldr	$n0,[$_n0]		@ restore n0
 	adds	$nlo,$nlo,$tj
-	adc	$nhi,$nhi,#0
-	ldr	$tp,[$_bp]		@ restore bp
 	ldr	$tj,[$_bpend]		@ restore &bp[num]
+	adc	$nhi,$nhi,#0
 	str	$nlo,[$num]		@ tp[num-1]=
-	ldr	$n0,[$_n0]		@ restore n0
 	str	$nhi,[$num,#4]		@ tp[num]=
 
 	cmp	$tp,$tj
diff --git a/src/lib/libcrypto/bn/asm/ia64-mont.pl b/src/lib/libcrypto/bn/asm/ia64-mont.pl
new file mode 100644
index 0000000000..e258658428
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/ia64-mont.pl
@@ -0,0 +1,851 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2010
+#
+# "Teaser" Montgomery multiplication module for IA-64. There are
+# several possibilities for improvement:
+#
+# - modulo-scheduling outer loop would eliminate quite a number of
+#   stalls after ldf8, xma and getf.sig outside inner loop and
+#   improve shorter key performance;
+# - shorter vector support [with input vectors being fetched only
+#   once] should be added;
+# - 2x unroll with help of n0[1] would make the code scalable on
+#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
+#   acute interest, because upcoming Tukwila's individual cores are
+#   reportedly based on Itanium 2 design;
+# - dedicated squaring procedure(?);
+#
+# January 2010
+#
+# Shorter vector support is implemented by zero-padding ap and np
+# vectors up to 8 elements, or 512 bits. This means that 256-bit
+# inputs will be processed only 2 times faster than 512-bit inputs,
+# not 4 [as one would expect, because algorithm complexity is n^2].
+# The reason for padding is that inputs shorter than 512 bits won't
+# be processed faster anyway, because minimal critical path of the
+# core loop happens to match 512-bit timing. Either way, it resulted
+# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
+# 1024-bit one [in comparison to original version of *this* module].
+#
+# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
+# this module is:
+#                   sign    verify    sign/s verify/s
+# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
+# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
+# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
+# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
+# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
+# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
+# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
+#
+# ... and *without* (but still with ia64.S):
+#
+# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
+# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
+# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
+# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
+# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
+# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
+# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
+#
+# As it can be seen, RSA sign performance improves by 130-30%,
+# hereafter less for longer keys, while verify - by 74-13%.
+# DSA performance improves by 115-30%.
+
+if ($^O eq "hpux") {
+    $ADDP="addp4";
+    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+
+$code=<<___;
+.explicit
+.text
+
+// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
+//		    const BN_ULONG *bp,const BN_ULONG *np,
+//		    const BN_ULONG *n0p,int num);			
+.align	64
+.global	bn_mul_mont#
+.proc	bn_mul_mont#
+bn_mul_mont:
+	.prologue
+	.body
+{ .mmi;	cmp4.le		p6,p7=2,r37;;
+(p6)	cmp4.lt.unc	p8,p9=8,r37
+	mov		ret0=r0		};;
+{ .bbb;
+(p9)	br.cond.dptk.many	bn_mul_mont_8
+(p8)	br.cond.dpnt.many	bn_mul_mont_general
+(p7)	br.ret.spnt.many	b0	};;
+.endp	bn_mul_mont#
+
+prevfs=r2;	prevpr=r3;	prevlc=r10;	prevsp=r11;
+
+rptr=r8;	aptr=r9;	bptr=r14;	nptr=r15;
+tptr=r16;	// &tp[0]
+tp_1=r17;	// &tp[-1]
+num=r18;	len=r19;	lc=r20;
+topbit=r21;	// carry bit from tmp[num]
+
+n0=f6;
+m0=f7;
+bi=f8;
+
+.align	64
+.local	bn_mul_mont_general#
+.proc	bn_mul_mont_general#
+bn_mul_mont_general:
+	.prologue
+{ .mmi;	.save	ar.pfs,prevfs
+	alloc	prevfs=ar.pfs,6,2,0,8
+	$ADDP	aptr=0,in1
+	.save	ar.lc,prevlc
+	mov	prevlc=ar.lc		}
+{ .mmi;	.vframe	prevsp
+	mov	prevsp=sp
+	$ADDP	bptr=0,in2
+	.save	pr,prevpr
+	mov	prevpr=pr		};;
+
+	.body
+	.rotf		alo[6],nlo[4],ahi[8],nhi[6]
+	.rotr		a[3],n[3],t[2]
+
+{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
+	ldf8		alo[4]=[aptr],16	// ap[0]
+	$ADDP		r30=8,in1	};;
+{ .mmi;	ldf8		alo[3]=[r30],16		// ap[1]
+	ldf8		alo[2]=[aptr],16	// ap[2]
+	$ADDP		in4=0,in4	};;
+{ .mmi;	ldf8		alo[1]=[r30]		// ap[3]
+	ldf8		n0=[in4]		// n0
+	$ADDP		rptr=0,in0		}
+{ .mmi;	$ADDP		nptr=0,in3
+	mov		r31=16
+	zxt4		num=in5		};;
+{ .mmi;	ldf8		nlo[2]=[nptr],8		// np[0]
+	shladd		len=num,3,r0
+	shladd		r31=num,3,r31	};;
+{ .mmi;	ldf8		nlo[1]=[nptr],8		// np[1]
+	add		lc=-5,num
+	sub		r31=sp,r31	};;
+{ .mfb;	and		sp=-16,r31		// alloca
+	xmpy.hu		ahi[2]=alo[4],bi	// ap[0]*bp[0]
+	nop.b		0		}
+{ .mfb;	nop.m		0
+	xmpy.lu		alo[4]=alo[4],bi
+	brp.loop.imp	.L1st_ctop,.L1st_cend-16
+					};;
+{ .mfi;	nop.m		0
+	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[0]
+	add		tp_1=8,sp	}
+{ .mfi;	nop.m		0
+	xma.lu		alo[3]=alo[3],bi,ahi[2]
+	mov		pr.rot=0x20001f<<16
+			// ------^----- (p40) at first (p23)
+			// ----------^^ p[16:20]=1
+					};;
+{ .mfi;	nop.m		0
+	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[0])*n0
+	mov		ar.lc=lc	}
+{ .mfi;	nop.m		0
+	fcvt.fxu.s1	nhi[1]=f0
+	mov		ar.ec=8		};;
+
+.align	32
+.L1st_ctop:
+.pred.rel	"mutex",p40,p42
+{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
+	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
+	(p40)	add		n[2]=n[2],a[2]		}   // (p23)					}
+{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)(p16)
+	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
+	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
+{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
+	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
+	(p42)	cmp.leu		p41,p39=n[2],a[2]   	}   // (p23)
+{ .mfi;	(p23)	st8		[tp_1]=n[2],8
+	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
+	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
+{ .mmb;	(p21)	getf.sig	n[0]=nlo[3]
+	(p16)	nop.m		0
+	br.ctop.sptk	.L1st_ctop			};;
+.L1st_cend:
+
+{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
+	getf.sig	n[0]=nhi[4]
+	add		num=-1,num	};;	// num--
+{ .mmi;	.pred.rel	"mutex",p40,p42
+(p40)	add		n[0]=n[0],a[0]
+(p42)	add		n[0]=n[0],a[0],1
+	sub		aptr=aptr,len	};;	// rewind
+{ .mmi;	.pred.rel	"mutex",p40,p42
+(p40)	cmp.ltu		p41,p39=n[0],a[0]
+(p42)	cmp.leu		p41,p39=n[0],a[0]
+	sub		nptr=nptr,len	};;
+{ .mmi;	.pred.rel	"mutex",p39,p41
+(p39)	add		topbit=r0,r0
+(p41)	add		topbit=r0,r0,1
+	nop.i		0		}	
+{ .mmi;	st8		[tp_1]=n[0]
+	add		tptr=16,sp
+	add		tp_1=8,sp	};;
+
+.Louter:
+{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
+	ldf8		ahi[3]=[tptr]		// tp[0]
+	add		r30=8,aptr	};;
+{ .mmi;	ldf8		alo[4]=[aptr],16	// ap[0]
+	ldf8		alo[3]=[r30],16		// ap[1]
+	add		r31=8,nptr	};;
+{ .mfb;	ldf8		alo[2]=[aptr],16	// ap[2]
+	xma.hu		ahi[2]=alo[4],bi,ahi[3]	// ap[0]*bp[i]+tp[0]
+	brp.loop.imp	.Linner_ctop,.Linner_cend-16
+					}
+{ .mfb;	ldf8		alo[1]=[r30]		// ap[3]
+	xma.lu		alo[4]=alo[4],bi,ahi[3]
+	clrrrb.pr			};;
+{ .mfi;	ldf8		nlo[2]=[nptr],16	// np[0]
+	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[i]
+	nop.i		0		}
+{ .mfi;	ldf8		nlo[1]=[r31]		// np[1]
+	xma.lu		alo[3]=alo[3],bi,ahi[2]
+	mov		pr.rot=0x20101f<<16
+			// ------^----- (p40) at first (p23)
+			// --------^--- (p30) at first (p22)
+			// ----------^^ p[16:20]=1
+					};;
+{ .mfi;	st8		[tptr]=r0		// tp[0] is already accounted
+	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[i]+tp[0])*n0
+	mov		ar.lc=lc	}
+{ .mfi;
+	fcvt.fxu.s1	nhi[1]=f0
+	mov		ar.ec=8		};;
+
+// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
+// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
+// in latter case accounts for two-tick pipeline stall, which means
+// that its performance would be ~20% lower than optimal one. No
+// attempt was made to address this, because original Itanium is
+// hardly represented out in the wild...
+.align	32
+.Linner_ctop:
+.pred.rel	"mutex",p40,p42
+.pred.rel	"mutex",p30,p32
+{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
+	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
+	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
+{ .mfi;	(p16)	nop.m		0
+	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
+	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
+{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
+	(p16)	nop.f		0
+	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
+{ .mfi;	(p21)	ld8		t[0]=[tptr],8
+	(p16)	nop.f		0
+	(p42)	cmp.leu		p41,p39=n[2],a[2]	};; // (p23)
+{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)
+	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
+	(p30)	add		a[1]=a[1],t[1]		}   // (p22)
+{ .mfi;	(p16)	nop.m		0
+	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
+	(p32)	add		a[1]=a[1],t[1],1	};; // (p22)
+{ .mmi;	(p21)	getf.sig	n[0]=nlo[3]
+	(p16)	nop.m		0
+	(p30)	cmp.ltu		p31,p29=a[1],t[1]	}   // (p22)
+{ .mmb;	(p23)	st8		[tp_1]=n[2],8
+	(p32)	cmp.leu		p31,p29=a[1],t[1]	    // (p22)
+	br.ctop.sptk	.Linner_ctop			};;
+.Linner_cend:
+
+{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
+	getf.sig	n[0]=nhi[4]
+	nop.i		0		};;
+
+{ .mmi;	.pred.rel	"mutex",p31,p33
+(p31)	add		a[0]=a[0],topbit
+(p33)	add		a[0]=a[0],topbit,1
+	mov		topbit=r0	};;
+{ .mfi; .pred.rel	"mutex",p31,p33
+(p31)	cmp.ltu		p32,p30=a[0],topbit
+(p33)	cmp.leu		p32,p30=a[0],topbit
+					}
+{ .mfi;	.pred.rel	"mutex",p40,p42
+(p40)	add		n[0]=n[0],a[0]
+(p42)	add		n[0]=n[0],a[0],1
+					};;
+{ .mmi;	.pred.rel	"mutex",p44,p46
+(p40)	cmp.ltu		p41,p39=n[0],a[0]
+(p42)	cmp.leu		p41,p39=n[0],a[0]
+(p32)	add		topbit=r0,r0,1	}
+
+{ .mmi;	st8		[tp_1]=n[0],8
+	cmp4.ne		p6,p0=1,num
+	sub		aptr=aptr,len	};;	// rewind
+{ .mmi;	sub		nptr=nptr,len
+(p41)	add		topbit=r0,r0,1
+	add		tptr=16,sp	}
+{ .mmb;	add		tp_1=8,sp
+	add		num=-1,num		// num--
+(p6)	br.cond.sptk.many	.Louter	};;
+
+{ .mbb;	add		lc=4,lc
+	brp.loop.imp	.Lsub_ctop,.Lsub_cend-16
+	clrrrb.pr			};;
+{ .mii;	nop.m		0
+	mov		pr.rot=0x10001<<16
+			// ------^---- (p33) at first (p17)
+	mov		ar.lc=lc	}
+{ .mii;	nop.m		0
+	mov		ar.ec=3
+	nop.i		0		};;
+
+.Lsub_ctop:
+.pred.rel	"mutex",p33,p35
+{ .mfi;	(p16)	ld8		t[0]=[tptr],8		    // t=*(tp++)
+	(p16)	nop.f		0
+	(p33)	sub		n[1]=t[1],n[1]		}   // (p17)
+{ .mfi;	(p16)	ld8		n[0]=[nptr],8		    // n=*(np++)
+	(p16)	nop.f		0
+	(p35)	sub		n[1]=t[1],n[1],1	};; // (p17)
+{ .mib;	(p18)	st8		[rptr]=n[2],8		    // *(rp++)=r
+	(p33)	cmp.gtu		p34,p32=n[1],t[1]	    // (p17)
+	(p18)	nop.b		0			}
+{ .mib;	(p18)	nop.m		0
+	(p35)	cmp.geu		p34,p32=n[1],t[1]	    // (p17)
+	br.ctop.sptk	.Lsub_ctop			};;
+.Lsub_cend:
+
+{ .mmb;	.pred.rel	"mutex",p34,p36
+(p34)	sub	topbit=topbit,r0	// (p19)
+(p36)	sub	topbit=topbit,r0,1
+	brp.loop.imp	.Lcopy_ctop,.Lcopy_cend-16
+					}
+{ .mmb;	sub	rptr=rptr,len		// rewind
+	sub	tptr=tptr,len
+	clrrrb.pr			};;
+{ .mmi;	and	aptr=tptr,topbit
+	andcm	bptr=rptr,topbit
+	mov	pr.rot=1<<16		};;
+{ .mii;	or	nptr=aptr,bptr
+	mov	ar.lc=lc
+	mov	ar.ec=3			};;
+
+.Lcopy_ctop:
+{ .mmb;	(p16)	ld8	n[0]=[nptr],8
+	(p18)	st8	[tptr]=r0,8
+	(p16)	nop.b	0		}
+{ .mmb;	(p16)	nop.m	0
+	(p18)	st8	[rptr]=n[2],8
+	br.ctop.sptk	.Lcopy_ctop	};;
+.Lcopy_cend:
+
+{ .mmi;	mov		ret0=1			// signal "handled"
+	rum		1<<5			// clear um.mfh
+	mov		ar.lc=prevlc	}
+{ .mib;	.restore	sp
+	mov		sp=prevsp
+	mov		pr=prevpr,0x1ffff
+	br.ret.sptk.many	b0	};;
+.endp	bn_mul_mont_general#
+
+a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
+n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
+t0=r15;
+
+ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
+ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
+
+.align	64
+.skip	48		// aligns loop body
+.local	bn_mul_mont_8#
+.proc	bn_mul_mont_8#
+bn_mul_mont_8:
+	.prologue
+{ .mmi;	.save		ar.pfs,prevfs
+	alloc		prevfs=ar.pfs,6,2,0,8
+	.vframe		prevsp
+	mov		prevsp=sp
+	.save		ar.lc,prevlc
+	mov		prevlc=ar.lc	}
+{ .mmi;	add		r17=-6*16,sp
+	add		sp=-7*16,sp
+	.save		pr,prevpr
+	mov		prevpr=pr	};;
+
+{ .mmi;	.save.gf	0,0x10
+	stf.spill	[sp]=f16,-16
+	.save.gf	0,0x20
+	stf.spill	[r17]=f17,32
+	add		r16=-5*16,prevsp};;
+{ .mmi;	.save.gf	0,0x40
+	stf.spill	[r16]=f18,32
+	.save.gf	0,0x80
+	stf.spill	[r17]=f19,32
+	$ADDP		aptr=0,in1	};;
+{ .mmi;	.save.gf	0,0x100
+	stf.spill	[r16]=f20,32
+	.save.gf	0,0x200
+	stf.spill	[r17]=f21,32
+	$ADDP		r29=8,in1	};;
+{ .mmi;	.save.gf	0,0x400
+	stf.spill	[r16]=f22
+	.save.gf	0,0x800
+	stf.spill	[r17]=f23
+	$ADDP		rptr=0,in0	};;
+
+	.body
+	.rotf		bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
+	.rotr		t[8]
+
+// load input vectors padding them to 8 elements
+{ .mmi;	ldf8		ai0=[aptr],16		// ap[0]
+	ldf8		ai1=[r29],16		// ap[1]
+	$ADDP		bptr=0,in2	}
+{ .mmi;	$ADDP		r30=8,in2
+	$ADDP		nptr=0,in3
+	$ADDP		r31=8,in3	};;
+{ .mmi;	ldf8		bj[7]=[bptr],16		// bp[0]
+	ldf8		bj[6]=[r30],16		// bp[1]
+	cmp4.le		p4,p5=3,in5	}
+{ .mmi;	ldf8		ni0=[nptr],16		// np[0]
+	ldf8		ni1=[r31],16		// np[1]
+	cmp4.le		p6,p7=4,in5	};;
+
+{ .mfi;	(p4)ldf8	ai2=[aptr],16		// ap[2]
+	(p5)fcvt.fxu	ai2=f0
+	cmp4.le		p8,p9=5,in5	}
+{ .mfi;	(p6)ldf8	ai3=[r29],16		// ap[3]
+	(p7)fcvt.fxu	ai3=f0
+	cmp4.le		p10,p11=6,in5	}
+{ .mfi;	(p4)ldf8	bj[5]=[bptr],16		// bp[2]
+	(p5)fcvt.fxu	bj[5]=f0
+	cmp4.le		p12,p13=7,in5	}
+{ .mfi;	(p6)ldf8	bj[4]=[r30],16		// bp[3]
+	(p7)fcvt.fxu	bj[4]=f0
+	cmp4.le		p14,p15=8,in5	}
+{ .mfi;	(p4)ldf8	ni2=[nptr],16		// np[2]
+	(p5)fcvt.fxu	ni2=f0
+	addp4		r28=-1,in5	}
+{ .mfi;	(p6)ldf8	ni3=[r31],16		// np[3]
+	(p7)fcvt.fxu	ni3=f0
+	$ADDP		in4=0,in4	};;
+
+{ .mfi;	ldf8		n0=[in4]
+	fcvt.fxu	tf[1]=f0
+	nop.i		0		}
+
+{ .mfi;	(p8)ldf8	ai4=[aptr],16		// ap[4]
+	(p9)fcvt.fxu	ai4=f0
+	mov		t[0]=r0		}
+{ .mfi;	(p10)ldf8	ai5=[r29],16		// ap[5]
+	(p11)fcvt.fxu	ai5=f0
+	mov		t[1]=r0		}
+{ .mfi;	(p8)ldf8	bj[3]=[bptr],16		// bp[4]
+	(p9)fcvt.fxu	bj[3]=f0
+	mov		t[2]=r0		}
+{ .mfi;	(p10)ldf8	bj[2]=[r30],16		// bp[5]
+	(p11)fcvt.fxu	bj[2]=f0
+	mov		t[3]=r0		}
+{ .mfi;	(p8)ldf8	ni4=[nptr],16		// np[4]
+	(p9)fcvt.fxu	ni4=f0
+	mov		t[4]=r0		}
+{ .mfi;	(p10)ldf8	ni5=[r31],16		// np[5]
+	(p11)fcvt.fxu	ni5=f0
+	mov		t[5]=r0		};;
+
+{ .mfi;	(p12)ldf8	ai6=[aptr],16		// ap[6]
+	(p13)fcvt.fxu	ai6=f0
+	mov		t[6]=r0		}
+{ .mfi;	(p14)ldf8	ai7=[r29],16		// ap[7]
+	(p15)fcvt.fxu	ai7=f0
+	mov		t[7]=r0		}
+{ .mfi;	(p12)ldf8	bj[1]=[bptr],16		// bp[6]
+	(p13)fcvt.fxu	bj[1]=f0
+	mov		ar.lc=r28	}
+{ .mfi;	(p14)ldf8	bj[0]=[r30],16		// bp[7]
+	(p15)fcvt.fxu	bj[0]=f0
+	mov		ar.ec=1		}
+{ .mfi;	(p12)ldf8	ni6=[nptr],16		// np[6]
+	(p13)fcvt.fxu	ni6=f0
+	mov		pr.rot=1<<16	}
+{ .mfb;	(p14)ldf8	ni7=[r31],16		// np[7]
+	(p15)fcvt.fxu	ni7=f0
+	brp.loop.imp	.Louter_8_ctop,.Louter_8_cend-16
+					};;
+
+// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
+// to measure with help of Interval Time Counter indicated that the
+// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
+// addressing the issue is problematic, because I don't have access
+// to platform-specific instruction-level profiler. On Itanium it
+// should run in 56*n ticks, because of higher xma latency...
+.Louter_8_ctop:
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 0:
+	(p16)	xma.hu		ahi[0]=ai0,bj[7],tf[1]	//	ap[0]*b[i]+t[0]
+	(p40)	add		a3=a3,n3	}	//	(p17) a3+=n3
+{ .mfi;	(p42)	add		a3=a3,n3,1
+	(p16)	xma.lu		alo[0]=ai0,bj[7],tf[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
+	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
+	(p50)	add		t[6]=t[6],a3,1	};;
+{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
+	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
+	(p40)	cmp.ltu		p43,p41=a3,n3	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
+	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
+	(p48)	cmp.ltu		p51,p49=t[6],a3
+	(p50)	cmp.leu		p51,p49=t[6],a3	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p16)	nop.m		0			// 4:
+	(p16)	xma.hu		ahi[1]=ai1,bj[7],ahi[0]	//	ap[1]*b[i]
+	(p41)	add		a4=a4,n4	}	//	(p17) a4+=n4
+{ .mfi;	(p43)	add		a4=a4,n4,1
+	(p16)	xma.lu		alo[1]=ai1,bj[7],ahi[0]
+	(p16)	nop.i		0		};;
+{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
+	(p16)	xmpy.lu		mj[0]=alo[0],n0		//	(ap[0]*b[i]+t[0])*n0
+	(p51)	add		t[5]=t[5],a4,1	};;
+{ .mfi;	(p16)	nop.m		0			// 6:
+	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
+	(p41)	cmp.ltu		p42,p40=a4,n4	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
+	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
+	(p49)	cmp.ltu		p50,p48=t[5],a4
+	(p51)	cmp.leu		p50,p48=t[5],a4	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 8:
+	(p16)	xma.hu		ahi[2]=ai2,bj[7],ahi[1]	//	ap[2]*b[i]
+	(p40)	add		a5=a5,n5	}	//	(p17) a5+=n5
+{ .mfi;	(p42)	add		a5=a5,n5,1
+	(p16)	xma.lu		alo[2]=ai2,bj[7],ahi[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a1=alo[1]		// 9:
+	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
+	(p50)	add		t[4]=t[4],a5,1	};;
+{ .mfi;	(p16)	nop.m		0			// 10:
+	(p16)	xma.hu		nhi[0]=ni0,mj[0],alo[0]	//	np[0]*m0
+	(p40)	cmp.ltu		p43,p41=a5,n5	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a5,n5
+	(p16)	xma.lu		nlo[0]=ni0,mj[0],alo[0]
+	(p16)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
+	(p48)	cmp.ltu		p51,p49=t[4],a5
+	(p50)	cmp.leu		p51,p49=t[4],a5	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p17)	getf.sig	n8=nhi[8]		// 12:
+	(p16)	xma.hu		ahi[3]=ai3,bj[7],ahi[2]	//	ap[3]*b[i]
+	(p41)	add		a6=a6,n6	}	//	(p17) a6+=n6
+{ .mfi;	(p43)	add		a6=a6,n6,1
+	(p16)	xma.lu		alo[3]=ai3,bj[7],ahi[2]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a2=alo[2]		// 13:
+	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
+	(p51)	add		t[3]=t[3],a6,1	};;
+{ .mfi;	(p16)	nop.m		0			// 14:
+	(p16)	xma.hu		nhi[1]=ni1,mj[0],nhi[0]	//	np[1]*m0
+	(p41)	cmp.ltu		p42,p40=a6,n6	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a6,n6
+	(p16)	xma.lu		nlo[1]=ni1,mj[0],nhi[0]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	nop.m		0			// 15:
+	(p49)	cmp.ltu		p50,p48=t[3],a6
+	(p51)	cmp.leu		p50,p48=t[3],a6	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 16:
+	(p16)	xma.hu		ahi[4]=ai4,bj[7],ahi[3]	//	ap[4]*b[i]
+	(p40)	add		a7=a7,n7	}	//	(p17) a7+=n7
+{ .mfi;	(p42)	add		a7=a7,n7,1
+	(p16)	xma.lu		alo[4]=ai4,bj[7],ahi[3]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a3=alo[3]		// 17:
+	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
+	(p50)	add		t[2]=t[2],a7,1	};;
+{ .mfi;	(p16)	nop.m		0			// 18:
+	(p16)	xma.hu		nhi[2]=ni2,mj[0],nhi[1]	//	np[2]*m0
+	(p40)	cmp.ltu		p43,p41=a7,n7	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a7,n7
+	(p16)	xma.lu		nlo[2]=ni2,mj[0],nhi[1]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n1=nlo[1]		// 19:
+	(p48)	cmp.ltu		p51,p49=t[2],a7
+	(p50)	cmp.leu		p51,p49=t[2],a7	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mfi;	(p16)	nop.m		0			// 20:
+	(p16)	xma.hu		ahi[5]=ai5,bj[7],ahi[4]	//	ap[5]*b[i]
+	(p41)	add		a8=a8,n8	}	//	(p17) a8+=n8
+{ .mfi;	(p43)	add		a8=a8,n8,1
+	(p16)	xma.lu		alo[5]=ai5,bj[7],ahi[4]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a4=alo[4]		// 21:
+	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
+	(p51)	add		t[1]=t[1],a8,1	};;
+{ .mfi;	(p16)	nop.m		0			// 22:
+	(p16)	xma.hu		nhi[3]=ni3,mj[0],nhi[2]	//	np[3]*m0
+	(p41)	cmp.ltu		p42,p40=a8,n8	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a8,n8
+	(p16)	xma.lu		nlo[3]=ni3,mj[0],nhi[2]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n2=nlo[2]		// 23:
+	(p49)	cmp.ltu		p50,p48=t[1],a8
+	(p51)	cmp.leu		p50,p48=t[1],a8	};;
+{ .mfi;	(p16)	nop.m		0			// 24:
+	(p16)	xma.hu		ahi[6]=ai6,bj[7],ahi[5]	//	ap[6]*b[i]
+	(p16)	add		a1=a1,n1	}	//	(p16) a1+=n1
+{ .mfi;	(p16)	nop.m		0
+	(p16)	xma.lu		alo[6]=ai6,bj[7],ahi[5]
+	(p17)	mov		t[0]=r0		};;
+{ .mii;	(p16)	getf.sig	a5=alo[5]		// 25:
+	(p16)	add		t0=t[7],a1		//	(p16) t[7]+=a1
+	(p42)	add		t[0]=t[0],r0,1	};;
+{ .mfi;	(p16)	setf.sig	tf[0]=t0		// 26:
+	(p16)	xma.hu		nhi[4]=ni4,mj[0],nhi[3]	//	np[4]*m0
+	(p50)	add		t[0]=t[0],r0,1	}
+{ .mfi;	(p16)	cmp.ltu.unc	p42,p40=a1,n1
+	(p16)	xma.lu		nlo[4]=ni4,mj[0],nhi[3]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	n3=nlo[3]		// 27:
+	(p16)	cmp.ltu.unc	p50,p48=t0,a1
+	(p16)	nop.i		0		};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mfi;	(p16)	nop.m		0			// 28:
+	(p16)	xma.hu		ahi[7]=ai7,bj[7],ahi[6]	//	ap[7]*b[i]
+	(p40)	add		a2=a2,n2	}	//	(p16) a2+=n2
+{ .mfi;	(p42)	add		a2=a2,n2,1
+	(p16)	xma.lu		alo[7]=ai7,bj[7],ahi[6]
+	(p16)	nop.i		0		};;
+{ .mii;	(p16)	getf.sig	a6=alo[6]		// 29:
+	(p48)	add		t[6]=t[6],a2		//	(p16) t[6]+=a2
+	(p50)	add		t[6]=t[6],a2,1	};;
+{ .mfi;	(p16)	nop.m		0			// 30:
+	(p16)	xma.hu		nhi[5]=ni5,mj[0],nhi[4]	//	np[5]*m0
+	(p40)	cmp.ltu		p41,p39=a2,n2	}
+{ .mfi;	(p42)	cmp.leu		p41,p39=a2,n2
+	(p16)	xma.lu		nlo[5]=ni5,mj[0],nhi[4]
+	(p16)	nop.i		0		};;
+{ .mfi;	(p16)	getf.sig	n4=nlo[4]		// 31:
+	(p16)	nop.f		0
+	(p48)	cmp.ltu		p49,p47=t[6],a2	}
+{ .mfb;	(p50)	cmp.leu		p49,p47=t[6],a2
+	(p16)	nop.f		0
+	br.ctop.sptk.many	.Louter_8_ctop	};;
+.Louter_8_cend:
+
+// above loop has to execute one more time, without (p16), which is
+// replaced with merged move of np[8] to GPR bank
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mmi;	(p0)	getf.sig	n1=ni0			// 0:
+	(p40)	add		a3=a3,n3		//	(p17) a3+=n3
+	(p42)	add		a3=a3,n3,1	};;
+{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
+	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
+	(p50)	add		t[6]=t[6],a3,1	};;
+{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
+	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
+	(p40)	cmp.ltu		p43,p41=a3,n3	}
+{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
+	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
+	(p0)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
+	(p48)	cmp.ltu		p51,p49=t[6],a3
+	(p50)	cmp.leu		p51,p49=t[6],a3	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mmi;	(p0)	getf.sig	n2=ni1			// 4:
+	(p41)	add		a4=a4,n4		//	(p17) a4+=n4
+	(p43)	add		a4=a4,n4,1	};;
+{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
+	(p0)	nop.f		0
+	(p51)	add		t[5]=t[5],a4,1	};;
+{ .mfi;	(p0)	getf.sig	n3=ni2			// 6:
+	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
+	(p41)	cmp.ltu		p42,p40=a4,n4	}
+{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
+	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
+	(p0)	nop.i		0		};;
+{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
+	(p49)	cmp.ltu		p50,p48=t[5],a4
+	(p51)	cmp.leu		p50,p48=t[5],a4	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mii;	(p0)	getf.sig	n4=ni3			// 8:
+	(p40)	add		a5=a5,n5		//	(p17) a5+=n5
+	(p42)	add		a5=a5,n5,1	};;
+{ .mii;	(p0)	nop.m		0			// 9:
+	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
+	(p50)	add		t[4]=t[4],a5,1	};;
+{ .mii;	(p0)	nop.m		0			// 10:
+	(p40)	cmp.ltu		p43,p41=a5,n5
+	(p42)	cmp.leu		p43,p41=a5,n5	};;
+{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
+	(p48)	cmp.ltu		p51,p49=t[4],a5
+	(p50)	cmp.leu		p51,p49=t[4],a5	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mii;	(p17)	getf.sig	n8=nhi[8]		// 12:
+	(p41)	add		a6=a6,n6		//	(p17) a6+=n6
+	(p43)	add		a6=a6,n6,1	};;
+{ .mii;	(p0)	getf.sig	n5=ni4			// 13:
+	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
+	(p51)	add		t[3]=t[3],a6,1	};;
+{ .mii;	(p0)	nop.m		0			// 14:
+	(p41)	cmp.ltu		p42,p40=a6,n6
+	(p43)	cmp.leu		p42,p40=a6,n6	};;
+{ .mii;	(p0)	getf.sig	n6=ni5			// 15:
+	(p49)	cmp.ltu		p50,p48=t[3],a6
+	(p51)	cmp.leu		p50,p48=t[3],a6	};;
+	.pred.rel		"mutex",p40,p42
+	.pred.rel		"mutex",p48,p50
+{ .mii;	(p0)	nop.m		0			// 16:
+	(p40)	add		a7=a7,n7		//	(p17) a7+=n7
+	(p42)	add		a7=a7,n7,1	};;
+{ .mii;	(p0)	nop.m		0			// 17:
+	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
+	(p50)	add		t[2]=t[2],a7,1	};;
+{ .mii;	(p0)	nop.m		0			// 18:
+	(p40)	cmp.ltu		p43,p41=a7,n7
+	(p42)	cmp.leu		p43,p41=a7,n7	};;
+{ .mii;	(p0)	getf.sig	n7=ni6			// 19:
+	(p48)	cmp.ltu		p51,p49=t[2],a7
+	(p50)	cmp.leu		p51,p49=t[2],a7	};;
+	.pred.rel		"mutex",p41,p43
+	.pred.rel		"mutex",p49,p51
+{ .mii;	(p0)	nop.m		0			// 20:
+	(p41)	add		a8=a8,n8		//	(p17) a8+=n8
+	(p43)	add		a8=a8,n8,1	};;
+{ .mmi;	(p0)	nop.m		0			// 21:
+	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
+	(p51)	add		t[1]=t[1],a8,1	}
+{ .mmi;	(p17)	mov		t[0]=r0
+	(p41)	cmp.ltu		p42,p40=a8,n8
+	(p43)	cmp.leu		p42,p40=a8,n8	};;
+{ .mmi;	(p0)	getf.sig	n8=ni7			// 22:
+	(p49)	cmp.ltu		p50,p48=t[1],a8
+	(p51)	cmp.leu		p50,p48=t[1],a8	}
+{ .mmi;	(p42)	add		t[0]=t[0],r0,1
+	(p0)	add		r16=-7*16,prevsp
+	(p0)	add		r17=-6*16,prevsp	};;
+
+// subtract np[8] from carrybit|tmp[8]
+// carrybit|tmp[8] layout upon exit from above loop is:
+//	t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
+{ .mmi;	(p50)add	t[0]=t[0],r0,1
+	add		r18=-5*16,prevsp
+	sub		n1=t0,n1	};;
+{ .mmi;	cmp.gtu		p34,p32=n1,t0;;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n2=t[7],n2
+	(p34)sub	n2=t[7],n2,1	};;
+{ .mii;	(p32)cmp.gtu	p35,p33=n2,t[7]
+	(p34)cmp.geu	p35,p33=n2,t[7];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	n3=t[6],n3	}
+{ .mmi;	(p35)sub	n3=t[6],n3,1;;
+	(p33)cmp.gtu	p34,p32=n3,t[6]
+	(p35)cmp.geu	p34,p32=n3,t[6]	};;
+	.pred.rel	"mutex",p32,p34
+{ .mii;	(p32)sub	n4=t[5],n4
+	(p34)sub	n4=t[5],n4,1;;
+	(p32)cmp.gtu	p35,p33=n4,t[5]	}
+{ .mmi;	(p34)cmp.geu	p35,p33=n4,t[5];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	n5=t[4],n5
+	(p35)sub	n5=t[4],n5,1	};;
+{ .mii;	(p33)cmp.gtu	p34,p32=n5,t[4]
+	(p35)cmp.geu	p34,p32=n5,t[4];;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n6=t[3],n6	}
+{ .mmi;	(p34)sub	n6=t[3],n6,1;;
+	(p32)cmp.gtu	p35,p33=n6,t[3]
+	(p34)cmp.geu	p35,p33=n6,t[3]	};;
+	.pred.rel	"mutex",p33,p35
+{ .mii;	(p33)sub	n7=t[2],n7
+	(p35)sub	n7=t[2],n7,1;;
+	(p33)cmp.gtu	p34,p32=n7,t[2]	}
+{ .mmi;	(p35)cmp.geu	p34,p32=n7,t[2];;
+	.pred.rel	"mutex",p32,p34
+	(p32)sub	n8=t[1],n8
+	(p34)sub	n8=t[1],n8,1	};;
+{ .mii;	(p32)cmp.gtu	p35,p33=n8,t[1]
+	(p34)cmp.geu	p35,p33=n8,t[1];;
+	.pred.rel	"mutex",p33,p35
+	(p33)sub	a8=t[0],r0	}
+{ .mmi;	(p35)sub	a8=t[0],r0,1;;
+	(p33)cmp.gtu	p34,p32=a8,t[0]
+	(p35)cmp.geu	p34,p32=a8,t[0]	};;
+
+// save the result, either tmp[num] or tmp[num]-np[num]
+	.pred.rel	"mutex",p32,p34
+{ .mmi;	(p32)st8	[rptr]=n1,8
+	(p34)st8	[rptr]=t0,8
+	add		r19=-4*16,prevsp};;
+{ .mmb;	(p32)st8	[rptr]=n2,8
+	(p34)st8	[rptr]=t[7],8
+	(p5)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n3,8
+	(p34)st8	[rptr]=t[6],8
+	(p7)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n4,8
+	(p34)st8	[rptr]=t[5],8
+	(p9)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n5,8
+	(p34)st8	[rptr]=t[4],8
+	(p11)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n6,8
+	(p34)st8	[rptr]=t[3],8
+	(p13)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n7,8
+	(p34)st8	[rptr]=t[2],8
+	(p15)br.cond.dpnt.few	.Ldone	};;
+{ .mmb;	(p32)st8	[rptr]=n8,8
+	(p34)st8	[rptr]=t[1],8
+	nop.b		0		};;
+.Ldone:						// epilogue
+{ .mmi;	ldf.fill	f16=[r16],64
+	ldf.fill	f17=[r17],64
+	nop.i		0		}
+{ .mmi;	ldf.fill	f18=[r18],64
+	ldf.fill	f19=[r19],64
+	mov		pr=prevpr,0x1ffff	};;
+{ .mmi;	ldf.fill	f20=[r16]
+	ldf.fill	f21=[r17]
+	mov		ar.lc=prevlc	}
+{ .mmi;	ldf.fill	f22=[r18]
+	ldf.fill	f23=[r19]
+	mov		ret0=1		}	// signal "handled"
+{ .mib;	rum		1<<5
+	.restore	sp
+	mov		sp=prevsp
+	br.ret.sptk.many	b0	};;
+.endp	bn_mul_mont_8#
+
+.type	copyright#,\@object
+copyright:
+stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/mips-mont.pl b/src/lib/libcrypto/bn/asm/mips-mont.pl
new file mode 100644
index 0000000000..b944a12b8e
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/mips-mont.pl
@@ -0,0 +1,426 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# This module doesn't present direct interest for OpenSSL, because it
+# doesn't provide better performance for longer keys, at least not on
+# in-order-execution cores. While 512-bit RSA sign operations can be
+# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
+# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
+# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
+# verify:-( All comparisons are against bn_mul_mont-free assembler.
+# The module might be of interest to embedded system developers, as
+# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
+# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
+# code.
+
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+#   old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+	$PTR_ADD="dadd";	# incidentally works even on n32
+	$PTR_SUB="dsub";	# incidentally works even on n32
+	$REG_S="sd";
+	$REG_L="ld";
+	$SZREG=8;
+} else {
+	$PTR_ADD="add";
+	$PTR_SUB="sub";
+	$REG_S="sw";
+	$REG_L="lw";
+	$SZREG=4;
+}
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+if ($flavour =~ /64|n32/i) {
+	$LD="ld";
+	$ST="sd";
+	$MULTU="dmultu";
+	$ADDU="daddu";
+	$SUBU="dsubu";
+	$BNSZ=8;
+} else {
+	$LD="lw";
+	$ST="sw";
+	$MULTU="multu";
+	$ADDU="addu";
+	$SUBU="subu";
+	$BNSZ=4;
+}
+
+# int bn_mul_mont(
+$rp=$a0;	# BN_ULONG *rp,
+$ap=$a1;	# const BN_ULONG *ap,
+$bp=$a2;	# const BN_ULONG *bp,
+$np=$a3;	# const BN_ULONG *np,
+$n0=$a4;	# const BN_ULONG *n0,
+$num=$a5;	# int num);
+
+$lo0=$a6;
+$hi0=$a7;
+$lo1=$t1;
+$hi1=$t2;
+$aj=$s0;
+$bi=$s1;
+$nj=$s2;
+$tp=$s3;
+$alo=$s4;
+$ahi=$s5;
+$nlo=$s6;
+$nhi=$s7;
+$tj=$s8;
+$i=$s9;
+$j=$s10;
+$m1=$s11;
+
+$FRAMESIZE=14;
+
+$code=<<___;
+.text
+
+.set	noat
+.set	noreorder
+
+.align	5
+.globl	bn_mul_mont
+.ent	bn_mul_mont
+bn_mul_mont:
+___
+$code.=<<___ if ($flavour =~ /o32/i);
+	lw	$n0,16($sp)
+	lw	$num,20($sp)
+___
+$code.=<<___;
+	slt	$at,$num,4
+	bnez	$at,1f
+	li	$t0,0
+	slt	$at,$num,17	# on in-order CPU
+	bnezl	$at,bn_mul_mont_internal
+	nop
+1:	jr	$ra
+	li	$a0,0
+.end	bn_mul_mont
+
+.align	5
+.ent	bn_mul_mont_internal
+bn_mul_mont_internal:
+	.frame	$fp,$FRAMESIZE*$SZREG,$ra
+	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
+	$PTR_SUB $sp,$FRAMESIZE*$SZREG
+	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
+	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
+	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
+	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
+	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
+	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
+	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
+	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
+	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
+	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
+	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
+	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+	move	$fp,$sp
+
+	.set	reorder
+	$LD	$n0,0($n0)
+	$LD	$bi,0($bp)	# bp[0]
+	$LD	$aj,0($ap)	# ap[0]
+	$LD	$nj,0($np)	# np[0]
+
+	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
+	sll	$num,`log($BNSZ)/log(2)`
+	li	$at,-4096
+	$PTR_SUB $sp,$num
+	and	$sp,$at
+
+	$MULTU	$aj,$bi
+	$LD	$alo,$BNSZ($ap)
+	$LD	$nlo,$BNSZ($np)
+	mflo	$lo0
+	mfhi	$hi0
+	$MULTU	$lo0,$n0
+	mflo	$m1
+
+	$MULTU	$alo,$bi
+	mflo	$alo
+	mfhi	$ahi
+
+	$MULTU	$nj,$m1
+	mflo	$lo1
+	mfhi	$hi1
+	$MULTU	$nlo,$m1
+	$ADDU	$lo1,$lo0
+	sltu	$at,$lo1,$lo0
+	$ADDU	$hi1,$at
+	mflo	$nlo
+	mfhi	$nhi
+
+	move	$tp,$sp
+	li	$j,2*$BNSZ
+.align	4
+.L1st:
+	.set	noreorder
+	$PTR_ADD $aj,$ap,$j
+	$PTR_ADD $nj,$np,$j
+	$LD	$aj,($aj)
+	$LD	$nj,($nj)
+
+	$MULTU	$aj,$bi
+	$ADDU	$lo0,$alo,$hi0
+	$ADDU	$lo1,$nlo,$hi1
+	sltu	$at,$lo0,$hi0
+	sltu	$t0,$lo1,$hi1
+	$ADDU	$hi0,$ahi,$at
+	$ADDU	$hi1,$nhi,$t0
+	mflo	$alo
+	mfhi	$ahi
+
+	$ADDU	$lo1,$lo0
+	sltu	$at,$lo1,$lo0
+	$MULTU	$nj,$m1
+	$ADDU	$hi1,$at
+	addu	$j,$BNSZ
+	$ST	$lo1,($tp)
+	sltu	$t0,$j,$num
+	mflo	$nlo
+	mfhi	$nhi
+
+	bnez	$t0,.L1st
+	$PTR_ADD $tp,$BNSZ
+	.set	reorder
+
+	$ADDU	$lo0,$alo,$hi0
+	sltu	$at,$lo0,$hi0
+	$ADDU	$hi0,$ahi,$at
+
+	$ADDU	$lo1,$nlo,$hi1
+	sltu	$t0,$lo1,$hi1
+	$ADDU	$hi1,$nhi,$t0
+	$ADDU	$lo1,$lo0
+	sltu	$at,$lo1,$lo0
+	$ADDU	$hi1,$at
+
+	$ST	$lo1,($tp)
+
+	$ADDU	$hi1,$hi0
+	sltu	$at,$hi1,$hi0
+	$ST	$hi1,$BNSZ($tp)
+	$ST	$at,2*$BNSZ($tp)
+
+	li	$i,$BNSZ
+.align	4
+.Louter:
+	$PTR_ADD $bi,$bp,$i
+	$LD	$bi,($bi)
+	$LD	$aj,($ap)
+	$LD	$alo,$BNSZ($ap)
+	$LD	$tj,($sp)
+
+	$MULTU	$aj,$bi
+	$LD	$nj,($np)
+	$LD	$nlo,$BNSZ($np)
+	mflo	$lo0
+	mfhi	$hi0
+	$ADDU	$lo0,$tj
+	$MULTU	$lo0,$n0
+	sltu	$at,$lo0,$tj
+	$ADDU	$hi0,$at
+	mflo	$m1
+
+	$MULTU	$alo,$bi
+	mflo	$alo
+	mfhi	$ahi
+
+	$MULTU	$nj,$m1
+	mflo	$lo1
+	mfhi	$hi1
+
+	$MULTU	$nlo,$m1
+	$ADDU	$lo1,$lo0
+	sltu	$at,$lo1,$lo0
+	$ADDU	$hi1,$at
+	mflo	$nlo
+	mfhi	$nhi
+
+	move	$tp,$sp
+	li	$j,2*$BNSZ
+	$LD	$tj,$BNSZ($tp)
+.align	4
+.Linner:
+	.set	noreorder
+	$PTR_ADD $aj,$ap,$j
+	$PTR_ADD $nj,$np,$j
+	$LD	$aj,($aj)
+	$LD	$nj,($nj)
+
+	$MULTU	$aj,$bi
+	$ADDU	$lo0,$alo,$hi0
+	$ADDU	$lo1,$nlo,$hi1
+	sltu	$at,$lo0,$hi0
+	sltu	$t0,$lo1,$hi1
+	$ADDU	$hi0,$ahi,$at
+	$ADDU	$hi1,$nhi,$t0
+	mflo	$alo
+	mfhi	$ahi
+
+	$ADDU	$lo0,$tj
+	addu	$j,$BNSZ
+	$MULTU	$nj,$m1
+	sltu	$at,$lo0,$tj
+	$ADDU	$lo1,$lo0
+	$ADDU	$hi0,$at
+	sltu	$t0,$lo1,$lo0
+	$LD	$tj,2*$BNSZ($tp)
+	$ADDU	$hi1,$t0
+	sltu	$at,$j,$num
+	mflo	$nlo
+	mfhi	$nhi
+	$ST	$lo1,($tp)
+	bnez	$at,.Linner
+	$PTR_ADD $tp,$BNSZ
+	.set	reorder
+
+	$ADDU	$lo0,$alo,$hi0
+	sltu	$at,$lo0,$hi0
+	$ADDU	$hi0,$ahi,$at
+	$ADDU	$lo0,$tj
+	sltu	$t0,$lo0,$tj
+	$ADDU	$hi0,$t0
+
+	$LD	$tj,2*$BNSZ($tp)
+	$ADDU	$lo1,$nlo,$hi1
+	sltu	$at,$lo1,$hi1
+	$ADDU	$hi1,$nhi,$at
+	$ADDU	$lo1,$lo0
+	sltu	$t0,$lo1,$lo0
+	$ADDU	$hi1,$t0
+	$ST	$lo1,($tp)
+
+	$ADDU	$lo1,$hi1,$hi0
+	sltu	$hi1,$lo1,$hi0
+	$ADDU	$lo1,$tj
+	sltu	$at,$lo1,$tj
+	$ADDU	$hi1,$at
+	$ST	$lo1,$BNSZ($tp)
+	$ST	$hi1,2*$BNSZ($tp)
+
+	addu	$i,$BNSZ
+	sltu	$t0,$i,$num
+	bnez	$t0,.Louter
+
+	.set	noreorder
+	$PTR_ADD $tj,$sp,$num	# &tp[num]
+	move	$tp,$sp
+	move	$ap,$sp
+	li	$hi0,0		# clear borrow bit
+
+.align	4
+.Lsub:	$LD	$lo0,($tp)
+	$LD	$lo1,($np)
+	$PTR_ADD $tp,$BNSZ
+	$PTR_ADD $np,$BNSZ
+	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
+	sgtu	$at,$lo1,$lo0
+	$SUBU	$lo0,$lo1,$hi0
+	sgtu	$hi0,$lo0,$lo1
+	$ST	$lo0,($rp)
+	or	$hi0,$at
+	sltu	$at,$tp,$tj
+	bnez	$at,.Lsub
+	$PTR_ADD $rp,$BNSZ
+
+	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
+	move	$tp,$sp
+	$PTR_SUB $rp,$num	# restore rp
+	not	$hi1,$hi0
+
+	and	$ap,$hi0,$sp
+	and	$bp,$hi1,$rp
+	or	$ap,$ap,$bp	# ap=borrow?tp:rp
+
+.align	4
+.Lcopy:	$LD	$aj,($ap)
+	$PTR_ADD $ap,$BNSZ
+	$ST	$zero,($tp)
+	$PTR_ADD $tp,$BNSZ
+	sltu	$at,$tp,$tj
+	$ST	$aj,($rp)
+	bnez	$at,.Lcopy
+	$PTR_ADD $rp,$BNSZ
+
+	li	$a0,1
+	li	$t0,1
+
+	.set	noreorder
+	move	$sp,$fp
+	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
+	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
+	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
+	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
+	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
+	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
+	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
+	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
+	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
+	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
+	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
+	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
+___
+$code.=<<___;
+	jr	$ra
+	$PTR_ADD $sp,$FRAMESIZE*$SZREG
+.end	bn_mul_mont_internal
+.rdata
+.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/mips.pl b/src/lib/libcrypto/bn/asm/mips.pl
new file mode 100644
index 0000000000..c162a3ec23
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/mips.pl
@@ -0,0 +1,2585 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project.
+#
+# Rights for redistribution and usage in source and binary forms are
+# granted according to the OpenSSL license. Warranty of any kind is
+# disclaimed.
+# ====================================================================
+
+
+# July 1999
+#
+# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
+#
+# The module is designed to work with either of the "new" MIPS ABI(5),
+# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under
+# IRIX 5.x not only because it doesn't support new ABIs but also
+# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
+# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
+# cause illegal instruction exception:-(
+#
+# In addition the code depends on preprocessor flags set up by MIPSpro
+# compiler driver (either as or cc) and therefore (probably?) can't be
+# compiled by the GNU assembler. GNU C driver manages fine though...
+# I mean as long as -mmips-as is specified or is the default option,
+# because then it simply invokes /usr/bin/as which in turn takes
+# perfect care of the preprocessor definitions. Another neat feature
+# offered by the MIPSpro assembler is an optimization pass. This gave
+# me the opportunity to have the code looking more regular as all those
+# architecture dependent instruction rescheduling details were left to
+# the assembler. Cool, huh?
+#
+# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
+# goes way over 3 times faster!
+#
+#					<appro@fy.chalmers.se>
+
+# October 2010
+#
+# Adapt the module even for 32-bit ABIs and other OSes. The former was
+# achieved by mechanical replacement of 64-bit arithmetic instructions
+# such as dmultu, daddu, etc. with their 32-bit counterparts and
+# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
+# >3x performance improvement naturally does not apply to 32-bit code
+# [because there is no instruction 32-bit compiler can't use], one
+# has to content with 40-85% improvement depending on benchmark and
+# key length, more for longer keys.
+
+$flavour = shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+if ($flavour =~ /64|n32/i) {
+	$LD="ld";
+	$ST="sd";
+	$MULTU="dmultu";
+	$DIVU="ddivu";
+	$ADDU="daddu";
+	$SUBU="dsubu";
+	$SRL="dsrl";
+	$SLL="dsll";
+	$BNSZ=8;
+	$PTR_ADD="daddu";
+	$PTR_SUB="dsubu";
+	$SZREG=8;
+	$REG_S="sd";
+	$REG_L="ld";
+} else {
+	$LD="lw";
+	$ST="sw";
+	$MULTU="multu";
+	$DIVU="divu";
+	$ADDU="addu";
+	$SUBU="subu";
+	$SRL="srl";
+	$SLL="sll";
+	$BNSZ=4;
+	$PTR_ADD="addu";
+	$PTR_SUB="subu";
+	$SZREG=4;
+	$REG_S="sw";
+	$REG_L="lw";
+	$code=".set	mips2\n";
+}
+
+# Below is N32/64 register layout used in the original module.
+#
+($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
+#
+# No special adaptation is required for O32. NUBI on the other hand
+# is treated by saving/restoring ($v1,$t0..$t3).
+
+$gp=$v1 if ($flavour =~ /nubi/i);
+
+$minus4=$v1;
+
+$code.=<<___;
+.rdata
+.asciiz	"mips3.s, Version 1.2"
+.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
+
+.text
+.set	noat
+
+.align	5
+.globl	bn_mul_add_words
+.ent	bn_mul_add_words
+bn_mul_add_words:
+	.set	noreorder
+	bgtz	$a2,bn_mul_add_words_internal
+	move	$v0,$zero
+	jr	$ra
+	move	$a0,$v0
+.end	bn_mul_add_words
+
+.align	5
+.ent	bn_mul_add_words_internal
+bn_mul_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	.frame	$sp,6*$SZREG,$ra
+	.mask	0x8000f008,-$SZREG
+	.set	noreorder
+	$PTR_SUB $sp,6*$SZREG
+	$REG_S	$ra,5*$SZREG($sp)
+	$REG_S	$t3,4*$SZREG($sp)
+	$REG_S	$t2,3*$SZREG($sp)
+	$REG_S	$t1,2*$SZREG($sp)
+	$REG_S	$t0,1*$SZREG($sp)
+	$REG_S	$gp,0*$SZREG($sp)
+___
+$code.=<<___;
+	.set	reorder
+	li	$minus4,-4
+	and	$ta0,$a2,$minus4
+	$LD	$t0,0($a1)
+	beqz	$ta0,.L_bn_mul_add_words_tail
+
+.L_bn_mul_add_words_loop:
+	$MULTU	$t0,$a3
+	$LD	$t1,0($a0)
+	$LD	$t2,$BNSZ($a1)
+	$LD	$t3,$BNSZ($a0)
+	$LD	$ta0,2*$BNSZ($a1)
+	$LD	$ta1,2*$BNSZ($a0)
+	$ADDU	$t1,$v0
+	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
+				# values", but it seems to work fine
+				# even on 64-bit registers.
+	mflo	$at
+	mfhi	$t0
+	$ADDU	$t1,$at
+	$ADDU	$v0,$t0
+	 $MULTU	$t2,$a3
+	sltu	$at,$t1,$at
+	$ST	$t1,0($a0)
+	$ADDU	$v0,$at
+
+	$LD	$ta2,3*$BNSZ($a1)
+	$LD	$ta3,3*$BNSZ($a0)
+	$ADDU	$t3,$v0
+	sltu	$v0,$t3,$v0
+	mflo	$at
+	mfhi	$t2
+	$ADDU	$t3,$at
+	$ADDU	$v0,$t2
+	 $MULTU	$ta0,$a3
+	sltu	$at,$t3,$at
+	$ST	$t3,$BNSZ($a0)
+	$ADDU	$v0,$at
+
+	subu	$a2,4
+	$PTR_ADD $a0,4*$BNSZ
+	$PTR_ADD $a1,4*$BNSZ
+	$ADDU	$ta1,$v0
+	sltu	$v0,$ta1,$v0
+	mflo	$at
+	mfhi	$ta0
+	$ADDU	$ta1,$at
+	$ADDU	$v0,$ta0
+	 $MULTU	$ta2,$a3
+	sltu	$at,$ta1,$at
+	$ST	$ta1,-2*$BNSZ($a0)
+	$ADDU	$v0,$at
+
+
+	and	$ta0,$a2,$minus4
+	$ADDU	$ta3,$v0
+	sltu	$v0,$ta3,$v0
+	mflo	$at
+	mfhi	$ta2
+	$ADDU	$ta3,$at
+	$ADDU	$v0,$ta2
+	sltu	$at,$ta3,$at
+	$ST	$ta3,-$BNSZ($a0)
+	$ADDU	$v0,$at
+	.set	noreorder
+	bgtzl	$ta0,.L_bn_mul_add_words_loop
+	$LD	$t0,0($a1)
+
+	beqz	$a2,.L_bn_mul_add_words_return
+	nop
+
+.L_bn_mul_add_words_tail:
+	.set	reorder
+	$LD	$t0,0($a1)
+	$MULTU	$t0,$a3
+	$LD	$t1,0($a0)
+	subu	$a2,1
+	$ADDU	$t1,$v0
+	sltu	$v0,$t1,$v0
+	mflo	$at
+	mfhi	$t0
+	$ADDU	$t1,$at
+	$ADDU	$v0,$t0
+	sltu	$at,$t1,$at
+	$ST	$t1,0($a0)
+	$ADDU	$v0,$at
+	beqz	$a2,.L_bn_mul_add_words_return
+
+	$LD	$t0,$BNSZ($a1)
+	$MULTU	$t0,$a3
+	$LD	$t1,$BNSZ($a0)
+	subu	$a2,1
+	$ADDU	$t1,$v0
+	sltu	$v0,$t1,$v0
+	mflo	$at
+	mfhi	$t0
+	$ADDU	$t1,$at
+	$ADDU	$v0,$t0
+	sltu	$at,$t1,$at
+	$ST	$t1,$BNSZ($a0)
+	$ADDU	$v0,$at
+	beqz	$a2,.L_bn_mul_add_words_return
+
+	$LD	$t0,2*$BNSZ($a1)
+	$MULTU	$t0,$a3
+	$LD	$t1,2*$BNSZ($a0)
+	$ADDU	$t1,$v0
+	sltu	$v0,$t1,$v0
+	mflo	$at
+	mfhi	$t0
+	$ADDU	$t1,$at
+	$ADDU	$v0,$t0
+	sltu	$at,$t1,$at
+	$ST	$t1,2*$BNSZ($a0)
+	$ADDU	$v0,$at
+
+.L_bn_mul_add_words_return:
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$t3,4*$SZREG($sp)
+	$REG_L	$t2,3*$SZREG($sp)
+	$REG_L	$t1,2*$SZREG($sp)
+	$REG_L	$t0,1*$SZREG($sp)
+	$REG_L	$gp,0*$SZREG($sp)
+	$PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+	jr	$ra
+	move	$a0,$v0
+.end	bn_mul_add_words_internal
+
+.align	5
+.globl	bn_mul_words
+.ent	bn_mul_words
+bn_mul_words:
+	.set	noreorder
+	bgtz	$a2,bn_mul_words_internal
+	move	$v0,$zero
+	jr	$ra
+	move	$a0,$v0
+.end	bn_mul_words
+
+.align	5
+.ent	bn_mul_words_internal
+bn_mul_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	.frame	$sp,6*$SZREG,$ra
+	.mask	0x8000f008,-$SZREG
+	.set	noreorder
+	$PTR_SUB $sp,6*$SZREG
+	$REG_S	$ra,5*$SZREG($sp)
+	$REG_S	$t3,4*$SZREG($sp)
+	$REG_S	$t2,3*$SZREG($sp)
+	$REG_S	$t1,2*$SZREG($sp)
+	$REG_S	$t0,1*$SZREG($sp)
+	$REG_S	$gp,0*$SZREG($sp)
+___
+$code.=<<___;
+	.set	reorder
+	li	$minus4,-4
+	and	$ta0,$a2,$minus4
+	$LD	$t0,0($a1)
+	beqz	$ta0,.L_bn_mul_words_tail
+
+.L_bn_mul_words_loop:
+	$MULTU	$t0,$a3
+	$LD	$t2,$BNSZ($a1)
+	$LD	$ta0,2*$BNSZ($a1)
+	$LD	$ta2,3*$BNSZ($a1)
+	mflo	$at
+	mfhi	$t0
+	$ADDU	$v0,$at
+	sltu	$t1,$v0,$at
+	 $MULTU	$t2,$a3
+	$ST	$v0,0($a0)
+	$ADDU	$v0,$t1,$t0
+
+	subu	$a2,4
+	$PTR_ADD $a0,4*$BNSZ
+	$PTR_ADD $a1,4*$BNSZ
+	mflo	$at
+	mfhi	$t2
+	$ADDU	$v0,$at
+	sltu	$t3,$v0,$at
+	 $MULTU	$ta0,$a3
+	$ST	$v0,-3*$BNSZ($a0)
+	$ADDU	$v0,$t3,$t2
+
+	mflo	$at
+	mfhi	$ta0
+	$ADDU	$v0,$at
+	sltu	$ta1,$v0,$at
+	 $MULTU	$ta2,$a3
+	$ST	$v0,-2*$BNSZ($a0)
+	$ADDU	$v0,$ta1,$ta0
+
+	and	$ta0,$a2,$minus4
+	mflo	$at
+	mfhi	$ta2
+	$ADDU	$v0,$at
+	sltu	$ta3,$v0,$at
+	$ST	$v0,-$BNSZ($a0)
+	$ADDU	$v0,$ta3,$ta2
+	.set	noreorder
+	bgtzl	$ta0,.L_bn_mul_words_loop
+	$LD	$t0,0($a1)
+
+	beqz	$a2,.L_bn_mul_words_return
+	nop
+
+.L_bn_mul_words_tail:
+	.set	reorder
+	$LD	$t0,0($a1)
+	$MULTU	$t0,$a3
+	subu	$a2,1
+	mflo	$at
+	mfhi	$t0
+	$ADDU	$v0,$at
+	sltu	$t1,$v0,$at
+	$ST	$v0,0($a0)
+	$ADDU	$v0,$t1,$t0
+	beqz	$a2,.L_bn_mul_words_return
+
+	$LD	$t0,$BNSZ($a1)
+	$MULTU	$t0,$a3
+	subu	$a2,1
+	mflo	$at
+	mfhi	$t0
+	$ADDU	$v0,$at
+	sltu	$t1,$v0,$at
+	$ST	$v0,$BNSZ($a0)
+	$ADDU	$v0,$t1,$t0
+	beqz	$a2,.L_bn_mul_words_return
+
+	$LD	$t0,2*$BNSZ($a1)
+	$MULTU	$t0,$a3
+	mflo	$at
+	mfhi	$t0
+	$ADDU	$v0,$at
+	sltu	$t1,$v0,$at
+	$ST	$v0,2*$BNSZ($a0)
+	$ADDU	$v0,$t1,$t0
+
+.L_bn_mul_words_return:
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$t3,4*$SZREG($sp)
+	$REG_L	$t2,3*$SZREG($sp)
+	$REG_L	$t1,2*$SZREG($sp)
+	$REG_L	$t0,1*$SZREG($sp)
+	$REG_L	$gp,0*$SZREG($sp)
+	$PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+	jr	$ra
+	move	$a0,$v0
+.end	bn_mul_words_internal
+
+.align	5
+.globl	bn_sqr_words
+.ent	bn_sqr_words
+bn_sqr_words:
+	.set	noreorder
+	bgtz	$a2,bn_sqr_words_internal
+	move	$v0,$zero
+	jr	$ra
+	move	$a0,$v0
+.end	bn_sqr_words
+
+.align	5
+.ent	bn_sqr_words_internal
+bn_sqr_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	.frame	$sp,6*$SZREG,$ra
+	.mask	0x8000f008,-$SZREG
+	.set	noreorder
+	$PTR_SUB $sp,6*$SZREG
+	$REG_S	$ra,5*$SZREG($sp)
+	$REG_S	$t3,4*$SZREG($sp)
+	$REG_S	$t2,3*$SZREG($sp)
+	$REG_S	$t1,2*$SZREG($sp)
+	$REG_S	$t0,1*$SZREG($sp)
+	$REG_S	$gp,0*$SZREG($sp)
+___
+$code.=<<___;
+	.set	reorder
+	li	$minus4,-4
+	and	$ta0,$a2,$minus4
+	$LD	$t0,0($a1)
+	beqz	$ta0,.L_bn_sqr_words_tail
+
+.L_bn_sqr_words_loop:
+	$MULTU	$t0,$t0
+	$LD	$t2,$BNSZ($a1)
+	$LD	$ta0,2*$BNSZ($a1)
+	$LD	$ta2,3*$BNSZ($a1)
+	mflo	$t1
+	mfhi	$t0
+	$ST	$t1,0($a0)
+	$ST	$t0,$BNSZ($a0)
+
+	$MULTU	$t2,$t2
+	subu	$a2,4
+	$PTR_ADD $a0,8*$BNSZ
+	$PTR_ADD $a1,4*$BNSZ
+	mflo	$t3
+	mfhi	$t2
+	$ST	$t3,-6*$BNSZ($a0)
+	$ST	$t2,-5*$BNSZ($a0)
+
+	$MULTU	$ta0,$ta0
+	mflo	$ta1
+	mfhi	$ta0
+	$ST	$ta1,-4*$BNSZ($a0)
+	$ST	$ta0,-3*$BNSZ($a0)
+
+
+	$MULTU	$ta2,$ta2
+	and	$ta0,$a2,$minus4
+	mflo	$ta3
+	mfhi	$ta2
+	$ST	$ta3,-2*$BNSZ($a0)
+	$ST	$ta2,-$BNSZ($a0)
+
+	.set	noreorder
+	bgtzl	$ta0,.L_bn_sqr_words_loop
+	$LD	$t0,0($a1)
+
+	beqz	$a2,.L_bn_sqr_words_return
+	nop
+
+.L_bn_sqr_words_tail:
+	.set	reorder
+	$LD	$t0,0($a1)
+	$MULTU	$t0,$t0
+	subu	$a2,1
+	mflo	$t1
+	mfhi	$t0
+	$ST	$t1,0($a0)
+	$ST	$t0,$BNSZ($a0)
+	beqz	$a2,.L_bn_sqr_words_return
+
+	$LD	$t0,$BNSZ($a1)
+	$MULTU	$t0,$t0
+	subu	$a2,1
+	mflo	$t1
+	mfhi	$t0
+	$ST	$t1,2*$BNSZ($a0)
+	$ST	$t0,3*$BNSZ($a0)
+	beqz	$a2,.L_bn_sqr_words_return
+
+	$LD	$t0,2*$BNSZ($a1)
+	$MULTU	$t0,$t0
+	mflo	$t1
+	mfhi	$t0
+	$ST	$t1,4*$BNSZ($a0)
+	$ST	$t0,5*$BNSZ($a0)
+
+.L_bn_sqr_words_return:
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$t3,4*$SZREG($sp)
+	$REG_L	$t2,3*$SZREG($sp)
+	$REG_L	$t1,2*$SZREG($sp)
+	$REG_L	$t0,1*$SZREG($sp)
+	$REG_L	$gp,0*$SZREG($sp)
+	$PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+	jr	$ra
+	move	$a0,$v0
+
+.end	bn_sqr_words_internal
+
+.align	5
+.globl	bn_add_words
+.ent	bn_add_words
+bn_add_words:
+	.set	noreorder
+	bgtz	$a3,bn_add_words_internal
+	move	$v0,$zero
+	jr	$ra
+	move	$a0,$v0
+.end	bn_add_words
+
+.align	5
+.ent	bn_add_words_internal
+bn_add_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	.frame	$sp,6*$SZREG,$ra
+	.mask	0x8000f008,-$SZREG
+	.set	noreorder
+	$PTR_SUB $sp,6*$SZREG
+	$REG_S	$ra,5*$SZREG($sp)
+	$REG_S	$t3,4*$SZREG($sp)
+	$REG_S	$t2,3*$SZREG($sp)
+	$REG_S	$t1,2*$SZREG($sp)
+	$REG_S	$t0,1*$SZREG($sp)
+	$REG_S	$gp,0*$SZREG($sp)
+___
+$code.=<<___;
+	.set	reorder
+	li	$minus4,-4
+	and	$at,$a3,$minus4
+	$LD	$t0,0($a1)
+	beqz	$at,.L_bn_add_words_tail
+
+.L_bn_add_words_loop:
+	$LD	$ta0,0($a2)
+	subu	$a3,4
+	$LD	$t1,$BNSZ($a1)
+	and	$at,$a3,$minus4
+	$LD	$t2,2*$BNSZ($a1)
+	$PTR_ADD $a2,4*$BNSZ
+	$LD	$t3,3*$BNSZ($a1)
+	$PTR_ADD $a0,4*$BNSZ
+	$LD	$ta1,-3*$BNSZ($a2)
+	$PTR_ADD $a1,4*$BNSZ
+	$LD	$ta2,-2*$BNSZ($a2)
+	$LD	$ta3,-$BNSZ($a2)
+	$ADDU	$ta0,$t0
+	sltu	$t8,$ta0,$t0
+	$ADDU	$t0,$ta0,$v0
+	sltu	$v0,$t0,$ta0
+	$ST	$t0,-4*$BNSZ($a0)
+	$ADDU	$v0,$t8
+
+	$ADDU	$ta1,$t1
+	sltu	$t9,$ta1,$t1
+	$ADDU	$t1,$ta1,$v0
+	sltu	$v0,$t1,$ta1
+	$ST	$t1,-3*$BNSZ($a0)
+	$ADDU	$v0,$t9
+
+	$ADDU	$ta2,$t2
+	sltu	$t8,$ta2,$t2
+	$ADDU	$t2,$ta2,$v0
+	sltu	$v0,$t2,$ta2
+	$ST	$t2,-2*$BNSZ($a0)
+	$ADDU	$v0,$t8
+	
+	$ADDU	$ta3,$t3
+	sltu	$t9,$ta3,$t3
+	$ADDU	$t3,$ta3,$v0
+	sltu	$v0,$t3,$ta3
+	$ST	$t3,-$BNSZ($a0)
+	$ADDU	$v0,$t9
+	
+	.set	noreorder
+	bgtzl	$at,.L_bn_add_words_loop
+	$LD	$t0,0($a1)
+
+	beqz	$a3,.L_bn_add_words_return
+	nop
+
+.L_bn_add_words_tail:
+	.set	reorder
+	$LD	$t0,0($a1)
+	$LD	$ta0,0($a2)
+	$ADDU	$ta0,$t0
+	subu	$a3,1
+	sltu	$t8,$ta0,$t0
+	$ADDU	$t0,$ta0,$v0
+	sltu	$v0,$t0,$ta0
+	$ST	$t0,0($a0)
+	$ADDU	$v0,$t8
+	beqz	$a3,.L_bn_add_words_return
+
+	$LD	$t1,$BNSZ($a1)
+	$LD	$ta1,$BNSZ($a2)
+	$ADDU	$ta1,$t1
+	subu	$a3,1
+	sltu	$t9,$ta1,$t1
+	$ADDU	$t1,$ta1,$v0
+	sltu	$v0,$t1,$ta1
+	$ST	$t1,$BNSZ($a0)
+	$ADDU	$v0,$t9
+	beqz	$a3,.L_bn_add_words_return
+
+	$LD	$t2,2*$BNSZ($a1)
+	$LD	$ta2,2*$BNSZ($a2)
+	$ADDU	$ta2,$t2
+	sltu	$t8,$ta2,$t2
+	$ADDU	$t2,$ta2,$v0
+	sltu	$v0,$t2,$ta2
+	$ST	$t2,2*$BNSZ($a0)
+	$ADDU	$v0,$t8
+
+.L_bn_add_words_return:
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$t3,4*$SZREG($sp)
+	$REG_L	$t2,3*$SZREG($sp)
+	$REG_L	$t1,2*$SZREG($sp)
+	$REG_L	$t0,1*$SZREG($sp)
+	$REG_L	$gp,0*$SZREG($sp)
+	$PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+	jr	$ra
+	move	$a0,$v0
+
+.end	bn_add_words_internal
+
+.align	5
+.globl	bn_sub_words
+.ent	bn_sub_words
+bn_sub_words:
+	.set	noreorder
+	bgtz	$a3,bn_sub_words_internal
+	move	$v0,$zero
+	jr	$ra
+	move	$a0,$zero
+.end	bn_sub_words
+
+.align	5
+.ent	bn_sub_words_internal
+bn_sub_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	.frame	$sp,6*$SZREG,$ra
+	.mask	0x8000f008,-$SZREG
+	.set	noreorder
+	$PTR_SUB $sp,6*$SZREG
+	$REG_S	$ra,5*$SZREG($sp)
+	$REG_S	$t3,4*$SZREG($sp)
+	$REG_S	$t2,3*$SZREG($sp)
+	$REG_S	$t1,2*$SZREG($sp)
+	$REG_S	$t0,1*$SZREG($sp)
+	$REG_S	$gp,0*$SZREG($sp)
+___
+$code.=<<___;
+	.set	reorder
+	li	$minus4,-4
+	and	$at,$a3,$minus4
+	$LD	$t0,0($a1)
+	beqz	$at,.L_bn_sub_words_tail
+
+.L_bn_sub_words_loop:
+	$LD	$ta0,0($a2)
+	subu	$a3,4
+	$LD	$t1,$BNSZ($a1)
+	and	$at,$a3,$minus4
+	$LD	$t2,2*$BNSZ($a1)
+	$PTR_ADD $a2,4*$BNSZ
+	$LD	$t3,3*$BNSZ($a1)
+	$PTR_ADD $a0,4*$BNSZ
+	$LD	$ta1,-3*$BNSZ($a2)
+	$PTR_ADD $a1,4*$BNSZ
+	$LD	$ta2,-2*$BNSZ($a2)
+	$LD	$ta3,-$BNSZ($a2)
+	sltu	$t8,$t0,$ta0
+	$SUBU	$ta0,$t0,$ta0
+	$SUBU	$t0,$ta0,$v0
+	sgtu	$v0,$t0,$ta0
+	$ST	$t0,-4*$BNSZ($a0)
+	$ADDU	$v0,$t8
+
+	sltu	$t9,$t1,$ta1
+	$SUBU	$ta1,$t1,$ta1
+	$SUBU	$t1,$ta1,$v0
+	sgtu	$v0,$t1,$ta1
+	$ST	$t1,-3*$BNSZ($a0)
+	$ADDU	$v0,$t9
+
+
+	sltu	$t8,$t2,$ta2
+	$SUBU	$ta2,$t2,$ta2
+	$SUBU	$t2,$ta2,$v0
+	sgtu	$v0,$t2,$ta2
+	$ST	$t2,-2*$BNSZ($a0)
+	$ADDU	$v0,$t8
+
+	sltu	$t9,$t3,$ta3
+	$SUBU	$ta3,$t3,$ta3
+	$SUBU	$t3,$ta3,$v0
+	sgtu	$v0,$t3,$ta3
+	$ST	$t3,-$BNSZ($a0)
+	$ADDU	$v0,$t9
+
+	.set	noreorder
+	bgtzl	$at,.L_bn_sub_words_loop
+	$LD	$t0,0($a1)
+
+	beqz	$a3,.L_bn_sub_words_return
+	nop
+
+.L_bn_sub_words_tail:
+	.set	reorder
+	$LD	$t0,0($a1)
+	$LD	$ta0,0($a2)
+	subu	$a3,1
+	sltu	$t8,$t0,$ta0
+	$SUBU	$ta0,$t0,$ta0
+	$SUBU	$t0,$ta0,$v0
+	sgtu	$v0,$t0,$ta0
+	$ST	$t0,0($a0)
+	$ADDU	$v0,$t8
+	beqz	$a3,.L_bn_sub_words_return
+
+	$LD	$t1,$BNSZ($a1)
+	subu	$a3,1
+	$LD	$ta1,$BNSZ($a2)
+	sltu	$t9,$t1,$ta1
+	$SUBU	$ta1,$t1,$ta1
+	$SUBU	$t1,$ta1,$v0
+	sgtu	$v0,$t1,$ta1
+	$ST	$t1,$BNSZ($a0)
+	$ADDU	$v0,$t9
+	beqz	$a3,.L_bn_sub_words_return
+
+	$LD	$t2,2*$BNSZ($a1)
+	$LD	$ta2,2*$BNSZ($a2)
+	sltu	$t8,$t2,$ta2
+	$SUBU	$ta2,$t2,$ta2
+	$SUBU	$t2,$ta2,$v0
+	sgtu	$v0,$t2,$ta2
+	$ST	$t2,2*$BNSZ($a0)
+	$ADDU	$v0,$t8
+
+.L_bn_sub_words_return:
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$t3,4*$SZREG($sp)
+	$REG_L	$t2,3*$SZREG($sp)
+	$REG_L	$t1,2*$SZREG($sp)
+	$REG_L	$t0,1*$SZREG($sp)
+	$REG_L	$gp,0*$SZREG($sp)
+	$PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+	jr	$ra
+	move	$a0,$v0
+.end	bn_sub_words_internal
+
+.align 5
+.globl	bn_div_3_words
+.ent	bn_div_3_words
+bn_div_3_words:
+	.set	noreorder
+	move	$a3,$a0		# we know that bn_div_words does not
+				# touch $a3, $ta2, $ta3 and preserves $a2
+				# so that we can save two arguments
+				# and return address in registers
+				# instead of stack:-)
+				
+	$LD	$a0,($a3)
+	move	$ta2,$a1
+	bne	$a0,$a2,bn_div_3_words_internal
+	$LD	$a1,-$BNSZ($a3)
+	li	$v0,-1
+	jr	$ra
+	move	$a0,$v0
+.end	bn_div_3_words
+
+.align	5
+.ent	bn_div_3_words_internal
+bn_div_3_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	.frame	$sp,6*$SZREG,$ra
+	.mask	0x8000f008,-$SZREG
+	.set	noreorder
+	$PTR_SUB $sp,6*$SZREG
+	$REG_S	$ra,5*$SZREG($sp)
+	$REG_S	$t3,4*$SZREG($sp)
+	$REG_S	$t2,3*$SZREG($sp)
+	$REG_S	$t1,2*$SZREG($sp)
+	$REG_S	$t0,1*$SZREG($sp)
+	$REG_S	$gp,0*$SZREG($sp)
+___
+$code.=<<___;
+	.set	reorder
+	move	$ta3,$ra
+	bal	bn_div_words
+	move	$ra,$ta3
+	$MULTU	$ta2,$v0
+	$LD	$t2,-2*$BNSZ($a3)
+	move	$ta0,$zero
+	mfhi	$t1
+	mflo	$t0
+	sltu	$t8,$t1,$a1
+.L_bn_div_3_words_inner_loop:
+	bnez	$t8,.L_bn_div_3_words_inner_loop_done
+	sgeu	$at,$t2,$t0
+	seq	$t9,$t1,$a1
+	and	$at,$t9
+	sltu	$t3,$t0,$ta2
+	$ADDU	$a1,$a2
+	$SUBU	$t1,$t3
+	$SUBU	$t0,$ta2
+	sltu	$t8,$t1,$a1
+	sltu	$ta0,$a1,$a2
+	or	$t8,$ta0
+	.set	noreorder
+	beqzl	$at,.L_bn_div_3_words_inner_loop
+	$SUBU	$v0,1
+	.set	reorder
+.L_bn_div_3_words_inner_loop_done:
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$t3,4*$SZREG($sp)
+	$REG_L	$t2,3*$SZREG($sp)
+	$REG_L	$t1,2*$SZREG($sp)
+	$REG_L	$t0,1*$SZREG($sp)
+	$REG_L	$gp,0*$SZREG($sp)
+	$PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+	jr	$ra
+	move	$a0,$v0
+.end	bn_div_3_words_internal
+
+.align	5
+.globl	bn_div_words
+.ent	bn_div_words
+bn_div_words:
+	.set	noreorder
+	bnez	$a2,bn_div_words_internal
+	li	$v0,-1		# I would rather signal div-by-zero
+				# which can be done with 'break 7'
+	jr	$ra
+	move	$a0,$v0
+.end	bn_div_words
+
+.align	5
+.ent	bn_div_words_internal
+bn_div_words_internal:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	.frame	$sp,6*$SZREG,$ra
+	.mask	0x8000f008,-$SZREG
+	.set	noreorder
+	$PTR_SUB $sp,6*$SZREG
+	$REG_S	$ra,5*$SZREG($sp)
+	$REG_S	$t3,4*$SZREG($sp)
+	$REG_S	$t2,3*$SZREG($sp)
+	$REG_S	$t1,2*$SZREG($sp)
+	$REG_S	$t0,1*$SZREG($sp)
+	$REG_S	$gp,0*$SZREG($sp)
+___
+$code.=<<___;
+	move	$v1,$zero
+	bltz	$a2,.L_bn_div_words_body
+	move	$t9,$v1
+	$SLL	$a2,1
+	bgtz	$a2,.-4
+	addu	$t9,1
+
+	.set	reorder
+	negu	$t1,$t9
+	li	$t2,-1
+	$SLL	$t2,$t1
+	and	$t2,$a0
+	$SRL	$at,$a1,$t1
+	.set	noreorder
+	bnezl	$t2,.+8
+	break	6		# signal overflow
+	.set	reorder
+	$SLL	$a0,$t9
+	$SLL	$a1,$t9
+	or	$a0,$at
+___
+$QT=$ta0;
+$HH=$ta1;
+$DH=$v1;
+$code.=<<___;
+.L_bn_div_words_body:
+	$SRL	$DH,$a2,4*$BNSZ	# bits
+	sgeu	$at,$a0,$a2
+	.set	noreorder
+	bnezl	$at,.+8
+	$SUBU	$a0,$a2
+	.set	reorder
+
+	li	$QT,-1
+	$SRL	$HH,$a0,4*$BNSZ	# bits
+	$SRL	$QT,4*$BNSZ	# q=0xffffffff
+	beq	$DH,$HH,.L_bn_div_words_skip_div1
+	$DIVU	$zero,$a0,$DH
+	mflo	$QT
+.L_bn_div_words_skip_div1:
+	$MULTU	$a2,$QT
+	$SLL	$t3,$a0,4*$BNSZ	# bits
+	$SRL	$at,$a1,4*$BNSZ	# bits
+	or	$t3,$at
+	mflo	$t0
+	mfhi	$t1
+.L_bn_div_words_inner_loop1:
+	sltu	$t2,$t3,$t0
+	seq	$t8,$HH,$t1
+	sltu	$at,$HH,$t1
+	and	$t2,$t8
+	sltu	$v0,$t0,$a2
+	or	$at,$t2
+	.set	noreorder
+	beqz	$at,.L_bn_div_words_inner_loop1_done
+	$SUBU	$t1,$v0
+	$SUBU	$t0,$a2
+	b	.L_bn_div_words_inner_loop1
+	$SUBU	$QT,1
+	.set	reorder
+.L_bn_div_words_inner_loop1_done:
+
+	$SLL	$a1,4*$BNSZ	# bits
+	$SUBU	$a0,$t3,$t0
+	$SLL	$v0,$QT,4*$BNSZ	# bits
+
+	li	$QT,-1
+	$SRL	$HH,$a0,4*$BNSZ	# bits
+	$SRL	$QT,4*$BNSZ	# q=0xffffffff
+	beq	$DH,$HH,.L_bn_div_words_skip_div2
+	$DIVU	$zero,$a0,$DH
+	mflo	$QT
+.L_bn_div_words_skip_div2:
+	$MULTU	$a2,$QT
+	$SLL	$t3,$a0,4*$BNSZ	# bits
+	$SRL	$at,$a1,4*$BNSZ	# bits
+	or	$t3,$at
+	mflo	$t0
+	mfhi	$t1
+.L_bn_div_words_inner_loop2:
+	sltu	$t2,$t3,$t0
+	seq	$t8,$HH,$t1
+	sltu	$at,$HH,$t1
+	and	$t2,$t8
+	sltu	$v1,$t0,$a2
+	or	$at,$t2
+	.set	noreorder
+	beqz	$at,.L_bn_div_words_inner_loop2_done
+	$SUBU	$t1,$v1
+	$SUBU	$t0,$a2
+	b	.L_bn_div_words_inner_loop2
+	$SUBU	$QT,1
+	.set	reorder
+.L_bn_div_words_inner_loop2_done:
+
+	$SUBU	$a0,$t3,$t0
+	or	$v0,$QT
+	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
+	$SRL	$a2,$t9		# restore $a2
+
+	.set	noreorder
+	move	$a1,$v1
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$t3,4*$SZREG($sp)
+	$REG_L	$t2,3*$SZREG($sp)
+	$REG_L	$t1,2*$SZREG($sp)
+	$REG_L	$t0,1*$SZREG($sp)
+	$REG_L	$gp,0*$SZREG($sp)
+	$PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+	jr	$ra
+	move	$a0,$v0
+.end	bn_div_words_internal
+___
+undef $HH; undef $QT; undef $DH;
+
+($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
+($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);
+
+($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1
+($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2
+
+($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);
+
+$code.=<<___;
+
+.align	5
+.globl	bn_mul_comba8
+.ent	bn_mul_comba8
+bn_mul_comba8:
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	.frame	$sp,12*$SZREG,$ra
+	.mask	0x803ff008,-$SZREG
+	$PTR_SUB $sp,12*$SZREG
+	$REG_S	$ra,11*$SZREG($sp)
+	$REG_S	$s5,10*$SZREG($sp)
+	$REG_S	$s4,9*$SZREG($sp)
+	$REG_S	$s3,8*$SZREG($sp)
+	$REG_S	$s2,7*$SZREG($sp)
+	$REG_S	$s1,6*$SZREG($sp)
+	$REG_S	$s0,5*$SZREG($sp)
+	$REG_S	$t3,4*$SZREG($sp)
+	$REG_S	$t2,3*$SZREG($sp)
+	$REG_S	$t1,2*$SZREG($sp)
+	$REG_S	$t0,1*$SZREG($sp)
+	$REG_S	$gp,0*$SZREG($sp)
+___
+$code.=<<___ if ($flavour !~ /nubi/i);
+	.frame	$sp,6*$SZREG,$ra
+	.mask	0x003f0000,-$SZREG
+	$PTR_SUB $sp,6*$SZREG
+	$REG_S	$s5,5*$SZREG($sp)
+	$REG_S	$s4,4*$SZREG($sp)
+	$REG_S	$s3,3*$SZREG($sp)
+	$REG_S	$s2,2*$SZREG($sp)
+	$REG_S	$s1,1*$SZREG($sp)
+	$REG_S	$s0,0*$SZREG($sp)
+___
+$code.=<<___;
+
+	.set	reorder
+	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
+				# R5000 box assembler barks on this
+				# 1ine with "should not have mult/div
+				# as last instruction in bb (R10K
+				# bug)" warning. If anybody out there
+				# has a clue about how to circumvent
+				# this do send me a note.
+				#		<appro\@fy.chalmers.se>
+
+	$LD	$b_0,0($a2)
+	$LD	$a_1,$BNSZ($a1)
+	$LD	$a_2,2*$BNSZ($a1)
+	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
+	$LD	$a_3,3*$BNSZ($a1)
+	$LD	$b_1,$BNSZ($a2)
+	$LD	$b_2,2*$BNSZ($a2)
+	$LD	$b_3,3*$BNSZ($a2)
+	mflo	$c_1
+	mfhi	$c_2
+
+	$LD	$a_4,4*$BNSZ($a1)
+	$LD	$a_5,5*$BNSZ($a1)
+	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
+	$LD	$a_6,6*$BNSZ($a1)
+	$LD	$a_7,7*$BNSZ($a1)
+	$LD	$b_4,4*$BNSZ($a2)
+	$LD	$b_5,5*$BNSZ($a2)
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
+	$ADDU	$c_3,$t_2,$at
+	$LD	$b_6,6*$BNSZ($a2)
+	$LD	$b_7,7*$BNSZ($a2)
+	$ST	$c_1,0($a0)	# r[0]=c1;
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$c_1,$c_3,$t_2
+	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$c_2,$c_1,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$c_3,$c_2,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	 $MULTU	$a_4,$b_0		# mul_add_c(a[4],b[0],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$c_1,$c_3,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_0,$b_4		# mul_add_c(a[0],b[4],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	 $MULTU	$a_0,$b_5		# mul_add_c(a[0],b[5],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_1,$b_4		# mul_add_c(a[1],b[4],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$c_2,$c_1,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_4,$b_1		# mul_add_c(a[4],b[1],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_5,$b_0		# mul_add_c(a[5],b[0],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	 $MULTU	$a_6,$b_0		# mul_add_c(a[6],b[0],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_5,$b_1		# mul_add_c(a[5],b[1],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$c_3,$c_2,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_4,$b_2		# mul_add_c(a[4],b[2],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_2,$b_4		# mul_add_c(a[2],b[4],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_1,$b_5		# mul_add_c(a[1],b[5],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_0,$b_6		# mul_add_c(a[0],b[6],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	 $MULTU	$a_0,$b_7		# mul_add_c(a[0],b[7],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_1,$b_6		# mul_add_c(a[1],b[6],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$c_1,$c_3,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_2,$b_5		# mul_add_c(a[2],b[5],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_3,$b_4		# mul_add_c(a[3],b[4],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_4,$b_3		# mul_add_c(a[4],b[3],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_5,$b_2		# mul_add_c(a[5],b[2],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_6,$b_1		# mul_add_c(a[6],b[1],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_7,$b_0		# mul_add_c(a[7],b[0],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	 $MULTU	$a_7,$b_1		# mul_add_c(a[7],b[1],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_6,$b_2		# mul_add_c(a[6],b[2],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$c_2,$c_1,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_5,$b_3		# mul_add_c(a[5],b[3],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_4,$b_4		# mul_add_c(a[4],b[4],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_3,$b_5		# mul_add_c(a[3],b[5],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_2,$b_6		# mul_add_c(a[2],b[6],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_1,$b_7		# mul_add_c(a[1],b[7],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	 $MULTU	$a_2,$b_7		# mul_add_c(a[2],b[7],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_3,$b_6		# mul_add_c(a[3],b[6],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$c_3,$c_2,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_4,$b_5		# mul_add_c(a[4],b[5],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_5,$b_4		# mul_add_c(a[5],b[4],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_6,$b_3		# mul_add_c(a[6],b[3],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_7,$b_2		# mul_add_c(a[7],b[2],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	 $MULTU	$a_7,$b_3		# mul_add_c(a[7],b[3],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_6,$b_4		# mul_add_c(a[6],b[4],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$c_1,$c_3,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_5,$b_5		# mul_add_c(a[5],b[5],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_4,$b_6		# mul_add_c(a[4],b[6],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_3,$b_7		# mul_add_c(a[3],b[7],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_4,$b_7		# mul_add_c(a[4],b[7],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_5,$b_6		# mul_add_c(a[5],b[6],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$c_2,$c_1,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_6,$b_5		# mul_add_c(a[6],b[5],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_7,$b_4		# mul_add_c(a[7],b[4],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	 $MULTU	$a_7,$b_5		# mul_add_c(a[7],b[5],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_6,$b_6		# mul_add_c(a[6],b[6],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$c_3,$c_2,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_5,$b_7		# mul_add_c(a[5],b[7],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	 $MULTU	$a_6,$b_7		# mul_add_c(a[6],b[7],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_7,$b_6		# mul_add_c(a[7],b[6],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$c_1,$c_3,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
+	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;
+
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$s5,10*$SZREG($sp)
+	$REG_L	$s4,9*$SZREG($sp)
+	$REG_L	$s3,8*$SZREG($sp)
+	$REG_L	$s2,7*$SZREG($sp)
+	$REG_L	$s1,6*$SZREG($sp)
+	$REG_L	$s0,5*$SZREG($sp)
+	$REG_L	$t3,4*$SZREG($sp)
+	$REG_L	$t2,3*$SZREG($sp)
+	$REG_L	$t1,2*$SZREG($sp)
+	$REG_L	$t0,1*$SZREG($sp)
+	$REG_L	$gp,0*$SZREG($sp)
+	jr	$ra
+	$PTR_ADD $sp,12*$SZREG
+___
+$code.=<<___ if ($flavour !~ /nubi/i);
+	$REG_L	$s5,5*$SZREG($sp)
+	$REG_L	$s4,4*$SZREG($sp)
+	$REG_L	$s3,3*$SZREG($sp)
+	$REG_L	$s2,2*$SZREG($sp)
+	$REG_L	$s1,1*$SZREG($sp)
+	$REG_L	$s0,0*$SZREG($sp)
+	jr	$ra
+	$PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+.end	bn_mul_comba8
+
+.align	5
+.globl	bn_mul_comba4
+.ent	bn_mul_comba4
+bn_mul_comba4:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	.frame	$sp,6*$SZREG,$ra
+	.mask	0x8000f008,-$SZREG
+	.set	noreorder
+	$PTR_SUB $sp,6*$SZREG
+	$REG_S	$ra,5*$SZREG($sp)
+	$REG_S	$t3,4*$SZREG($sp)
+	$REG_S	$t2,3*$SZREG($sp)
+	$REG_S	$t1,2*$SZREG($sp)
+	$REG_S	$t0,1*$SZREG($sp)
+	$REG_S	$gp,0*$SZREG($sp)
+___
+$code.=<<___;
+	.set	reorder
+	$LD	$a_0,0($a1)
+	$LD	$b_0,0($a2)
+	$LD	$a_1,$BNSZ($a1)
+	$LD	$a_2,2*$BNSZ($a1)
+	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
+	$LD	$a_3,3*$BNSZ($a1)
+	$LD	$b_1,$BNSZ($a2)
+	$LD	$b_2,2*$BNSZ($a2)
+	$LD	$b_3,3*$BNSZ($a2)
+	mflo	$c_1
+	mfhi	$c_2
+	$ST	$c_1,0($a0)
+
+	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
+	$ADDU	$c_3,$t_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$c_1,$c_3,$t_2
+	$ST	$c_2,$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$c_2,$c_1,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,2*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$c_3,$c_2,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	 $MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	$ST	$c_1,3*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$c_1,$c_3,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	 $MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	$ST	$c_2,4*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$c_2,$c_1,$t_2
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	 $MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,5*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	$ST	$c_1,6*$BNSZ($a0)
+	$ST	$c_2,7*$BNSZ($a0)
+
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$t3,4*$SZREG($sp)
+	$REG_L	$t2,3*$SZREG($sp)
+	$REG_L	$t1,2*$SZREG($sp)
+	$REG_L	$t0,1*$SZREG($sp)
+	$REG_L	$gp,0*$SZREG($sp)
+	$PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+	jr	$ra
+	nop
+.end	bn_mul_comba4
+___
+
+($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
+
+$code.=<<___;
+
+.align	5
+.globl	bn_sqr_comba8
+.ent	bn_sqr_comba8
+bn_sqr_comba8:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	.frame	$sp,6*$SZREG,$ra
+	.mask	0x8000f008,-$SZREG
+	.set	noreorder
+	$PTR_SUB $sp,6*$SZREG
+	$REG_S	$ra,5*$SZREG($sp)
+	$REG_S	$t3,4*$SZREG($sp)
+	$REG_S	$t2,3*$SZREG($sp)
+	$REG_S	$t1,2*$SZREG($sp)
+	$REG_S	$t0,1*$SZREG($sp)
+	$REG_S	$gp,0*$SZREG($sp)
+___
+$code.=<<___;
+	.set	reorder
+	$LD	$a_0,0($a1)
+	$LD	$a_1,$BNSZ($a1)
+	$LD	$a_2,2*$BNSZ($a1)
+	$LD	$a_3,3*$BNSZ($a1)
+
+	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
+	$LD	$a_4,4*$BNSZ($a1)
+	$LD	$a_5,5*$BNSZ($a1)
+	$LD	$a_6,6*$BNSZ($a1)
+	$LD	$a_7,7*$BNSZ($a1)
+	mflo	$c_1
+	mfhi	$c_2
+	$ST	$c_1,0($a0)
+
+	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_1,$t_2,$zero
+	$SLL	$t_2,1
+	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$c_3,$t_2,$at
+	$ST	$c_2,$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_2,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,2*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_3,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_1,$a_2		# mul_add_c2(a[1],b[2],c1,c2,c3);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_3,$at
+	 $MULTU	$a_4,$a_0		# mul_add_c2(a[4],b[0],c2,c3,c1);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	$ST	$c_1,3*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_1,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_1,$at
+	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	 $MULTU	$a_0,$a_5		# mul_add_c2(a[0],b[5],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	$ST	$c_2,4*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_2,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_1,$a_4		# mul_add_c2(a[1],b[4],c3,c1,c2);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_2,$at
+	$MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	 $MULTU	$a_6,$a_0		# mul_add_c2(a[6],b[0],c1,c2,c3);
+	$ADDU	$c_2,$at
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,5*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_3,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_5,$a_1		# mul_add_c2(a[5],b[1],c1,c2,c3);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_3,$at
+	$MULTU	$a_4,$a_2		# mul_add_c2(a[4],b[2],c1,c2,c3);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_3,$at
+	$MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	 $MULTU	$a_0,$a_7		# mul_add_c2(a[0],b[7],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	$ST	$c_1,6*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_1,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_1,$a_6		# mul_add_c2(a[1],b[6],c2,c3,c1);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_1,$at
+	$MULTU	$a_2,$a_5		# mul_add_c2(a[2],b[5],c2,c3,c1);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_1,$at
+	$MULTU	$a_3,$a_4		# mul_add_c2(a[3],b[4],c2,c3,c1);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_1,$at
+	 $MULTU	$a_7,$a_1		# mul_add_c2(a[7],b[1],c3,c1,c2);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	$ST	$c_2,7*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_2,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_6,$a_2		# mul_add_c2(a[6],b[2],c3,c1,c2);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_2,$at
+	$MULTU	$a_5,$a_3		# mul_add_c2(a[5],b[3],c3,c1,c2);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_2,$at
+	$MULTU	$a_4,$a_4		# mul_add_c(a[4],b[4],c3,c1,c2);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	 $MULTU	$a_2,$a_7		# mul_add_c2(a[2],b[7],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,8*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_3,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_3,$a_6		# mul_add_c2(a[3],b[6],c1,c2,c3);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_3,$at
+	$MULTU	$a_4,$a_5		# mul_add_c2(a[4],b[5],c1,c2,c3);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_3,$at
+	 $MULTU	$a_7,$a_3		# mul_add_c2(a[7],b[3],c2,c3,c1);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	$ST	$c_1,9*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_1,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_6,$a_4		# mul_add_c2(a[6],b[4],c2,c3,c1);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_1,$at
+	$MULTU	$a_5,$a_5		# mul_add_c(a[5],b[5],c2,c3,c1);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	 $MULTU	$a_4,$a_7		# mul_add_c2(a[4],b[7],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	$ST	$c_2,10*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_2,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_5,$a_6		# mul_add_c2(a[5],b[6],c3,c1,c2);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_2,$at
+	 $MULTU	$a_7,$a_5		# mul_add_c2(a[7],b[5],c1,c2,c3);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,11*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_3,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_6,$a_6		# mul_add_c(a[6],b[6],c1,c2,c3);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	 $MULTU	$a_6,$a_7		# mul_add_c2(a[6],b[7],c2,c3,c1);
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	$ST	$c_1,12*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_1,$t_2,$zero
+	$SLL	$t_2,1
+	 $MULTU	$a_7,$a_7		# mul_add_c(a[7],b[7],c3,c1,c2);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	$ST	$c_2,13*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	$ST	$c_3,14*$BNSZ($a0)
+	$ST	$c_1,15*$BNSZ($a0)
+
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$t3,4*$SZREG($sp)
+	$REG_L	$t2,3*$SZREG($sp)
+	$REG_L	$t1,2*$SZREG($sp)
+	$REG_L	$t0,1*$SZREG($sp)
+	$REG_L	$gp,0*$SZREG($sp)
+	$PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+	jr	$ra
+	nop
+.end	bn_sqr_comba8
+
+.align	5
+.globl	bn_sqr_comba4
+.ent	bn_sqr_comba4
+bn_sqr_comba4:
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	.frame	$sp,6*$SZREG,$ra
+	.mask	0x8000f008,-$SZREG
+	.set	noreorder
+	$PTR_SUB $sp,6*$SZREG
+	$REG_S	$ra,5*$SZREG($sp)
+	$REG_S	$t3,4*$SZREG($sp)
+	$REG_S	$t2,3*$SZREG($sp)
+	$REG_S	$t1,2*$SZREG($sp)
+	$REG_S	$t0,1*$SZREG($sp)
+	$REG_S	$gp,0*$SZREG($sp)
+___
+$code.=<<___;
+	.set	reorder
+	$LD	$a_0,0($a1)
+	$LD	$a_1,$BNSZ($a1)
+	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
+	$LD	$a_2,2*$BNSZ($a1)
+	$LD	$a_3,3*$BNSZ($a1)
+	mflo	$c_1
+	mfhi	$c_2
+	$ST	$c_1,0($a0)
+
+	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_1,$t_2,$zero
+	$SLL	$t_2,1
+	 $MULTU	$a_2,$a_0		# mul_add_c2(a[2],b[0],c3,c1,c2);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$c_3,$t_2,$at
+	$ST	$c_2,$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_2,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_1,$a_1		# mul_add_c(a[1],b[1],c3,c1,c2);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	 $MULTU	$a_0,$a_3		# mul_add_c2(a[0],b[3],c1,c2,c3);
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,2*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_3,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_1,$a_2		# mul_add_c(a2[1],b[2],c1,c2,c3);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$at,$t_2,$zero
+	$ADDU	$c_3,$at
+	 $MULTU	$a_3,$a_1		# mul_add_c2(a[3],b[1],c2,c3,c1);
+	$SLL	$t_2,1
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	sltu	$at,$c_2,$t_2
+	$ADDU	$c_3,$at
+	$ST	$c_1,3*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_1,$t_2,$zero
+	$SLL	$t_2,1
+	$MULTU	$a_2,$a_2		# mul_add_c(a[2],b[2],c2,c3,c1);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_2,$t_1
+	sltu	$at,$c_2,$t_1
+	 $MULTU	$a_2,$a_3		# mul_add_c2(a[2],b[3],c3,c1,c2);
+	$ADDU	$t_2,$at
+	$ADDU	$c_3,$t_2
+	sltu	$at,$c_3,$t_2
+	$ADDU	$c_1,$at
+	$ST	$c_2,4*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	slt	$c_2,$t_2,$zero
+	$SLL	$t_2,1
+	 $MULTU	$a_3,$a_3		# mul_add_c(a[3],b[3],c1,c2,c3);
+	slt	$a2,$t_1,$zero
+	$ADDU	$t_2,$a2
+	$SLL	$t_1,1
+	$ADDU	$c_3,$t_1
+	sltu	$at,$c_3,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_1,$t_2
+	sltu	$at,$c_1,$t_2
+	$ADDU	$c_2,$at
+	$ST	$c_3,5*$BNSZ($a0)
+
+	mflo	$t_1
+	mfhi	$t_2
+	$ADDU	$c_1,$t_1
+	sltu	$at,$c_1,$t_1
+	$ADDU	$t_2,$at
+	$ADDU	$c_2,$t_2
+	$ST	$c_1,6*$BNSZ($a0)
+	$ST	$c_2,7*$BNSZ($a0)
+
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$t3,4*$SZREG($sp)
+	$REG_L	$t2,3*$SZREG($sp)
+	$REG_L	$t1,2*$SZREG($sp)
+	$REG_L	$t0,1*$SZREG($sp)
+	$REG_L	$gp,0*$SZREG($sp)
+	$PTR_ADD $sp,6*$SZREG
+___
+$code.=<<___;
+	jr	$ra
+	nop
+.end	bn_sqr_comba4
+___
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl
new file mode 100644
index 0000000000..54aeb01921
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl
@@ -0,0 +1,1496 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2010-2011 Intel Corp.
+#   Author: Vinodh.Gopal@intel.com
+#           Jim Guilford
+#           Erdinc.Ozturk@intel.com
+#           Maxim.Perminov@intel.com
+#
+# More information about algorithm used can be found at:
+#   http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
+#
+# ====================================================================
+# Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the
+#    distribution.
+#
+# 3. All advertising materials mentioning features or use of this
+#    software must display the following acknowledgment:
+#    "This product includes software developed by the OpenSSL Project
+#    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+#
+# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+#    endorse or promote products derived from this software without
+#    prior written permission. For written permission, please contact
+#    licensing@OpenSSL.org.
+#
+# 5. Products derived from this software may not be called "OpenSSL"
+#    nor may "OpenSSL" appear in their names without prior written
+#    permission of the OpenSSL Project.
+#
+# 6. Redistributions of any form whatsoever must retain the following
+#    acknowledgment:
+#    "This product includes software developed by the OpenSSL Project
+#    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+#
+# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+# OF THE POSSIBILITY OF SUCH DAMAGE.
+# ====================================================================
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+use strict;
+my $code=".text\n\n";
+my $m=0;
+
+#
+# Define x512 macros
+#
+
+#MULSTEP_512_ADD	MACRO	x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2
+#
+# uses rax, rdx, and args
+sub MULSTEP_512_ADD
+{
+ my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
+ my @X=@$x;	# make a copy
+$code.=<<___;
+	 mov	(+8*0)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [0]
+	 mov	($ASRC), $X[0]
+	 add	%rax, $X[0]
+	 adc	\$0, %rdx
+	 mov	$X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	%rdx, $TMP
+
+	 mov	(+8*$i)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [$i]
+	 mov	(+8*$i)($ASRC), $X[$i]
+	 add	%rax, $X[$i]
+	 adc	\$0, %rdx
+	 add	$TMP, $X[$i]
+	 adc	\$0, %rdx
+___
+}
+$code.=<<___;
+	 mov	%rdx, $X[0]
+___
+}
+
+#MULSTEP_512	MACRO	x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp
+#
+# uses rax, rdx, and args
+sub MULSTEP_512
+{
+ my ($x, $DST, $SRC2, $OP, $TMP)=@_;
+ my @X=@$x;	# make a copy
+$code.=<<___;
+	 mov	(+8*0)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [0]
+	 add	%rax, $X[0]
+	 adc	\$0, %rdx
+	 mov	$X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	%rdx, $TMP
+
+	 mov	(+8*$i)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [$i]
+	 add	%rax, $X[$i]
+	 adc	\$0, %rdx
+	 add	$TMP, $X[$i]
+	 adc	\$0, %rdx
+___
+}
+$code.=<<___;
+	 mov	%rdx, $X[0]
+___
+}
+
+#
+# Swizzle Macros
+#
+
+# macro to copy data from flat space to swizzled table
+#MACRO swizzle	pDst, pSrc, tmp1, tmp2
+# pDst and pSrc are modified
+sub swizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0)=@_;
+$code.=<<___;
+	 mov	\$8, $cnt
+loop_$m:
+	 mov	($pSrc), $d0
+	 mov	$d0#w, ($pDst)
+	 shr	\$16, $d0
+	 mov	$d0#w, (+64*1)($pDst)
+	 shr	\$16, $d0
+	 mov	$d0#w, (+64*2)($pDst)
+	 shr	\$16, $d0
+	 mov	$d0#w, (+64*3)($pDst)
+	 lea	8($pSrc), $pSrc
+	 lea	64*4($pDst), $pDst
+	 dec	$cnt
+	 jnz	loop_$m
+___
+
+ $m++;
+}
+
+# macro to copy data from swizzled table to  flat space
+#MACRO unswizzle	pDst, pSrc, tmp*3
+sub unswizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
+$code.=<<___;
+	 mov	\$4, $cnt
+loop_$m:
+	 movzxw	(+64*3+256*0)($pSrc), $d0
+	 movzxw	(+64*3+256*1)($pSrc), $d1
+	 shl	\$16, $d0
+	 shl	\$16, $d1
+	 mov	(+64*2+256*0)($pSrc), $d0#w
+	 mov	(+64*2+256*1)($pSrc), $d1#w
+	 shl	\$16, $d0
+	 shl	\$16, $d1
+	 mov	(+64*1+256*0)($pSrc), $d0#w
+	 mov	(+64*1+256*1)($pSrc), $d1#w
+	 shl	\$16, $d0
+	 shl	\$16, $d1
+	 mov	(+64*0+256*0)($pSrc), $d0#w
+	 mov	(+64*0+256*1)($pSrc), $d1#w
+	 mov	$d0, (+8*0)($pDst)
+	 mov	$d1, (+8*1)($pDst)
+	 lea	256*2($pSrc), $pSrc
+	 lea	8*2($pDst), $pDst
+	 sub	\$1, $cnt
+	 jnz	loop_$m
+___
+
+ $m++;
+}
+
+#
+# Data Structures
+#
+
+# Reduce Data
+#
+#
+# Offset  Value
+# 0C0     Carries
+# 0B8     X2[10]
+# 0B0     X2[9]
+# 0A8     X2[8]
+# 0A0     X2[7]
+# 098     X2[6]
+# 090     X2[5]
+# 088     X2[4]
+# 080     X2[3]
+# 078     X2[2]
+# 070     X2[1]
+# 068     X2[0]
+# 060     X1[12]  P[10]
+# 058     X1[11]  P[9]  Z[8]
+# 050     X1[10]  P[8]  Z[7]
+# 048     X1[9]   P[7]  Z[6]
+# 040     X1[8]   P[6]  Z[5]
+# 038     X1[7]   P[5]  Z[4]
+# 030     X1[6]   P[4]  Z[3]
+# 028     X1[5]   P[3]  Z[2]
+# 020     X1[4]   P[2]  Z[1]
+# 018     X1[3]   P[1]  Z[0]
+# 010     X1[2]   P[0]  Y[2]
+# 008     X1[1]   Q[1]  Y[1]
+# 000     X1[0]   Q[0]  Y[0]
+
+my $X1_offset           =  0;			# 13 qwords
+my $X2_offset           =  $X1_offset + 13*8;			# 11 qwords
+my $Carries_offset      =  $X2_offset + 11*8;			# 1 qword
+my $Q_offset            =  0;			# 2 qwords
+my $P_offset            =  $Q_offset + 2*8;			# 11 qwords
+my $Y_offset            =  0;			# 3 qwords
+my $Z_offset            =  $Y_offset + 3*8;			# 9 qwords
+
+my $Red_Data_Size       =  $Carries_offset + 1*8;			# (25 qwords)
+
+#
+# Stack Frame
+#
+#
+# offset	value
+# ...		<old stack contents>
+# ...
+# 280		Garray
+
+# 278		tmp16[15]
+# ...		...
+# 200		tmp16[0]
+
+# 1F8		tmp[7]
+# ...		...
+# 1C0		tmp[0]
+
+# 1B8		GT[7]
+# ...		...
+# 180		GT[0]
+
+# 178		Reduce Data
+# ...		...
+# 0B8		Reduce Data
+# 0B0		reserved
+# 0A8		reserved
+# 0A0		reserved
+# 098		reserved
+# 090		reserved
+# 088		reduce result addr
+# 080		exp[8]
+
+# ...
+# 048		exp[1]
+# 040		exp[0]
+
+# 038		reserved
+# 030		loop_idx
+# 028		pg
+# 020		i
+# 018		pData	; arg 4
+# 010		pG	; arg 2
+# 008		pResult	; arg 1
+# 000		rsp	; stack pointer before subtract
+
+my $rsp_offset          =  0;
+my $pResult_offset      =  8*1 + $rsp_offset;
+my $pG_offset           =  8*1 + $pResult_offset;
+my $pData_offset        =  8*1 + $pG_offset;
+my $i_offset            =  8*1 + $pData_offset;
+my $pg_offset           =  8*1 + $i_offset;
+my $loop_idx_offset     =  8*1 + $pg_offset;
+my $reserved1_offset    =  8*1 + $loop_idx_offset;
+my $exp_offset          =  8*1 + $reserved1_offset;
+my $red_result_addr_offset=  8*9 + $exp_offset;
+my $reserved2_offset    =  8*1 + $red_result_addr_offset;
+my $Reduce_Data_offset  =  8*5 + $reserved2_offset;
+my $GT_offset           =  $Red_Data_Size + $Reduce_Data_offset;
+my $tmp_offset          =  8*8 + $GT_offset;
+my $tmp16_offset        =  8*8 + $tmp_offset;
+my $garray_offset       =  8*16 + $tmp16_offset;
+my $mem_size            =  8*8*32 + $garray_offset;
+
+#
+# Offsets within Reduce Data
+#
+#
+#	struct MODF_2FOLD_MONT_512_C1_DATA {
+#	UINT64 t[8][8];
+#	UINT64 m[8];
+#	UINT64 m1[8]; /* 2^768 % m */
+#	UINT64 m2[8]; /* 2^640 % m */
+#	UINT64 k1[2]; /* (- 1/m) % 2^128 */
+#	};
+
+my $T                   =  0;
+my $M                   =  512;			# = 8 * 8 * 8
+my $M1                  =  576;			# = 8 * 8 * 9 /* += 8 * 8 */
+my $M2                  =  640;			# = 8 * 8 * 10 /* += 8 * 8 */
+my $K1                  =  704;			# = 8 * 8 * 11 /* += 8 * 8 */
+
+#
+#   FUNCTIONS
+#
+
+{{{
+#
+# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
+#                       and add 512-bits (8 qwords)
+#                       to get 640 bits (10 qwords)
+# Input: 128-bit mul source: [rdi+8*1], rbp
+#        512-bit mul source: [rsi+8*n]
+#        512-bit add source: r15, r14, ..., r9, r8
+# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
+# Clobbers all regs except: rcx, rsi, rdi
+$code.=<<___;
+.type	MULADD_128x512,\@abi-omnipotent
+.align	16
+MULADD_128x512:
+___
+	&MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+	 mov	(+8*1)(%rdi), %rbp
+___
+	&MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+	 ret
+.size	MULADD_128x512,.-MULADD_128x512
+___
+}}}
+
+{{{
+#MULADD_256x512	MACRO	pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
+#
+# Inputs: pDst: Destination  (768 bits, 12 qwords)
+#         pA:   Multiplicand (1024 bits, 16 qwords)
+#         pB:   Multiplicand (512 bits, 8 qwords)
+# Dst = Ah * B + Al
+# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
+# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
+# Uses registers: arguments, RAX, RDX
+sub MULADD_256x512
+{
+ my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;
+$code.=<<___;
+	mov	(+8*12)($pA), $OP
+___
+	&MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP);
+	push(@$X,shift(@$X));
+
+$code.=<<___;
+	 mov	(+8*13)($pA), $OP
+___
+	&MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP);
+	push(@$X,shift(@$X));
+
+$code.=<<___;
+	 mov	(+8*14)($pA), $OP
+___
+	&MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP);
+	push(@$X,shift(@$X));
+
+$code.=<<___;
+	 mov	(+8*15)($pA), $OP
+___
+	&MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP);
+	push(@$X,shift(@$X));
+}
+
+#
+# mont_reduce(UINT64 *x,  /* 1024 bits, 16 qwords */
+#	       UINT64 *m,  /*  512 bits,  8 qwords */
+#	       MODF_2FOLD_MONT_512_C1_DATA *data,
+#             UINT64 *r)  /*  512 bits,  8 qwords */
+# Input:  x (number to be reduced): tmp16 (Implicit)
+#         m (modulus):              [pM]  (Implicit)
+#         data (reduce data):       [pData] (Implicit)
+# Output: r (result):		     Address in [red_res_addr]
+#         result also in: r9, r8, r15, r14, r13, r12, r11, r10
+
+my @X=map("%r$_",(8..15));
+
+$code.=<<___;
+.type	mont_reduce,\@abi-omnipotent
+.align	16
+mont_reduce:
+___
+
+my $STACK_DEPTH         =  8;
+	#
+	# X1 = Xh * M1 + Xl
+$code.=<<___;
+	 lea	(+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi			# pX1 (Dst) 769 bits, 13 qwords
+	 mov	(+$pData_offset+$STACK_DEPTH)(%rsp), %rsi			# pM1 (Bsrc) 512 bits, 8 qwords
+	 add	\$$M1, %rsi
+	 lea	(+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx			# X (Asrc) 1024 bits, 16 qwords
+
+___
+
+	&MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X);	# rotates @X 4 times
+	# results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]
+
+$code.=<<___;
+	 xor	%rax, %rax
+	# X1 += xl
+	 add	(+8*8)(%rcx), $X[4]
+	 adc	(+8*9)(%rcx), $X[5]
+	 adc	(+8*10)(%rcx), $X[6]
+	 adc	(+8*11)(%rcx), $X[7]
+	 adc	\$0, %rax
+	# X1 is now rax, r11-r8, r15-r12, tmp16[3:0]
+
+	#
+	# check for carry ;; carry stored in rax
+	 mov	$X[4], (+8*8)(%rdi)			# rdi points to X1
+	 mov	$X[5], (+8*9)(%rdi)
+	 mov	$X[6], %rbp
+	 mov	$X[7], (+8*11)(%rdi)
+
+	 mov	%rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+	 mov	(+8*0)(%rdi), $X[4]
+	 mov	(+8*1)(%rdi), $X[5]
+	 mov	(+8*2)(%rdi), $X[6]
+	 mov	(+8*3)(%rdi), $X[7]
+
+	# X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
+	# rdi -> X1
+	# rsi -> M1
+
+	#
+	# X2 = Xh * M2 + Xl
+	# do first part (X2 = Xh * M2)
+	 add	\$8*10, %rdi			# rdi -> pXh ; 128 bits, 2 qwords
+				#        Xh is actually { [rdi+8*1], rbp }
+	 add	\$`$M2-$M1`, %rsi			# rsi -> M2
+	 lea	(+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx			# rcx -> pX2 ; 641 bits, 11 qwords
+___
+	unshift(@X,pop(@X));	unshift(@X,pop(@X));
+$code.=<<___;
+
+	 call	MULADD_128x512			# args in rcx, rdi / rbp, rsi, r15-r8
+	# result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+	 mov	(+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax
+
+	# X2 += Xl
+	 add	(+8*8-8*10)(%rdi), $X[6]		# (-8*10) is to adjust rdi -> Xh to Xl
+	 adc	(+8*9-8*10)(%rdi), $X[7]
+	 mov	$X[6], (+8*8)(%rcx)
+	 mov	$X[7], (+8*9)(%rcx)
+
+	 adc	%rax, %rax
+	 mov	%rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+	 lea	(+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi			# rdi -> pQ ; 128 bits, 2 qwords
+	 add	\$`$K1-$M2`, %rsi			# rsi -> pK1 ; 128 bits, 2 qwords
+
+	# MUL_128x128t128	rdi, rcx, rsi	; Q = X2 * K1 (bottom half)
+	# B1:B0 = rsi[1:0] = K1[1:0]
+	# A1:A0 = rcx[1:0] = X2[1:0]
+	# Result = rdi[1],rbp = Q[1],rbp
+	 mov	(%rsi), %r8			# B0
+	 mov	(+8*1)(%rsi), %rbx			# B1
+
+	 mov	(%rcx), %rax			# A0
+	 mul	%r8			# B0
+	 mov	%rax, %rbp
+	 mov	%rdx, %r9
+
+	 mov	(+8*1)(%rcx), %rax			# A1
+	 mul	%r8			# B0
+	 add	%rax, %r9
+
+	 mov	(%rcx), %rax			# A0
+	 mul	%rbx			# B1
+	 add	%rax, %r9
+
+	 mov	%r9, (+8*1)(%rdi)
+	# end MUL_128x128t128
+
+	 sub	\$`$K1-$M`, %rsi
+
+	 mov	(%rcx), $X[6]
+	 mov	(+8*1)(%rcx), $X[7]			# r9:r8 = X2[1:0]
+
+	 call	MULADD_128x512			# args in rcx, rdi / rbp, rsi, r15-r8
+	# result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+
+	# load first half of m to rdx, rdi, rbx, rax
+	# moved this here for efficiency
+	 mov	(+8*0)(%rsi), %rax
+	 mov	(+8*1)(%rsi), %rbx
+	 mov	(+8*2)(%rsi), %rdi
+	 mov	(+8*3)(%rsi), %rdx
+
+	# continue with reduction
+	 mov	(+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp
+
+	 add	(+8*8)(%rcx), $X[6]
+	 adc	(+8*9)(%rcx), $X[7]
+
+	#accumulate the final carry to rbp
+	 adc	%rbp, %rbp
+
+	# Add in overflow corrections: R = (X2>>128) += T[overflow]
+	# R = {r9, r8, r15, r14, ..., r10}
+	 shl	\$3, %rbp
+	 mov	(+$pData_offset+$STACK_DEPTH)(%rsp), %rcx			# rsi -> Data (and points to T)
+	 add	%rcx, %rbp			# pT ; 512 bits, 8 qwords, spread out
+
+	# rsi will be used to generate a mask after the addition
+	 xor	%rsi, %rsi
+
+	 add	(+8*8*0)(%rbp), $X[0]
+	 adc	(+8*8*1)(%rbp), $X[1]
+	 adc	(+8*8*2)(%rbp), $X[2]
+	 adc	(+8*8*3)(%rbp), $X[3]
+	 adc	(+8*8*4)(%rbp), $X[4]
+	 adc	(+8*8*5)(%rbp), $X[5]
+	 adc	(+8*8*6)(%rbp), $X[6]
+	 adc	(+8*8*7)(%rbp), $X[7]
+
+	# if there is a carry:	rsi = 0xFFFFFFFFFFFFFFFF
+	# if carry is clear:	rsi = 0x0000000000000000
+	 sbb	\$0, %rsi
+
+	# if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+	 and	%rsi, %rax
+	 and	%rsi, %rbx
+	 and	%rsi, %rdi
+	 and	%rsi, %rdx
+
+	 mov	\$1, %rbp
+	 sub	%rax, $X[0]
+	 sbb	%rbx, $X[1]
+	 sbb	%rdi, $X[2]
+	 sbb	%rdx, $X[3]
+
+	# if there is a borrow:		rbp = 0
+	# if there is no borrow:	rbp = 1
+	# this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
+	 sbb	\$0, %rbp
+
+	#load second half of m to rdx, rdi, rbx, rax
+
+	 add	\$$M, %rcx
+	 mov	(+8*4)(%rcx), %rax
+	 mov	(+8*5)(%rcx), %rbx
+	 mov	(+8*6)(%rcx), %rdi
+	 mov	(+8*7)(%rcx), %rdx
+
+	# use the rsi mask as before
+	# if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+	 and	%rsi, %rax
+	 and	%rsi, %rbx
+	 and	%rsi, %rdi
+	 and	%rsi, %rdx
+
+	# if rbp = 0, there was a borrow before, it is moved to the carry flag
+	# if rbp = 1, there was not a borrow before, carry flag is cleared
+	 sub	\$1, %rbp
+
+	 sbb	%rax, $X[4]
+	 sbb	%rbx, $X[5]
+	 sbb	%rdi, $X[6]
+	 sbb	%rdx, $X[7]
+
+	# write R back to memory
+
+	 mov	(+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
+	 mov	$X[0], (+8*0)(%rsi)
+	 mov	$X[1], (+8*1)(%rsi)
+	 mov	$X[2], (+8*2)(%rsi)
+	 mov	$X[3], (+8*3)(%rsi)
+	 mov	$X[4], (+8*4)(%rsi)
+	 mov	$X[5], (+8*5)(%rsi)
+	 mov	$X[6], (+8*6)(%rsi)
+	 mov	$X[7], (+8*7)(%rsi)
+
+	 ret
+.size	mont_reduce,.-mont_reduce
+___
+}}}
+
+{{{
+#MUL_512x512	MACRO	pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
+#
+# Inputs: pDst: Destination  (1024 bits, 16 qwords)
+#         pA:   Multiplicand (512 bits, 8 qwords)
+#         pB:   Multiplicand (512 bits, 8 qwords)
+# Uses registers rax, rdx, args
+#   B operand in [pB] and also in x7...x0
+sub MUL_512x512
+{
+ my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_;
+ my ($pDst,  $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x;	# make a copy
+
+$code.=<<___;
+	 mov	(+8*0)($pA), $OP
+
+	 mov	$X[0], %rax
+	 mul	$OP			# rdx:rax = %OP * [0]
+	 mov	%rax, (+$pDst_o+8*0)($pDst)
+	 mov	%rdx, $X[0]
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	$X[$i], %rax
+	 mul	$OP			# rdx:rax = %OP * [$i]
+	 add	%rax, $X[$i-1]
+	 adc	\$0, %rdx
+	 mov	%rdx, $X[$i]
+___
+}
+
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	(+8*$i)($pA), $OP
+___
+
+	&MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
+	push(@X,shift(@X));
+}
+
+$code.=<<___;
+	 mov	$X[0], (+$pDst_o+8*8)($pDst)
+	 mov	$X[1], (+$pDst_o+8*9)($pDst)
+	 mov	$X[2], (+$pDst_o+8*10)($pDst)
+	 mov	$X[3], (+$pDst_o+8*11)($pDst)
+	 mov	$X[4], (+$pDst_o+8*12)($pDst)
+	 mov	$X[5], (+$pDst_o+8*13)($pDst)
+	 mov	$X[6], (+$pDst_o+8*14)($pDst)
+	 mov	$X[7], (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
+# Input:  src1: Address of source 1: rdi
+#         src2: Address of source 2: rsi
+# Output: dst:  Address of destination: [red_res_addr]
+#    src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+# Temp:   Clobbers [tmp16], all registers
+$code.=<<___;
+.type	mont_mul_a3b,\@abi-omnipotent
+.align	16
+mont_mul_a3b:
+	#
+	# multiply tmp = src1 * src2
+	# For multiply: dst = rcx, src1 = rdi, src2 = rsi
+	# stack depth is extra 8 from call
+___
+	&MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
+$code.=<<___;
+	#
+	# Dst = tmp % m
+	# Call reduce(tmp, m, data, dst)
+
+	# tail recursion optimization: jmp to mont_reduce and return from there
+	 jmp	mont_reduce
+	# call	mont_reduce
+	# ret
+.size	mont_mul_a3b,.-mont_mul_a3b
+___
+}}}
+
+{{{
+#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
+#
+# Input in memory [pA] and also in x7...x0
+# Uses all argument registers plus rax and rdx
+#
+# This version computes all of the off-diagonal terms into memory,
+# and then it adds in the diagonal terms
+
+sub SQR_512
+{
+ my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_;
+ my ($pDst,  $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x;	# make a copy
+$code.=<<___;
+	# ------------------
+	# first pass 01...07
+	# ------------------
+	 mov	$X[0], $A
+
+	 mov	$X[1],%rax
+	 mul	$A
+	 mov	%rax, (+$pDst_o+8*1)($pDst)
+___
+for(my $i=2;$i<8;$i++) {
+$code.=<<___;
+	 mov	%rdx, $X[$i-2]
+	 mov	$X[$i],%rax
+	 mul	$A
+	 add	%rax, $X[$i-2]
+	 adc	\$0, %rdx
+___
+}
+$code.=<<___;
+	 mov	%rdx, $x7
+
+	 mov	$X[0], (+$pDst_o+8*2)($pDst)
+
+	# ------------------
+	# second pass 12...17
+	# ------------------
+
+	 mov	(+8*1)($pA), $A
+
+	 mov	(+8*2)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 mov	$X[1], (+$pDst_o+8*3)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*3)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[2]
+	 adc	\$0, %rdx
+	 mov	$X[2], (+$pDst_o+8*4)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*4)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[3]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[3]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[4]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[4]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[5]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $x7
+	 adc	\$0, %rdx
+	 add	$X[0], $x7
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[1]
+
+	# ------------------
+	# third pass 23...27
+	# ------------------
+	 mov	(+8*2)($pA), $A
+
+	 mov	(+8*3)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[3]
+	 adc	\$0, %rdx
+	 mov	$X[3], (+$pDst_o+8*5)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*4)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[4]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[4]
+	 adc	\$0, %rdx
+	 mov	$X[4], (+$pDst_o+8*6)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[5]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $x7
+	 adc	\$0, %rdx
+	 add	$X[0], $x7
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[2]
+
+	# ------------------
+	# fourth pass 34...37
+	# ------------------
+
+	 mov	(+8*3)($pA), $A
+
+	 mov	(+8*4)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 mov	$X[5], (+$pDst_o+8*7)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $x7
+	 adc	\$0, %rdx
+	 add	$X[0], $x7
+	 adc	\$0, %rdx
+	 mov	$x7, (+$pDst_o+8*8)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[2]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[5]
+
+	# ------------------
+	# fifth pass 45...47
+	# ------------------
+	 mov	(+8*4)($pA), $A
+
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 mov	$X[1], (+$pDst_o+8*9)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[2]
+	 adc	\$0, %rdx
+	 mov	$X[2], (+$pDst_o+8*10)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[5]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[1]
+
+	# ------------------
+	# sixth pass 56...57
+	# ------------------
+	 mov	(+8*5)($pA), $A
+
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 mov	$X[5], (+$pDst_o+8*11)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[1]
+	 adc	\$0, %rdx
+	 mov	$X[1], (+$pDst_o+8*12)($pDst)
+
+	 mov	%rdx, $X[2]
+
+	# ------------------
+	# seventh pass 67
+	# ------------------
+	 mov	$X[6], $A
+
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 mov	$X[2], (+$pDst_o+8*13)($pDst)
+
+	 mov	%rdx, (+$pDst_o+8*14)($pDst)
+
+	# start finalize (add	in squares, and double off-terms)
+	 mov	(+$pDst_o+8*1)($pDst), $X[0]
+	 mov	(+$pDst_o+8*2)($pDst), $X[1]
+	 mov	(+$pDst_o+8*3)($pDst), $X[2]
+	 mov	(+$pDst_o+8*4)($pDst), $X[3]
+	 mov	(+$pDst_o+8*5)($pDst), $X[4]
+	 mov	(+$pDst_o+8*6)($pDst), $X[5]
+
+	 mov	(+8*3)($pA), %rax
+	 mul	%rax
+	 mov	%rax, $x6
+	 mov	%rdx, $X[6]
+
+	 add	$X[0], $X[0]
+	 adc	$X[1], $X[1]
+	 adc	$X[2], $X[2]
+	 adc	$X[3], $X[3]
+	 adc	$X[4], $X[4]
+	 adc	$X[5], $X[5]
+	 adc	\$0, $X[6]
+
+	 mov	(+8*0)($pA), %rax
+	 mul	%rax
+	 mov	%rax, (+$pDst_o+8*0)($pDst)
+	 mov	%rdx, $A
+
+	 mov	(+8*1)($pA), %rax
+	 mul	%rax
+
+	 add	$A, $X[0]
+	 adc	%rax, $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $A
+	 mov	$X[0], (+$pDst_o+8*1)($pDst)
+	 mov	$X[1], (+$pDst_o+8*2)($pDst)
+
+	 mov	(+8*2)($pA), %rax
+	 mul	%rax
+
+	 add	$A, $X[2]
+	 adc	%rax, $X[3]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $A
+
+	 mov	$X[2], (+$pDst_o+8*3)($pDst)
+	 mov	$X[3], (+$pDst_o+8*4)($pDst)
+
+	 xor	$tmp, $tmp
+	 add	$A, $X[4]
+	 adc	$x6, $X[5]
+	 adc	\$0, $tmp
+
+	 mov	$X[4], (+$pDst_o+8*5)($pDst)
+	 mov	$X[5], (+$pDst_o+8*6)($pDst)
+
+	# %%tmp has 0/1 in column 7
+	# %%A6 has a full value in column 7
+
+	 mov	(+$pDst_o+8*7)($pDst), $X[0]
+	 mov	(+$pDst_o+8*8)($pDst), $X[1]
+	 mov	(+$pDst_o+8*9)($pDst), $X[2]
+	 mov	(+$pDst_o+8*10)($pDst), $X[3]
+	 mov	(+$pDst_o+8*11)($pDst), $X[4]
+	 mov	(+$pDst_o+8*12)($pDst), $X[5]
+	 mov	(+$pDst_o+8*13)($pDst), $x6
+	 mov	(+$pDst_o+8*14)($pDst), $x7
+
+	 mov	$X[7], %rax
+	 mul	%rax
+	 mov	%rax, $X[7]
+	 mov	%rdx, $A
+
+	 add	$X[0], $X[0]
+	 adc	$X[1], $X[1]
+	 adc	$X[2], $X[2]
+	 adc	$X[3], $X[3]
+	 adc	$X[4], $X[4]
+	 adc	$X[5], $X[5]
+	 adc	$x6, $x6
+	 adc	$x7, $x7
+	 adc	\$0, $A
+
+	 add	$tmp, $X[0]
+
+	 mov	(+8*4)($pA), %rax
+	 mul	%rax
+
+	 add	$X[6], $X[0]
+	 adc	%rax, $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $tmp
+
+	 mov	$X[0], (+$pDst_o+8*7)($pDst)
+	 mov	$X[1], (+$pDst_o+8*8)($pDst)
+
+	 mov	(+8*5)($pA), %rax
+	 mul	%rax
+
+	 add	$tmp, $X[2]
+	 adc	%rax, $X[3]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $tmp
+
+	 mov	$X[2], (+$pDst_o+8*9)($pDst)
+	 mov	$X[3], (+$pDst_o+8*10)($pDst)
+
+	 mov	(+8*6)($pA), %rax
+	 mul	%rax
+
+	 add	$tmp, $X[4]
+	 adc	%rax, $X[5]
+	 adc	\$0, %rdx
+
+	 mov	$X[4], (+$pDst_o+8*11)($pDst)
+	 mov	$X[5], (+$pDst_o+8*12)($pDst)
+
+	 add	%rdx, $x6
+	 adc	$X[7], $x7
+	 adc	\$0, $A
+
+	 mov	$x6, (+$pDst_o+8*13)($pDst)
+	 mov	$x7, (+$pDst_o+8*14)($pDst)
+	 mov	$A, (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
+#
+# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+#
+$code.=<<___;
+.type	sqr_reduce,\@abi-omnipotent
+.align	16
+sqr_reduce:
+	 mov	(+$pResult_offset+8)(%rsp), %rcx
+___
+	&SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
+$code.=<<___;
+	# tail recursion optimization: jmp to mont_reduce and return from there
+	 jmp	mont_reduce
+	# call	mont_reduce
+	# ret
+.size	sqr_reduce,.-sqr_reduce
+___
+}}}
+
+#
+# MAIN FUNCTION
+#
+
+#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
+#           UINT64 *g,   /* 512 bits, 8 qwords */
+#           UINT64 *exp, /* 512 bits, 8 qwords */
+#           struct mod_ctx_512 *data)
+
+# window size = 5
+# table size = 2^5 = 32
+#table_entries	equ	32
+#table_size	equ	table_entries * 8
+$code.=<<___;
+.globl	mod_exp_512
+.type	mod_exp_512,\@function,4
+mod_exp_512:
+	 push	%rbp
+	 push	%rbx
+	 push	%r12
+	 push	%r13
+	 push	%r14
+	 push	%r15
+
+	# adjust stack down and then align it with cache boundary
+	 mov	%rsp, %r8
+	 sub	\$$mem_size, %rsp
+	 and	\$-64, %rsp
+
+	# store previous stack pointer and arguments
+	 mov	%r8, (+$rsp_offset)(%rsp)
+	 mov	%rdi, (+$pResult_offset)(%rsp)
+	 mov	%rsi, (+$pG_offset)(%rsp)
+	 mov	%rcx, (+$pData_offset)(%rsp)
+.Lbody:
+	# transform g into montgomery space
+	# GT = reduce(g * C2) = reduce(g * (2^256))
+	# reduce expects to have the input in [tmp16]
+	 pxor	%xmm4, %xmm4
+	 movdqu	(+16*0)(%rsi), %xmm0
+	 movdqu	(+16*1)(%rsi), %xmm1
+	 movdqu	(+16*2)(%rsi), %xmm2
+	 movdqu	(+16*3)(%rsi), %xmm3
+	 movdqa	%xmm4, (+$tmp16_offset+16*0)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*1)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*6)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*7)(%rsp)
+	 movdqa	%xmm0, (+$tmp16_offset+16*2)(%rsp)
+	 movdqa	%xmm1, (+$tmp16_offset+16*3)(%rsp)
+	 movdqa	%xmm2, (+$tmp16_offset+16*4)(%rsp)
+	 movdqa	%xmm3, (+$tmp16_offset+16*5)(%rsp)
+
+	# load pExp before rdx gets blown away
+	 movdqu	(+16*0)(%rdx), %xmm0
+	 movdqu	(+16*1)(%rdx), %xmm1
+	 movdqu	(+16*2)(%rdx), %xmm2
+	 movdqu	(+16*3)(%rdx), %xmm3
+
+	 lea	(+$GT_offset)(%rsp), %rbx
+	 mov	%rbx, (+$red_result_addr_offset)(%rsp)
+	 call	mont_reduce
+
+	# Initialize tmp = C
+	 lea	(+$tmp_offset)(%rsp), %rcx
+	 xor	%rax, %rax
+	 mov	%rax, (+8*0)(%rcx)
+	 mov	%rax, (+8*1)(%rcx)
+	 mov	%rax, (+8*3)(%rcx)
+	 mov	%rax, (+8*4)(%rcx)
+	 mov	%rax, (+8*5)(%rcx)
+	 mov	%rax, (+8*6)(%rcx)
+	 mov	%rax, (+8*7)(%rcx)
+	 mov	%rax, (+$exp_offset+8*8)(%rsp)
+	 movq	\$1, (+8*2)(%rcx)
+
+	 lea	(+$garray_offset)(%rsp), %rbp
+	 mov	%rcx, %rsi			# pTmp
+	 mov	%rbp, %rdi			# Garray[][0]
+___
+
+	&swizzle("%rdi", "%rcx", "%rax", "%rbx");
+
+	# for (rax = 31; rax != 0; rax--) {
+	#     tmp = reduce(tmp * G)
+	#     swizzle(pg, tmp);
+	#     pg += 2; }
+$code.=<<___;
+	 mov	\$31, %rax
+	 mov	%rax, (+$i_offset)(%rsp)
+	 mov	%rbp, (+$pg_offset)(%rsp)
+	# rsi -> pTmp
+	 mov	%rsi, (+$red_result_addr_offset)(%rsp)
+	 mov	(+8*0)(%rsi), %r10
+	 mov	(+8*1)(%rsi), %r11
+	 mov	(+8*2)(%rsi), %r12
+	 mov	(+8*3)(%rsi), %r13
+	 mov	(+8*4)(%rsi), %r14
+	 mov	(+8*5)(%rsi), %r15
+	 mov	(+8*6)(%rsi), %r8
+	 mov	(+8*7)(%rsi), %r9
+init_loop:
+	 lea	(+$GT_offset)(%rsp), %rdi
+	 call	mont_mul_a3b
+	 lea	(+$tmp_offset)(%rsp), %rsi
+	 mov	(+$pg_offset)(%rsp), %rbp
+	 add	\$2, %rbp
+	 mov	%rbp, (+$pg_offset)(%rsp)
+	 mov	%rsi, %rcx			# rcx = rsi = addr of tmp
+___
+
+	&swizzle("%rbp", "%rcx", "%rax", "%rbx");
+$code.=<<___;
+	 mov	(+$i_offset)(%rsp), %rax
+	 sub	\$1, %rax
+	 mov	%rax, (+$i_offset)(%rsp)
+	 jne	init_loop
+
+	#
+	# Copy exponent onto stack
+	 movdqa	%xmm0, (+$exp_offset+16*0)(%rsp)
+	 movdqa	%xmm1, (+$exp_offset+16*1)(%rsp)
+	 movdqa	%xmm2, (+$exp_offset+16*2)(%rsp)
+	 movdqa	%xmm3, (+$exp_offset+16*3)(%rsp)
+
+
+	#
+	# Do exponentiation
+	# Initialize result to G[exp{511:507}]
+	 mov	(+$exp_offset+62)(%rsp), %eax
+	 mov	%rax, %rdx
+	 shr	\$11, %rax
+	 and	\$0x07FF, %edx
+	 mov	%edx, (+$exp_offset+62)(%rsp)
+	 lea	(+$garray_offset)(%rsp,%rax,2), %rsi
+	 mov	(+$pResult_offset)(%rsp), %rdx
+___
+
+	&unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+
+	#
+	# Loop variables
+	# rcx = [loop_idx] = index: 510-5 to 0 by 5
+$code.=<<___;
+	 movq	\$505, (+$loop_idx_offset)(%rsp)
+
+	 mov	(+$pResult_offset)(%rsp), %rcx
+	 mov	%rcx, (+$red_result_addr_offset)(%rsp)
+	 mov	(+8*0)(%rcx), %r10
+	 mov	(+8*1)(%rcx), %r11
+	 mov	(+8*2)(%rcx), %r12
+	 mov	(+8*3)(%rcx), %r13
+	 mov	(+8*4)(%rcx), %r14
+	 mov	(+8*5)(%rcx), %r15
+	 mov	(+8*6)(%rcx), %r8
+	 mov	(+8*7)(%rcx), %r9
+	 jmp	sqr_2
+
+main_loop_a3b:
+	 call	sqr_reduce
+	 call	sqr_reduce
+	 call	sqr_reduce
+sqr_2:
+	 call	sqr_reduce
+	 call	sqr_reduce
+
+	#
+	# Do multiply, first look up proper value in Garray
+	 mov	(+$loop_idx_offset)(%rsp), %rcx			# bit index
+	 mov	%rcx, %rax
+	 shr	\$4, %rax			# rax is word pointer
+	 mov	(+$exp_offset)(%rsp,%rax,2), %edx
+	 and	\$15, %rcx
+	 shrq	%cl, %rdx
+	 and	\$0x1F, %rdx
+
+	 lea	(+$garray_offset)(%rsp,%rdx,2), %rsi
+	 lea	(+$tmp_offset)(%rsp), %rdx
+	 mov	%rdx, %rdi
+___
+
+	&unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+	# rdi = tmp = pG
+
+	#
+	# Call mod_mul_a1(pDst,  pSrc1, pSrc2, pM, pData)
+	#                 result result pG     M   Data
+$code.=<<___;
+	 mov	(+$pResult_offset)(%rsp), %rsi
+	 call	mont_mul_a3b
+
+	#
+	# finish loop
+	 mov	(+$loop_idx_offset)(%rsp), %rcx
+	 sub	\$5, %rcx
+	 mov	%rcx, (+$loop_idx_offset)(%rsp)
+	 jge	main_loop_a3b
+
+	#
+
+end_main_loop_a3b:
+	# transform result out of Montgomery space
+	# result = reduce(result)
+	 mov	(+$pResult_offset)(%rsp), %rdx
+	 pxor	%xmm4, %xmm4
+	 movdqu	(+16*0)(%rdx), %xmm0
+	 movdqu	(+16*1)(%rdx), %xmm1
+	 movdqu	(+16*2)(%rdx), %xmm2
+	 movdqu	(+16*3)(%rdx), %xmm3
+	 movdqa	%xmm4, (+$tmp16_offset+16*4)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*5)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*6)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*7)(%rsp)
+	 movdqa	%xmm0, (+$tmp16_offset+16*0)(%rsp)
+	 movdqa	%xmm1, (+$tmp16_offset+16*1)(%rsp)
+	 movdqa	%xmm2, (+$tmp16_offset+16*2)(%rsp)
+	 movdqa	%xmm3, (+$tmp16_offset+16*3)(%rsp)
+	 call	mont_reduce
+
+	# If result > m, subract m
+	# load result into r15:r8
+	 mov	(+$pResult_offset)(%rsp), %rax
+	 mov	(+8*0)(%rax), %r8
+	 mov	(+8*1)(%rax), %r9
+	 mov	(+8*2)(%rax), %r10
+	 mov	(+8*3)(%rax), %r11
+	 mov	(+8*4)(%rax), %r12
+	 mov	(+8*5)(%rax), %r13
+	 mov	(+8*6)(%rax), %r14
+	 mov	(+8*7)(%rax), %r15
+
+	# subtract m
+	 mov	(+$pData_offset)(%rsp), %rbx
+	 add	\$$M, %rbx
+
+	 sub	(+8*0)(%rbx), %r8
+	 sbb	(+8*1)(%rbx), %r9
+	 sbb	(+8*2)(%rbx), %r10
+	 sbb	(+8*3)(%rbx), %r11
+	 sbb	(+8*4)(%rbx), %r12
+	 sbb	(+8*5)(%rbx), %r13
+	 sbb	(+8*6)(%rbx), %r14
+	 sbb	(+8*7)(%rbx), %r15
+
+	# if Carry is clear, replace result with difference
+	 mov	(+8*0)(%rax), %rsi
+	 mov	(+8*1)(%rax), %rdi
+	 mov	(+8*2)(%rax), %rcx
+	 mov	(+8*3)(%rax), %rdx
+	 cmovnc	%r8, %rsi
+	 cmovnc	%r9, %rdi
+	 cmovnc	%r10, %rcx
+	 cmovnc	%r11, %rdx
+	 mov	%rsi, (+8*0)(%rax)
+	 mov	%rdi, (+8*1)(%rax)
+	 mov	%rcx, (+8*2)(%rax)
+	 mov	%rdx, (+8*3)(%rax)
+
+	 mov	(+8*4)(%rax), %rsi
+	 mov	(+8*5)(%rax), %rdi
+	 mov	(+8*6)(%rax), %rcx
+	 mov	(+8*7)(%rax), %rdx
+	 cmovnc	%r12, %rsi
+	 cmovnc	%r13, %rdi
+	 cmovnc	%r14, %rcx
+	 cmovnc	%r15, %rdx
+	 mov	%rsi, (+8*4)(%rax)
+	 mov	%rdi, (+8*5)(%rax)
+	 mov	%rcx, (+8*6)(%rax)
+	 mov	%rdx, (+8*7)(%rax)
+
+	 mov	(+$rsp_offset)(%rsp), %rsi
+	 mov	0(%rsi),%r15
+	 mov	8(%rsi),%r14
+	 mov	16(%rsi),%r13
+	 mov	24(%rsi),%r12
+	 mov	32(%rsi),%rbx
+	 mov	40(%rsi),%rbp
+	 lea	48(%rsi),%rsp
+.Lepilogue:
+	 ret
+.size mod_exp_512, . - mod_exp_512
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	mod_exp_512_se_handler,\@abi-omnipotent
+.align	16
+mod_exp_512_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lbody(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+
+	mov	$rsp_offset(%rax),%rax	# pull saved Rsp
+
+	mov	32(%rax),%rbx
+	mov	40(%rax),%rbp
+	mov	24(%rax),%r12
+	mov	16(%rax),%r13
+	mov	8(%rax),%r14
+	mov	0(%rax),%r15
+	lea	48(%rax),%rax
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	mod_exp_512_se_handler,.-mod_exp_512_se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_mod_exp_512
+	.rva	.LSEH_end_mod_exp_512
+	.rva	.LSEH_info_mod_exp_512
+
+.section	.xdata
+.align	8
+.LSEH_info_mod_exp_512:
+	.byte	9,0,0,0
+	.rva	mod_exp_512_se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+    if ($reg =~ /%r[0-9]+/)	{ $reg .= $conv; }
+    elsif ($conv eq "b")	{ $reg =~ s/%[er]([^x]+)x?/%$1l/;	}
+    elsif ($conv eq "w")	{ $reg =~ s/%[er](.+)/%$1/;		}
+    elsif ($conv eq "d")	{ $reg =~ s/%[er](.+)/%e$1/;		}
+    return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/(\(\+[^)]+\))/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/parisc-mont.pl b/src/lib/libcrypto/bn/asm/parisc-mont.pl
new file mode 100644
index 0000000000..4a766a87fb
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/parisc-mont.pl
@@ -0,0 +1,993 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# On PA-7100LC this module performs ~90-50% better, less for longer
+# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
+# that compiler utilized xmpyu instruction to perform 32x32=64-bit
+# multiplication, which in turn means that "baseline" performance was
+# optimal in respect to instruction set capabilities. Fair comparison
+# with vendor compiler is problematic, because OpenSSL doesn't define
+# BN_LLONG [presumably] for historical reasons, which drives compiler
+# toward 4 times 16x16=32-bit multiplicatons [plus complementary
+# shifts and additions] instead. This means that you should observe
+# several times improvement over code generated by vendor compiler
+# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
+# improvement coefficient was never collected on PA-7100LC, or any
+# other 1.1 CPU, because I don't have access to such machine with
+# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
+# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
+# of ~5x on PA-8600.
+#
+# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
+# reportedly ~2x faster than vendor compiler generated code [according
+# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
+# this implementation is actually 32-bit one, in the sense that it
+# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
+# 64-bit BN_LONGs... How do they interoperate then? No problem. This
+# module picks halves of 64-bit values in reverse order and pretends
+# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
+# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
+# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
+# i.e. there is no "wider" multiplication like on most other 64-bit
+# platforms. This means that even being effectively 32-bit, this
+# implementation performs "64-bit" computational task in same amount
+# of arithmetic operations, most notably multiplications. It requires
+# more memory references, most notably to tp[num], but this doesn't
+# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
+# 2.0 code path, provides virtually same performance as pa-risc2[W].s:
+# it's ~10% better for shortest key length and ~10% worse for longest
+# one.
+#
+# In case it wasn't clear. The module has two distinct code paths:
+# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
+# additions and 64-bit integer loads, not to mention specific
+# instruction scheduling. In 64-bit build naturally only 2.0 code path
+# is assembled. In 32-bit application context both code paths are
+# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
+# is taken automatically. Also, in 32-bit build the module imposes
+# couple of limitations: vector lengths has to be even and vector
+# addresses has to be 64-bit aligned. Normally neither is a problem:
+# most common key lengths are even and vectors are commonly malloc-ed,
+# which ensures alignment.
+#
+# Special thanks to polarhome.com for providing HP-UX account on
+# PA-RISC 1.1 machine, and to correspondent who chose to remain
+# anonymous for testing the code on PA-RISC 2.0 machine.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+	$BN_SZ		=$SIZE_T;
+} else {
+	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+	$BN_SZ		=$SIZE_T;
+	if (open CONF,"<${dir}../../opensslconf.h") {
+	    while(<CONF>) {
+		if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
+		    $BN_SZ=8;
+		    $LEVEL="2.0";
+		    last;
+		}
+	    }
+	    close CONF;
+	}
+}
+
+$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
+				#                [+ argument transfer]
+$LOCALS=$FRAME-$FRAME_MARKER;
+$FRAME+=32;			# local variables
+
+$tp="%r31";
+$ti1="%r29";
+$ti0="%r28";
+
+$rp="%r26";
+$ap="%r25";
+$bp="%r24";
+$np="%r23";
+$n0="%r22";	# passed through stack in 32-bit
+$num="%r21";	# passed through stack in 32-bit
+$idx="%r20";
+$arrsz="%r19";
+
+$nm1="%r7";
+$nm0="%r6";
+$ab1="%r5";
+$ab0="%r4";
+
+$fp="%r3";
+$hi1="%r2";
+$hi0="%r1";
+
+$xfer=$n0;	# accomodates [-16..15] offset in fld[dw]s
+
+$fm0="%fr4";	$fti=$fm0;
+$fbi="%fr5L";
+$fn0="%fr5R";
+$fai="%fr6";	$fab0="%fr7";	$fab1="%fr8";
+$fni="%fr9";	$fnm0="%fr10";	$fnm1="%fr11";
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+	.ALIGN	64
+bn_mul_mont
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)		; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	ldo	-$FRAME(%sp),$fp
+___
+$code.=<<___ if ($SIZE_T==4);
+	ldw	`-$FRAME_MARKER-4`($fp),$n0
+	ldw	`-$FRAME_MARKER-8`($fp),$num
+	nop
+	nop					; alignment
+___
+$code.=<<___ if ($BN_SZ==4);
+	comiclr,<=	6,$num,%r0		; are vectors long enough?
+	b		L\$abort
+	ldi		0,%r28			; signal "unhandled"
+	add,ev		%r0,$num,$num		; is $num even?
+	b		L\$abort
+	nop
+	or		$ap,$np,$ti1
+	extru,=		$ti1,31,3,%r0		; are ap and np 64-bit aligned?
+	b		L\$abort
+	nop
+	nop					; alignment
+	nop
+
+	fldws		0($n0),${fn0}
+	fldws,ma	4($bp),${fbi}		; bp[0]
+___
+$code.=<<___ if ($BN_SZ==8);
+	comib,>		3,$num,L\$abort		; are vectors long enough?
+	ldi		0,%r28			; signal "unhandled"
+	addl		$num,$num,$num		; I operate on 32-bit values
+
+	fldws		4($n0),${fn0}		; only low part of n0
+	fldws		4($bp),${fbi}		; bp[0] in flipped word order
+___
+$code.=<<___;
+	fldds		0($ap),${fai}		; ap[0,1]
+	fldds		0($np),${fni}		; np[0,1]
+
+	sh2addl		$num,%r0,$arrsz
+	ldi		31,$hi0
+	ldo		36($arrsz),$hi1		; space for tp[num+1]
+	andcm		$hi1,$hi0,$hi1		; align
+	addl		$hi1,%sp,%sp
+	$PUSH		$fp,-$SIZE_T(%sp)
+
+	ldo		`$LOCALS+16`($fp),$xfer
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
+	xmpyu		${fn0},${fab0}R,${fm0}
+
+	addl		$arrsz,$ap,$ap		; point at the end
+	addl		$arrsz,$np,$np
+	subi		0,$arrsz,$idx		; j=0
+	ldo		8($idx),$idx		; j++++
+
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+	fstds		${fab1},0($xfer)
+	fstds		${fnm1},8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2,3]
+	 flddx		$idx($np),${fni}	; np[2,3]
+___
+$code.=<<___ if ($BN_SZ==4);
+	mtctl		$hi0,%cr11		; $hi0 still holds 31
+	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
+	b		L\$parisc11
+	nop
+___
+$code.=<<___;					# PA-RISC 2.0 code-path
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+
+	extrd,u		$ab0,31,32,$hi0
+	extrd,u		$ab0,63,32,$ab0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 ldo		8($idx),$idx		; j++++
+	 addl		$ab0,$nm0,$nm0		; low part is discarded
+	 extrd,u	$nm0,31,32,$hi1
+
+L\$1st
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,63,32,$ab1
+	 addl		$hi1,$nm1,$nm1
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 addl		$ab1,$nm1,$nm1
+	 extrd,u	$nm1,31,32,$hi1
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+	 addl		$hi0,$ab0,$ab0
+	 extrd,u	$ab0,31,32,$hi0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	stw		$nm1,-4($tp)		; tp[j-1]
+	 addl		$ab0,$nm0,$nm0
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$1st		; j++++
+	 extrd,u	$nm0,31,32,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,63,32,$ab1
+	 addl		$hi1,$nm1,$nm1
+	ldd		-16($xfer),$ab0
+	 addl		$ab1,$nm1,$nm1
+	ldd		-8($xfer),$nm0
+	 extrd,u	$nm1,31,32,$hi1
+
+	 addl		$hi0,$ab0,$ab0
+	 extrd,u	$ab0,31,32,$hi0
+	stw		$nm1,-4($tp)		; tp[j-1]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	ldd		0($xfer),$ab1
+	 addl		$ab0,$nm0,$nm0
+	ldd,mb		8($xfer),$nm1
+	 extrd,u	$nm0,31,32,$hi1
+	stw,ma		$nm0,8($tp)		; tp[j-1]
+
+	ldo		-1($num),$num		; i--
+	subi		0,$arrsz,$idx		; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+	fldws,ma	4($bp),${fbi}		; bp[1]
+___
+$code.=<<___ if ($BN_SZ==8);
+	fldws		0($bp),${fbi}		; bp[1] in flipped word order
+___
+$code.=<<___;
+	 flddx		$idx($ap),${fai}	; ap[0,1]
+	 flddx		$idx($np),${fni}	; np[0,1]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	addl		$hi0,$ab1,$ab1
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
+	addl		$hi1,$nm1,$nm1
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	addl		$hi1,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	xmpyu		${fn0},${fab0}R,${fm0}
+	ldo		`$LOCALS+32+4`($fp),$tp
+L\$outer
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)	; 33-bit value
+	fstds		${fnm0},-8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2]
+	 flddx		$idx($np),${fni}	; np[2]
+	 ldo		8($idx),$idx		; j++++
+	ldd		-16($xfer),$ab0		; 33-bit value
+	ldd		-8($xfer),$nm0
+	ldw		0($xfer),$hi0		; high part
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	 extrd,u	$ab0,31,32,$ti0		; carry bit
+	 extrd,u	$ab0,63,32,$ab0
+	fstds		${fab1},0($xfer)
+	 addl		$ti0,$hi0,$hi0		; account carry bit
+	fstds		${fnm1},8($xfer)
+	 addl		$ab0,$nm0,$nm0		; low part is discarded
+	ldw		0($tp),$ti1		; tp[1]
+	 extrd,u	$nm0,31,32,$hi1
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+
+L\$inner
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ti1,$ti1
+	 addl		$ti1,$ab1,$ab1
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 addl		$hi1,$nm1,$nm1
+	 addl		$ab1,$nm1,$nm1
+	ldw		4($tp),$ti0		; tp[j]
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldd		-16($xfer),$ab0
+	fstds		${fab0},-16($xfer)
+	 addl		$hi0,$ti0,$ti0
+	 addl		$ti0,$ab0,$ab0
+	ldd		-8($xfer),$nm0
+	fstds		${fnm0},-8($xfer)
+	 extrd,u	$ab0,31,32,$hi0
+	 extrd,u	$nm1,31,32,$hi1
+	ldw		8($tp),$ti1		; tp[j]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	 addl		$ab0,$nm0,$nm0
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$inner		; j++++
+	 extrd,u	$nm0,31,32,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldd		0($xfer),$ab1
+	fstds		${fab1},0($xfer)
+	 addl		$hi0,$ti1,$ti1
+	 addl		$ti1,$ab1,$ab1
+	ldd		8($xfer),$nm1
+	fstds		${fnm1},8($xfer)
+	 extrd,u	$ab1,31,32,$hi0
+	 extrd,u	$ab1,63,32,$ab1
+	ldw		4($tp),$ti0		; tp[j]
+	 addl		$hi1,$nm1,$nm1
+	 addl		$ab1,$nm1,$nm1
+	ldd		-16($xfer),$ab0
+	ldd		-8($xfer),$nm0
+	 extrd,u	$nm1,31,32,$hi1
+
+	addl		$hi0,$ab0,$ab0
+	 addl		$ti0,$ab0,$ab0
+	 stw		$nm1,-4($tp)		; tp[j-1]
+	 extrd,u	$ab0,31,32,$hi0
+	ldw		8($tp),$ti1		; tp[j]
+	 extrd,u	$ab0,63,32,$ab0
+	 addl		$hi1,$nm0,$nm0
+	ldd		0($xfer),$ab1
+	 addl		$ab0,$nm0,$nm0
+	ldd,mb		8($xfer),$nm1
+	 extrd,u	$nm0,31,32,$hi1
+	 stw,ma		$nm0,8($tp)		; tp[j-1]
+
+	addib,=		-1,$num,L\$outerdone	; i--
+	subi		0,$arrsz,$idx		; j=0
+___
+$code.=<<___ if ($BN_SZ==4);
+	fldws,ma	4($bp),${fbi}		; bp[i]
+___
+$code.=<<___ if ($BN_SZ==8);
+	ldi		12,$ti0			; bp[i] in flipped word order
+	addl,ev		%r0,$num,$num
+	ldi		-4,$ti0
+	addl		$ti0,$bp,$bp
+	fldws		0($bp),${fbi}
+___
+$code.=<<___;
+	 flddx		$idx($ap),${fai}	; ap[0]
+	addl		$hi0,$ab1,$ab1
+	 flddx		$idx($np),${fni}	; np[0]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	addl		$ti1,$ab1,$ab1
+	extrd,u		$ab1,31,32,$hi0
+	extrd,u		$ab1,63,32,$ab1
+
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
+	ldw		4($tp),$ti0		; tp[j]
+
+	addl		$hi1,$nm1,$nm1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	addl		$hi1,$hi0,$hi0
+	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	addl		$ti0,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+	 xmpyu		${fn0},${fab0}R,${fm0}
+
+	b		L\$outer
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+L\$outerdone
+	addl		$hi0,$ab1,$ab1
+	addl		$ti1,$ab1,$ab1
+	extrd,u		$ab1,31,32,$hi0
+	extrd,u		$ab1,63,32,$ab1
+
+	ldw		4($tp),$ti0		; tp[j]
+
+	addl		$hi1,$nm1,$nm1
+	addl		$ab1,$nm1,$nm1
+	extrd,u		$nm1,31,32,$hi1
+	stw		$nm1,-4($tp)		; tp[j-1]
+
+	addl		$hi1,$hi0,$hi0
+	addl		$ti0,$hi0,$hi0
+	extrd,u		$hi0,31,32,$hi1
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	ldo		`$LOCALS+32`($fp),$tp
+	sub		%r0,%r0,%r0		; clear borrow
+___
+$code.=<<___ if ($BN_SZ==4);
+	ldws,ma		4($tp),$ti0
+	extru,=		$rp,31,3,%r0		; is rp 64-bit aligned?
+	b		L\$sub_pa11
+	addl		$tp,$arrsz,$tp
+L\$sub
+	ldwx		$idx($np),$hi0
+	subb		$ti0,$hi0,$hi1
+	ldwx		$idx($tp),$ti0
+	addib,<>	4,$idx,L\$sub
+	stws,ma		$hi1,4($rp)
+
+	subb		$ti0,%r0,$hi1
+	ldo		-4($tp),$tp
+___
+$code.=<<___ if ($BN_SZ==8);
+	ldd,ma		8($tp),$ti0
+L\$sub
+	ldd		$idx($np),$hi0
+	shrpd		$ti0,$ti0,32,$ti0	; flip word order
+	std		$ti0,-8($tp)		; save flipped value
+	sub,db		$ti0,$hi0,$hi1
+	ldd,ma		8($tp),$ti0
+	addib,<>	8,$idx,L\$sub
+	std,ma		$hi1,8($rp)
+
+	extrd,u		$ti0,31,32,$ti0		; carry in flipped word order
+	sub,db		$ti0,%r0,$hi1
+	ldo		-8($tp),$tp
+___
+$code.=<<___;
+	and		$tp,$hi1,$ap
+	andcm		$rp,$hi1,$bp
+	or		$ap,$bp,$np
+
+	sub		$rp,$arrsz,$rp		; rewind rp
+	subi		0,$arrsz,$idx
+	ldo		`$LOCALS+32`($fp),$tp
+L\$copy
+	ldd		$idx($np),$hi0
+	std,ma		%r0,8($tp)
+	addib,<>	8,$idx,.-8		; L\$copy
+	std,ma		$hi0,8($rp)	
+___
+
+if ($BN_SZ==4) {				# PA-RISC 1.1 code-path
+$ablo=$ab0;
+$abhi=$ab1;
+$nmlo0=$nm0;
+$nmhi0=$nm1;
+$nmlo1="%r9";
+$nmhi1="%r8";
+
+$code.=<<___;
+	b		L\$done
+	nop
+
+	.ALIGN		8
+L\$parisc11
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-12($xfer),$ablo
+	ldw		-16($xfer),$hi0
+	ldw		-4($xfer),$nmlo0
+	ldw		-8($xfer),$nmhi0
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+
+	 ldo		8($idx),$idx		; j++++
+	 add		$ablo,$nmlo0,$nmlo0	; discarded
+	 addc		%r0,$nmhi0,$hi1
+	ldw		4($xfer),$ablo
+	ldw		0($xfer),$abhi
+	nop
+
+L\$1st_pa11
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 add		$hi0,$ablo,$ablo
+	ldw		12($xfer),$nmlo1
+	 addc		%r0,$abhi,$hi0
+	ldw		8($xfer),$nmhi1
+	 add		$ablo,$nmlo1,$nmlo1
+	fstds		${fab1},0($xfer)
+	 addc		%r0,$nmhi1,$nmhi1
+	fstds		${fnm1},8($xfer)
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-12($xfer),$ablo
+	 addc		%r0,$nmhi1,$hi1
+	ldw		-16($xfer),$abhi
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
+	ldw		-4($xfer),$nmlo0
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-8($xfer),$nmhi0
+	 add		$hi0,$ablo,$ablo
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 addc		%r0,$abhi,$hi0
+	fstds		${fab0},-16($xfer)
+	 add		$ablo,$nmlo0,$nmlo0
+	fstds		${fnm0},-8($xfer)
+	 addc		%r0,$nmhi0,$nmhi0
+	ldw		0($xfer),$abhi
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$1st_pa11	; j++++
+	 addc		%r0,$nmhi0,$hi1
+
+	 ldw		8($xfer),$nmhi1
+	 ldw		12($xfer),$nmlo1
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	 add		$hi0,$ablo,$ablo
+	fstds		${fab1},0($xfer)
+	 addc		%r0,$abhi,$hi0
+	fstds		${fnm1},8($xfer)
+	 add		$ablo,$nmlo1,$nmlo1
+	ldw		-16($xfer),$abhi
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-12($xfer),$ablo
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-8($xfer),$nmhi0
+	 addc		%r0,$nmhi1,$hi1
+	ldw		-4($xfer),$nmlo0
+
+	 add		$hi0,$ablo,$ablo
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 addc		%r0,$abhi,$hi0
+	ldw		0($xfer),$abhi
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldws,mb		8($xfer),$nmhi1
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$nmlo1
+	 addc		%r0,$nmhi0,$hi1
+	stws,ma		$nmlo0,8($tp)		; tp[j-1]
+
+	ldo		-1($num),$num		; i--
+	subi		0,$arrsz,$idx		; j=0
+
+	 fldws,ma	4($bp),${fbi}		; bp[1]
+	 flddx		$idx($ap),${fai}	; ap[0,1]
+	 flddx		$idx($np),${fni}	; np[0,1]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	xmpyu		${fn0},${fab0}R,${fm0}
+	ldo		`$LOCALS+32+4`($fp),$tp
+L\$outer_pa11
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
+	fstds		${fab0},-16($xfer)	; 33-bit value
+	fstds		${fnm0},-8($xfer)
+	 flddx		$idx($ap),${fai}	; ap[2,3]
+	 flddx		$idx($np),${fni}	; np[2,3]
+	ldw		-16($xfer),$abhi	; carry bit actually
+	 ldo		8($idx),$idx		; j++++
+	ldw		-12($xfer),$ablo
+	ldw		-8($xfer),$nmhi0
+	ldw		-4($xfer),$nmlo0
+	ldw		0($xfer),$hi0		; high part
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	fstds		${fab1},0($xfer)
+	 addl		$abhi,$hi0,$hi0		; account carry bit
+	fstds		${fnm1},8($xfer)
+	 add		$ablo,$nmlo0,$nmlo0	; discarded
+	ldw		0($tp),$ti1		; tp[1]
+	 addc		%r0,$nmhi0,$hi1
+	fstds		${fab0},-16($xfer)
+	fstds		${fnm0},-8($xfer)
+	ldw		4($xfer),$ablo
+	ldw		0($xfer),$abhi
+
+L\$inner_pa11
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
+	flddx		$idx($ap),${fai}	; ap[j,j+1]
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
+	flddx		$idx($np),${fni}	; np[j,j+1]
+	 add		$hi0,$ablo,$ablo
+	ldw		4($tp),$ti0		; tp[j]
+	 addc		%r0,$abhi,$abhi
+	ldw		12($xfer),$nmlo1
+	 add		$ti1,$ablo,$ablo
+	ldw		8($xfer),$nmhi1
+	 addc		%r0,$abhi,$hi0
+	fstds		${fab1},0($xfer)
+	 add		$ablo,$nmlo1,$nmlo1
+	fstds		${fnm1},8($xfer)
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-12($xfer),$ablo
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-16($xfer),$abhi
+	 addc		%r0,$nmhi1,$hi1
+
+	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
+	ldw		8($tp),$ti1		; tp[j]
+	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
+	ldw		-4($xfer),$nmlo0
+	 add		$hi0,$ablo,$ablo
+	ldw		-8($xfer),$nmhi0
+	 addc		%r0,$abhi,$abhi
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+	 add		$ti0,$ablo,$ablo
+	fstds		${fab0},-16($xfer)
+	 addc		%r0,$abhi,$hi0
+	fstds		${fnm0},-8($xfer)
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldw		0($xfer),$abhi
+	 add		$hi1,$nmlo0,$nmlo0
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+	addib,<>	8,$idx,L\$inner_pa11	; j++++
+	 addc		%r0,$nmhi0,$hi1
+
+	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
+	ldw		12($xfer),$nmlo1
+	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
+	ldw		8($xfer),$nmhi1
+	 add		$hi0,$ablo,$ablo
+	ldw		4($tp),$ti0		; tp[j]
+	 addc		%r0,$abhi,$abhi
+	fstds		${fab1},0($xfer)
+	 add		$ti1,$ablo,$ablo
+	fstds		${fnm1},8($xfer)
+	 addc		%r0,$abhi,$hi0
+	ldw		-16($xfer),$abhi
+	 add		$ablo,$nmlo1,$nmlo1
+	ldw		-12($xfer),$ablo
+	 addc		%r0,$nmhi1,$nmhi1
+	ldw		-8($xfer),$nmhi0
+	 add		$hi1,$nmlo1,$nmlo1
+	ldw		-4($xfer),$nmlo0
+	 addc		%r0,$nmhi1,$hi1
+
+	add		$hi0,$ablo,$ablo
+	 stw		$nmlo1,-4($tp)		; tp[j-1]
+	addc		%r0,$abhi,$abhi
+	 add		$ti0,$ablo,$ablo
+	ldw		8($tp),$ti1		; tp[j]
+	 addc		%r0,$abhi,$hi0
+	ldw		0($xfer),$abhi
+	 add		$ablo,$nmlo0,$nmlo0
+	ldw		4($xfer),$ablo
+	 addc		%r0,$nmhi0,$nmhi0
+	ldws,mb		8($xfer),$nmhi1
+	 add		$hi1,$nmlo0,$nmlo0
+	ldw		4($xfer),$nmlo1
+	 addc		%r0,$nmhi0,$hi1
+	 stws,ma	$nmlo0,8($tp)		; tp[j-1]
+
+	addib,=		-1,$num,L\$outerdone_pa11; i--
+	subi		0,$arrsz,$idx		; j=0
+
+	 fldws,ma	4($bp),${fbi}		; bp[i]
+	 flddx		$idx($ap),${fai}	; ap[0]
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$abhi
+	 flddx		$idx($np),${fni}	; np[0]
+	 fldws		8($xfer),${fti}R	; tp[0]
+	add		$ti1,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+
+	 ldo		8($idx),$idx		; j++++
+	 xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
+	 xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
+	ldw		4($tp),$ti0		; tp[j]
+
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	 fstws,mb	${fab0}L,-8($xfer)	; save high part
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	 fcpy,sgl	%fr0,${fti}L		; zero high part
+	 fcpy,sgl	%fr0,${fab0}L
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	 fcnvxf,dbl,dbl	${fti},${fti}		; 32-bit unsigned int -> double
+	 fcnvxf,dbl,dbl	${fab0},${fab0}
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	 fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
+	add		$ti0,$hi0,$hi0
+	addc		%r0,$hi1,$hi1
+	 fcnvfx,dbl,dbl	${fab0},${fab0}		; double -> 33-bit unsigned int
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+	 xmpyu		${fn0},${fab0}R,${fm0}
+
+	b		L\$outer_pa11
+	ldo		`$LOCALS+32+4`($fp),$tp
+
+L\$outerdone_pa11
+	add		$hi0,$ablo,$ablo
+	addc		%r0,$abhi,$abhi
+	add		$ti1,$ablo,$ablo
+	addc		%r0,$abhi,$hi0
+
+	ldw		4($tp),$ti0		; tp[j]
+
+	add		$hi1,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$nmhi1
+	add		$ablo,$nmlo1,$nmlo1
+	addc		%r0,$nmhi1,$hi1
+	stw		$nmlo1,-4($tp)		; tp[j-1]
+
+	add		$hi1,$hi0,$hi0
+	addc		%r0,%r0,$hi1
+	add		$ti0,$hi0,$hi0
+	addc		%r0,$hi1,$hi1
+	stw		$hi0,0($tp)
+	stw		$hi1,4($tp)
+
+	ldo		`$LOCALS+32+4`($fp),$tp
+	sub		%r0,%r0,%r0		; clear borrow
+	ldw		-4($tp),$ti0
+	addl		$tp,$arrsz,$tp
+L\$sub_pa11
+	ldwx		$idx($np),$hi0
+	subb		$ti0,$hi0,$hi1
+	ldwx		$idx($tp),$ti0
+	addib,<>	4,$idx,L\$sub_pa11
+	stws,ma		$hi1,4($rp)
+
+	subb		$ti0,%r0,$hi1
+	ldo		-4($tp),$tp
+	and		$tp,$hi1,$ap
+	andcm		$rp,$hi1,$bp
+	or		$ap,$bp,$np
+
+	sub		$rp,$arrsz,$rp		; rewind rp
+	subi		0,$arrsz,$idx
+	ldo		`$LOCALS+32`($fp),$tp
+L\$copy_pa11
+	ldwx		$idx($np),$hi0
+	stws,ma		%r0,4($tp)
+	addib,<>	4,$idx,L\$copy_pa11
+	stws,ma		$hi0,4($rp)	
+
+	nop					; alignment
+L\$done
+___
+}
+
+$code.=<<___;
+	ldi		1,%r28			; signal "handled"
+	ldo		$FRAME($fp),%sp		; destroy tp[num+1]
+
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+L\$abort
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+my $ldd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "ldd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
+    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
+    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
+	$opcode|=(1<<5)  if ($mod =~ /^,m/);
+	$opcode|=(1<<13) if ($mod =~ /^,mb/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $std = sub {
+  my ($mod,$args) = @_;
+  my $orig = "std$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)	# format 6
+    {	my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
+	$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4);			# encode offset
+	$opcode|=(1<<5)  if ($mod =~ /^,m/);
+	$opcode|=(1<<13) if ($mod =~ /^,mb/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $extrd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "extrd$mod\t$args";
+
+    # I only have ",u" completer, it's implicitly encoded...
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
+    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+	my $len=32-$3;
+	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
+	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
+    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+	my $len=32-$2;
+	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
+	$opcode |= (1<<13) if ($mod =~ /,\**=/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "shrpd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
+    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+	my $cpos=63-$3;
+	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $sub = sub {
+  my ($mod,$args) = @_;
+  my $orig = "sub$mod\t$args";
+
+    if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
+	my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
+	$opcode|=(1<<10);	# e1
+	$opcode|=(1<<8);	# e2
+	$opcode|=(1<<5);	# d
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
+    }
+    else { "\t".$orig; }
+};
+
+sub assemble {
+  my ($mnemonic,$mod,$args)=@_;
+  my $opcode = eval("\$$mnemonic");
+
+    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+	# flip word order in 64-bit mode...
+	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
+	# assemble 2.0 instructions in 32-bit mode...
+	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
+
+	print $_,"\n";
+}
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl
index 7849eae959..f9b6992ccc 100644
--- a/src/lib/libcrypto/bn/asm/ppc-mont.pl
+++ b/src/lib/libcrypto/bn/asm/ppc-mont.pl
@@ -31,7 +31,6 @@ if ($flavour =~ /32/) {
 	$BNSZ=	$BITS/8;
 	$SIZE_T=4;
 	$RZONE=	224;
-	$FRAME=	$SIZE_T*16;
 
 	$LD=	"lwz";		# load
 	$LDU=	"lwzu";		# load and update
@@ -51,7 +50,6 @@ if ($flavour =~ /32/) {
 	$BNSZ=	$BITS/8;
 	$SIZE_T=8;
 	$RZONE=	288;
-	$FRAME=	$SIZE_T*16;
 
 	# same as above, but 64-bit mnemonics...
 	$LD=	"ld";		# load
@@ -69,6 +67,9 @@ if ($flavour =~ /32/) {
 	$POP=	$LD;
 } else { die "nonsense $flavour"; }
 
+$FRAME=8*$SIZE_T+$RZONE;
+$LOCALS=8*$SIZE_T;
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@@ -89,18 +90,18 @@ $aj="r10";
 $nj="r11";
 $tj="r12";
 # non-volatile registers
-$i="r14";
-$j="r15";
-$tp="r16";
-$m0="r17";
-$m1="r18";
-$lo0="r19";
-$hi0="r20";
-$lo1="r21";
-$hi1="r22";
-$alo="r23";
-$ahi="r24";
-$nlo="r25";
+$i="r20";
+$j="r21";
+$tp="r22";
+$m0="r23";
+$m1="r24";
+$lo0="r25";
+$hi0="r26";
+$lo1="r27";
+$hi1="r28";
+$alo="r29";
+$ahi="r30";
+$nlo="r31";
 #
 $nhi="r0";
 
@@ -108,42 +109,48 @@ $code=<<___;
 .machine "any"
 .text
 
-.globl	.bn_mul_mont
+.globl	.bn_mul_mont_int
 .align	4
-.bn_mul_mont:
+.bn_mul_mont_int:
 	cmpwi	$num,4
 	mr	$rp,r3		; $rp is reassigned
 	li	r3,0
 	bltlr
-
+___
+$code.=<<___ if ($BNSZ==4);
+	cmpwi	$num,32		; longer key performance is not better
+	bgelr
+___
+$code.=<<___;
 	slwi	$num,$num,`log($BNSZ)/log(2)`
 	li	$tj,-4096
-	addi	$ovf,$num,`$FRAME+$RZONE`
+	addi	$ovf,$num,$FRAME
 	subf	$ovf,$ovf,$sp	; $sp-$ovf
 	and	$ovf,$ovf,$tj	; minimize TLB usage
 	subf	$ovf,$sp,$ovf	; $ovf-$sp
+	mr	$tj,$sp
 	srwi	$num,$num,`log($BNSZ)/log(2)`
 	$STUX	$sp,$sp,$ovf
 
-	$PUSH	r14,`4*$SIZE_T`($sp)
-	$PUSH	r15,`5*$SIZE_T`($sp)
-	$PUSH	r16,`6*$SIZE_T`($sp)
-	$PUSH	r17,`7*$SIZE_T`($sp)
-	$PUSH	r18,`8*$SIZE_T`($sp)
-	$PUSH	r19,`9*$SIZE_T`($sp)
-	$PUSH	r20,`10*$SIZE_T`($sp)
-	$PUSH	r21,`11*$SIZE_T`($sp)
-	$PUSH	r22,`12*$SIZE_T`($sp)
-	$PUSH	r23,`13*$SIZE_T`($sp)
-	$PUSH	r24,`14*$SIZE_T`($sp)
-	$PUSH	r25,`15*$SIZE_T`($sp)
+	$PUSH	r20,`-12*$SIZE_T`($tj)
+	$PUSH	r21,`-11*$SIZE_T`($tj)
+	$PUSH	r22,`-10*$SIZE_T`($tj)
+	$PUSH	r23,`-9*$SIZE_T`($tj)
+	$PUSH	r24,`-8*$SIZE_T`($tj)
+	$PUSH	r25,`-7*$SIZE_T`($tj)
+	$PUSH	r26,`-6*$SIZE_T`($tj)
+	$PUSH	r27,`-5*$SIZE_T`($tj)
+	$PUSH	r28,`-4*$SIZE_T`($tj)
+	$PUSH	r29,`-3*$SIZE_T`($tj)
+	$PUSH	r30,`-2*$SIZE_T`($tj)
+	$PUSH	r31,`-1*$SIZE_T`($tj)
 
 	$LD	$n0,0($n0)	; pull n0[0] value
 	addi	$num,$num,-2	; adjust $num for counter register
 
 	$LD	$m0,0($bp)	; m0=bp[0]
 	$LD	$aj,0($ap)	; ap[0]
-	addi	$tp,$sp,$FRAME
+	addi	$tp,$sp,$LOCALS
 	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[0]
 	$UMULH	$hi0,$aj,$m0
 
@@ -205,8 +212,8 @@ L1st:
 Louter:
 	$LDX	$m0,$bp,$i	; m0=bp[i]
 	$LD	$aj,0($ap)	; ap[0]
-	addi	$tp,$sp,$FRAME
-	$LD	$tj,$FRAME($sp)	; tp[0]
+	addi	$tp,$sp,$LOCALS
+	$LD	$tj,$LOCALS($sp); tp[0]
 	$UMULL	$lo0,$aj,$m0	; ap[0]*bp[i]
 	$UMULH	$hi0,$aj,$m0
 	$LD	$aj,$BNSZ($ap)	; ap[1]
@@ -273,7 +280,7 @@ Linner:
 
 	addi	$num,$num,2	; restore $num
 	subfc	$j,$j,$j	; j=0 and "clear" XER[CA]
-	addi	$tp,$sp,$FRAME
+	addi	$tp,$sp,$LOCALS
 	mtctr	$num
 
 .align	4
@@ -299,23 +306,27 @@ Lcopy:				; copy or in-place refresh
 	addi	$j,$j,$BNSZ
 	bdnz-	Lcopy
 
-	$POP	r14,`4*$SIZE_T`($sp)
-	$POP	r15,`5*$SIZE_T`($sp)
-	$POP	r16,`6*$SIZE_T`($sp)
-	$POP	r17,`7*$SIZE_T`($sp)
-	$POP	r18,`8*$SIZE_T`($sp)
-	$POP	r19,`9*$SIZE_T`($sp)
-	$POP	r20,`10*$SIZE_T`($sp)
-	$POP	r21,`11*$SIZE_T`($sp)
-	$POP	r22,`12*$SIZE_T`($sp)
-	$POP	r23,`13*$SIZE_T`($sp)
-	$POP	r24,`14*$SIZE_T`($sp)
-	$POP	r25,`15*$SIZE_T`($sp)
-	$POP	$sp,0($sp)
+	$POP	$tj,0($sp)
 	li	r3,1
+	$POP	r20,`-12*$SIZE_T`($tj)
+	$POP	r21,`-11*$SIZE_T`($tj)
+	$POP	r22,`-10*$SIZE_T`($tj)
+	$POP	r23,`-9*$SIZE_T`($tj)
+	$POP	r24,`-8*$SIZE_T`($tj)
+	$POP	r25,`-7*$SIZE_T`($tj)
+	$POP	r26,`-6*$SIZE_T`($tj)
+	$POP	r27,`-5*$SIZE_T`($tj)
+	$POP	r28,`-4*$SIZE_T`($tj)
+	$POP	r29,`-3*$SIZE_T`($tj)
+	$POP	r30,`-2*$SIZE_T`($tj)
+	$POP	r31,`-1*$SIZE_T`($tj)
+	mr	$sp,$tj
 	blr
 	.long	0
-.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+	.byte	0,12,4,0,0x80,12,6,0
+	.long	0
+
+.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl
index f4093177e6..1249ce2299 100644
--- a/src/lib/libcrypto/bn/asm/ppc.pl
+++ b/src/lib/libcrypto/bn/asm/ppc.pl
@@ -389,7 +389,9 @@ $data=<<EOF;
 	$ST		r9,`6*$BNSZ`(r3)	#r[6]=c1
 	$ST		r10,`7*$BNSZ`(r3)	#r[7]=c2
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,2,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -814,8 +816,9 @@ $data=<<EOF;
 
 
 	blr
-
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,2,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -966,7 +969,9 @@ $data=<<EOF;
 	$ST	r10,`6*$BNSZ`(r3)	#r[6]=c1
 	$ST	r11,`7*$BNSZ`(r3)	#r[7]=c2
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,3,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1502,7 +1507,9 @@ $data=<<EOF;
 	$ST	r12,`14*$BNSZ`(r3)	#r[14]=c3;
 	$ST	r10,`15*$BNSZ`(r3)	#r[15]=c1;
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,3,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1550,8 +1557,9 @@ Lppcasm_sub_adios:
 	subfze	r3,r0		# if carry bit is set then r3 = 0 else -1
 	andi.	r3,r3,1         # keep only last bit.
 	blr
-	.long	0x00000000
-
+	.long	0
+	.byte	0,12,0x14,0,0,0,4,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop:
 Lppcasm_add_adios:	
 	addze	r3,r0			#return carry bit.
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,4,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1707,7 +1717,9 @@ Lppcasm_div8:
 Lppcasm_div9:
 	or	r3,r8,r0
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,3,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop:
 	bdnz-	Lppcasm_sqr_mainloop
 Lppcasm_sqr_adios:	
 	blr
-	.long	0x00000000
-
+	.long	0
+	.byte	0,12,0x14,0,0,0,3,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1850,7 +1863,9 @@ Lppcasm_mw_REM:
 Lppcasm_mw_OVER:	
 	addi	r3,r12,0
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,4,0
+	.long	0
 
 #
 #	NOTE:	The following label name should be changed to
@@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover:
 Lppcasm_maw_adios:	
 	addi	r3,r12,0
 	blr
-	.long	0x00000000
+	.long	0
+	.byte	0,12,0x14,0,0,0,4,0
+	.long	0
 	.align	4
 EOF
 $data =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/src/lib/libcrypto/bn/asm/ppc64-mont.pl b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
index 3449b35855..a14e769ad0 100644
--- a/src/lib/libcrypto/bn/asm/ppc64-mont.pl
+++ b/src/lib/libcrypto/bn/asm/ppc64-mont.pl
@@ -45,23 +45,40 @@
 # on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
 # in absolute terms, but it's apparently the way Power 6 is...
 
+# December 2009
+
+# Adapted for 32-bit build this module delivers 25-120%, yes, more
+# than *twice* for longer keys, performance improvement over 32-bit
+# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
+# even 64-bit integer operations and the trouble is that most PPC
+# operating systems don't preserve upper halves of general purpose
+# registers upon 32-bit signal delivery. They do preserve them upon
+# context switch, but not signalling:-( This means that asynchronous
+# signals have to be blocked upon entry to this subroutine. Signal
+# masking (and of course complementary unmasking) has quite an impact
+# on performance, naturally larger for shorter keys. It's so severe
+# that 512-bit key performance can be as low as 1/3 of expected one.
+# This is why this routine can be engaged for longer key operations
+# only on these OSes, see crypto/ppccap.c for further details. MacOS X
+# is an exception from this and doesn't require signal masking, and
+# that's where above improvement coefficients were collected. For
+# others alternative would be to break dependence on upper halves of
+# GPRs by sticking to 32-bit integer operations...
+
 $flavour = shift;
 
 if ($flavour =~ /32/) {
 	$SIZE_T=4;
 	$RZONE=	224;
-	$FRAME=	$SIZE_T*12+8*12;
-	$fname=	"bn_mul_mont_ppc64";
+	$fname=	"bn_mul_mont_fpu64";
 
 	$STUX=	"stwux";	# store indexed and update
 	$PUSH=	"stw";
 	$POP=	"lwz";
-	die "not implemented yet";
 } elsif ($flavour =~ /64/) {
 	$SIZE_T=8;
 	$RZONE=	288;
-	$FRAME=	$SIZE_T*12+8*12;
-	$fname=	"bn_mul_mont";
+	$fname=	"bn_mul_mont_fpu64";
 
 	# same as above, but 64-bit mnemonics...
 	$STUX=	"stdux";	# store indexed and update
@@ -76,7 +93,7 @@ die "can't locate ppc-xlate.pl";
 
 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
-$FRAME=($FRAME+63)&~63;
+$FRAME=64;	# padded frame header
 $TRANSFER=16*8;
 
 $carry="r0";
@@ -93,16 +110,16 @@ $tp="r10";
 $j="r11";
 $i="r12";
 # non-volatile registers
-$nap_d="r14";	# interleaved ap and np in double format
-$a0="r15";	# ap[0]
-$t0="r16";	# temporary registers
-$t1="r17";
-$t2="r18";
-$t3="r19";
-$t4="r20";
-$t5="r21";
-$t6="r22";
-$t7="r23";
+$nap_d="r22";	# interleaved ap and np in double format
+$a0="r23";	# ap[0]
+$t0="r24";	# temporary registers
+$t1="r25";
+$t2="r26";
+$t3="r27";
+$t4="r28";
+$t5="r29";
+$t6="r30";
+$t7="r31";
 
 # PPC offers enough register bank capacity to unroll inner loops twice
 #
@@ -132,28 +149,17 @@ $ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
 $na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
 $dota="f8";	$dotb="f9";
 $A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
-$N0="f14";	$N1="f15";	$N2="f16";	$N3="f17";
-$T0a="f18";	$T0b="f19";
-$T1a="f20";	$T1b="f21";
-$T2a="f22";	$T2b="f23";
-$T3a="f24";	$T3b="f25";
+$N0="f20";	$N1="f21";	$N2="f22";	$N3="f23";
+$T0a="f24";	$T0b="f25";
+$T1a="f26";	$T1b="f27";
+$T2a="f28";	$T2b="f29";
+$T3a="f30";	$T3b="f31";
 
 # sp----------->+-------------------------------+
 #		| saved sp			|
 #		+-------------------------------+
-#		|				|
-#		+-------------------------------+
-#		| 10 saved gpr, r14-r23		|
-#		.				.
-#		.				.
-#   +12*size_t	+-------------------------------+
-#		| 12 saved fpr, f14-f25		|
 #		.				.
-#		.				.
-#   +12*8	+-------------------------------+
-#		| padding to 64 byte boundary	|
-#		.				.
-#   +X		+-------------------------------+
+#   +64		+-------------------------------+
 #		| 16 gpr<->fpr transfer zone	|
 #		.				.
 #		.				.
@@ -173,6 +179,16 @@ $T3a="f24";	$T3b="f25";
 #		.				.
 #		.				.
 #		+-------------------------------+
+#		.				.
+#   -12*size_t	+-------------------------------+
+#		| 10 saved gpr, r22-r31		|
+#		.				.
+#		.				.
+#   -12*8	+-------------------------------+
+#		| 12 saved fpr, f20-f31		|
+#		.				.
+#		.				.
+#		+-------------------------------+
 
 $code=<<___;
 .machine "any"
@@ -181,14 +197,14 @@ $code=<<___;
 .globl	.$fname
 .align	5
 .$fname:
-	cmpwi	$num,4
+	cmpwi	$num,`3*8/$SIZE_T`
 	mr	$rp,r3		; $rp is reassigned
 	li	r3,0		; possible "not handled" return code
 	bltlr-
-	andi.	r0,$num,1	; $num has to be even
+	andi.	r0,$num,`16/$SIZE_T-1`		; $num has to be "even"
 	bnelr-
 
-	slwi	$num,$num,3	; num*=8
+	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
 	li	$i,-4096
 	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
 	add	$tp,$tp,$num	; place for tp[num+1]
@@ -196,35 +212,50 @@ $code=<<___;
 	subf	$tp,$tp,$sp	; $sp-$tp
 	and	$tp,$tp,$i	; minimize TLB usage
 	subf	$tp,$sp,$tp	; $tp-$sp
+	mr	$i,$sp
 	$STUX	$sp,$sp,$tp	; alloca
 
-	$PUSH	r14,`2*$SIZE_T`($sp)
-	$PUSH	r15,`3*$SIZE_T`($sp)
-	$PUSH	r16,`4*$SIZE_T`($sp)
-	$PUSH	r17,`5*$SIZE_T`($sp)
-	$PUSH	r18,`6*$SIZE_T`($sp)
-	$PUSH	r19,`7*$SIZE_T`($sp)
-	$PUSH	r20,`8*$SIZE_T`($sp)
-	$PUSH	r21,`9*$SIZE_T`($sp)
-	$PUSH	r22,`10*$SIZE_T`($sp)
-	$PUSH	r23,`11*$SIZE_T`($sp)
-	stfd	f14,`12*$SIZE_T+0`($sp)
-	stfd	f15,`12*$SIZE_T+8`($sp)
-	stfd	f16,`12*$SIZE_T+16`($sp)
-	stfd	f17,`12*$SIZE_T+24`($sp)
-	stfd	f18,`12*$SIZE_T+32`($sp)
-	stfd	f19,`12*$SIZE_T+40`($sp)
-	stfd	f20,`12*$SIZE_T+48`($sp)
-	stfd	f21,`12*$SIZE_T+56`($sp)
-	stfd	f22,`12*$SIZE_T+64`($sp)
-	stfd	f23,`12*$SIZE_T+72`($sp)
-	stfd	f24,`12*$SIZE_T+80`($sp)
-	stfd	f25,`12*$SIZE_T+88`($sp)
-
+	$PUSH	r22,`-12*8-10*$SIZE_T`($i)
+	$PUSH	r23,`-12*8-9*$SIZE_T`($i)
+	$PUSH	r24,`-12*8-8*$SIZE_T`($i)
+	$PUSH	r25,`-12*8-7*$SIZE_T`($i)
+	$PUSH	r26,`-12*8-6*$SIZE_T`($i)
+	$PUSH	r27,`-12*8-5*$SIZE_T`($i)
+	$PUSH	r28,`-12*8-4*$SIZE_T`($i)
+	$PUSH	r29,`-12*8-3*$SIZE_T`($i)
+	$PUSH	r30,`-12*8-2*$SIZE_T`($i)
+	$PUSH	r31,`-12*8-1*$SIZE_T`($i)
+	stfd	f20,`-12*8`($i)
+	stfd	f21,`-11*8`($i)
+	stfd	f22,`-10*8`($i)
+	stfd	f23,`-9*8`($i)
+	stfd	f24,`-8*8`($i)
+	stfd	f25,`-7*8`($i)
+	stfd	f26,`-6*8`($i)
+	stfd	f27,`-5*8`($i)
+	stfd	f28,`-4*8`($i)
+	stfd	f29,`-3*8`($i)
+	stfd	f30,`-2*8`($i)
+	stfd	f31,`-1*8`($i)
+___
+$code.=<<___ if ($SIZE_T==8);
 	ld	$a0,0($ap)	; pull ap[0] value
 	ld	$n0,0($n0)	; pull n0[0] value
 	ld	$t3,0($bp)	; bp[0]
-
+___
+$code.=<<___ if ($SIZE_T==4);
+	mr	$t1,$n0
+	lwz	$a0,0($ap)	; pull ap[0,1] value
+	lwz	$t0,4($ap)
+	lwz	$n0,0($t1)	; pull n0[0,1] value
+	lwz	$t1,4($t1)
+	lwz	$t3,0($bp)	; bp[0,1]
+	lwz	$t2,4($bp)
+	insrdi	$a0,$t0,32,0
+	insrdi	$n0,$t1,32,0
+	insrdi	$t3,$t2,32,0
+___
+$code.=<<___;
 	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
 	li	$i,-64
 	add	$nap_d,$tp,$num
@@ -258,6 +289,8 @@ $code=<<___;
 	std	$t5,`$FRAME+40`($sp)
 	std	$t6,`$FRAME+48`($sp)
 	std	$t7,`$FRAME+56`($sp)
+___
+$code.=<<___ if ($SIZE_T==8);
 	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
 	lwz	$t1,0($ap)
 	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
@@ -266,6 +299,18 @@ $code=<<___;
 	lwz	$t5,0($np)
 	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
 	lwz	$t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
+	lwz	$t1,4($ap)
+	lwz	$t2,8($ap)
+	lwz	$t3,12($ap)
+	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
+	lwz	$t5,4($np)
+	lwz	$t6,8($np)
+	lwz	$t7,12($np)
+___
+$code.=<<___;
 	lfd	$ba,`$FRAME+0`($sp)
 	lfd	$bb,`$FRAME+8`($sp)
 	lfd	$bc,`$FRAME+16`($sp)
@@ -374,6 +419,8 @@ $code=<<___;
 
 .align	5
 L1st:
+___
+$code.=<<___ if ($SIZE_T==8);
 	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
 	lwz	$t1,0($ap)
 	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
@@ -382,6 +429,18 @@ L1st:
 	lwz	$t5,0($np)
 	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
 	lwz	$t7,8($np)
+___
+$code.=<<___ if ($SIZE_T==4);
+	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
+	lwz	$t1,4($ap)
+	lwz	$t2,8($ap)
+	lwz	$t3,12($ap)
+	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
+	lwz	$t5,4($np)
+	lwz	$t6,8($np)
+	lwz	$t7,12($np)
+___
+$code.=<<___;
 	std	$t0,`$FRAME+64`($sp)
 	std	$t1,`$FRAME+72`($sp)
 	std	$t2,`$FRAME+80`($sp)
@@ -559,7 +618,17 @@ L1st:
 	li	$i,8			; i=1
 .align	5
 Louter:
+___
+$code.=<<___ if ($SIZE_T==8);
 	ldx	$t3,$bp,$i	; bp[i]
+___
+$code.=<<___ if ($SIZE_T==4);
+	add	$t0,$bp,$i
+	lwz	$t3,0($t0)		; bp[i,i+1]
+	lwz	$t0,4($t0)
+	insrdi	$t3,$t0,32,0
+___
+$code.=<<___;
 	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
 	mulld	$t7,$a0,$t3	; ap[0]*bp[i]
 
@@ -761,6 +830,13 @@ Linner:
 	stfd	$T0b,`$FRAME+8`($sp)
 	 add	$t7,$t7,$carry
 	 addc	$t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t0,$t0,32,0
+	extrdi	$t1,$t1,32,0
+	adde	$t0,$t0,$t1
+___
+$code.=<<___;
 	stfd	$T1a,`$FRAME+16`($sp)
 	stfd	$T1b,`$FRAME+24`($sp)
 	 insrdi	$t4,$t7,16,0		; 64..127 bits
@@ -768,6 +844,13 @@ Linner:
 	stfd	$T2a,`$FRAME+32`($sp)
 	stfd	$T2b,`$FRAME+40`($sp)
 	 adde	$t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t4,$t4,32,0
+	extrdi	$t2,$t2,32,0
+	adde	$t4,$t4,$t2
+___
+$code.=<<___;
 	stfd	$T3a,`$FRAME+48`($sp)
 	stfd	$T3b,`$FRAME+56`($sp)
 	 addze	$carry,$carry
@@ -816,7 +899,21 @@ Linner:
 	ld	$t7,`$FRAME+72`($sp)
 
 	addc	$t3,$t0,$t1
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t0,$t0,32,0
+	extrdi	$t1,$t1,32,0
+	adde	$t0,$t0,$t1
+___
+$code.=<<___;
 	adde	$t5,$t4,$t2
+___
+$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
+	extrdi	$t4,$t4,32,0
+	extrdi	$t2,$t2,32,0
+	adde	$t4,$t4,$t2
+___
+$code.=<<___;
 	addze	$carry,$carry
 
 	std	$t3,-16($tp)		; tp[j-1]
@@ -835,7 +932,9 @@ Linner:
 	subf	$nap_d,$t7,$nap_d	; rewind pointer
 	cmpw	$i,$num
 	blt-	Louter
+___
 
+$code.=<<___ if ($SIZE_T==8);
 	subf	$np,$num,$np	; rewind np
 	addi	$j,$j,1		; restore counter
 	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
@@ -883,34 +982,105 @@ Lcopy:				; copy or in-place refresh
 	stdx	$i,$t4,$i
 	addi	$i,$i,16
 	bdnz-	Lcopy
+___
+$code.=<<___ if ($SIZE_T==4);
+	subf	$np,$num,$np	; rewind np
+	addi	$j,$j,1		; restore counter
+	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
+	addi	$tp,$sp,`$FRAME+$TRANSFER`
+	addi	$np,$np,-4
+	addi	$rp,$rp,-4
+	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
+	mtctr	$j
+
+.align	4
+Lsub:	ld	$t0,8($tp)	; load tp[j..j+3] in 64-bit word order
+	ldu	$t2,16($tp)
+	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
+	lwz	$t5,8($np)
+	lwz	$t6,12($np)
+	lwzu	$t7,16($np)
+	extrdi	$t1,$t0,32,0
+	extrdi	$t3,$t2,32,0
+	subfe	$t4,$t4,$t0	; tp[j]-np[j]
+	 stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
+	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
+	 stw	$t1,8($ap)
+	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
+	 stw	$t2,12($ap)
+	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
+	 stwu	$t3,16($ap)
+	stw	$t4,4($rp)
+	stw	$t5,8($rp)
+	stw	$t6,12($rp)
+	stwu	$t7,16($rp)
+	bdnz-	Lsub
+
+	li	$i,0
+	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
+	addi	$tp,$sp,`$FRAME+$TRANSFER+4`
+	subf	$rp,$num,$rp	; rewind rp
+	and	$ap,$tp,$ovf
+	andc	$np,$rp,$ovf
+	or	$ap,$ap,$np	; ap=borrow?tp:rp
+	addi	$tp,$sp,`$FRAME+$TRANSFER`
+	mtctr	$j
+
+.align	4
+Lcopy:				; copy or in-place refresh
+	lwz	$t0,4($ap)
+	lwz	$t1,8($ap)
+	lwz	$t2,12($ap)
+	lwzu	$t3,16($ap)
+	std	$i,8($nap_d)	; zap nap_d
+	std	$i,16($nap_d)
+	std	$i,24($nap_d)
+	std	$i,32($nap_d)
+	std	$i,40($nap_d)
+	std	$i,48($nap_d)
+	std	$i,56($nap_d)
+	stdu	$i,64($nap_d)
+	stw	$t0,4($rp)
+	stw	$t1,8($rp)
+	stw	$t2,12($rp)
+	stwu	$t3,16($rp)
+	std	$i,8($tp)	; zap tp at once
+	stdu	$i,16($tp)
+	bdnz-	Lcopy
+___
 
-	$POP	r14,`2*$SIZE_T`($sp)
-	$POP	r15,`3*$SIZE_T`($sp)
-	$POP	r16,`4*$SIZE_T`($sp)
-	$POP	r17,`5*$SIZE_T`($sp)
-	$POP	r18,`6*$SIZE_T`($sp)
-	$POP	r19,`7*$SIZE_T`($sp)
-	$POP	r20,`8*$SIZE_T`($sp)
-	$POP	r21,`9*$SIZE_T`($sp)
-	$POP	r22,`10*$SIZE_T`($sp)
-	$POP	r23,`11*$SIZE_T`($sp)
-	lfd	f14,`12*$SIZE_T+0`($sp)
-	lfd	f15,`12*$SIZE_T+8`($sp)
-	lfd	f16,`12*$SIZE_T+16`($sp)
-	lfd	f17,`12*$SIZE_T+24`($sp)
-	lfd	f18,`12*$SIZE_T+32`($sp)
-	lfd	f19,`12*$SIZE_T+40`($sp)
-	lfd	f20,`12*$SIZE_T+48`($sp)
-	lfd	f21,`12*$SIZE_T+56`($sp)
-	lfd	f22,`12*$SIZE_T+64`($sp)
-	lfd	f23,`12*$SIZE_T+72`($sp)
-	lfd	f24,`12*$SIZE_T+80`($sp)
-	lfd	f25,`12*$SIZE_T+88`($sp)
-	$POP	$sp,0($sp)
+$code.=<<___;
+	$POP	$i,0($sp)
 	li	r3,1	; signal "handled"
+	$POP	r22,`-12*8-10*$SIZE_T`($i)
+	$POP	r23,`-12*8-9*$SIZE_T`($i)
+	$POP	r24,`-12*8-8*$SIZE_T`($i)
+	$POP	r25,`-12*8-7*$SIZE_T`($i)
+	$POP	r26,`-12*8-6*$SIZE_T`($i)
+	$POP	r27,`-12*8-5*$SIZE_T`($i)
+	$POP	r28,`-12*8-4*$SIZE_T`($i)
+	$POP	r29,`-12*8-3*$SIZE_T`($i)
+	$POP	r30,`-12*8-2*$SIZE_T`($i)
+	$POP	r31,`-12*8-1*$SIZE_T`($i)
+	lfd	f20,`-12*8`($i)
+	lfd	f21,`-11*8`($i)
+	lfd	f22,`-10*8`($i)
+	lfd	f23,`-9*8`($i)
+	lfd	f24,`-8*8`($i)
+	lfd	f25,`-7*8`($i)
+	lfd	f26,`-6*8`($i)
+	lfd	f27,`-5*8`($i)
+	lfd	f28,`-4*8`($i)
+	lfd	f29,`-3*8`($i)
+	lfd	f30,`-2*8`($i)
+	lfd	f31,`-1*8`($i)
+	mr	$sp,$i
 	blr
 	.long	0
-.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
+	.byte	0,12,4,0,0x8c,10,6,0
+	.long	0
+
+.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl
new file mode 100644
index 0000000000..cd9f13eca2
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl
@@ -0,0 +1,221 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... gcc 4.3 appeared to generate poor code, therefore
+# the effort. And indeed, the module delivers 55%-90%(*) improvement
+# on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
+# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
+# This is for 64-bit build. In 32-bit "highgprs" case improvement is
+# even higher, for example on z990 it was measured 80%-150%. ECDSA
+# sign is modest 9%-12% faster. Keep in mind that these coefficients
+# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
+# burnt in it...
+#
+# (*)	gcc 4.1 was observed to deliver better results than gcc 4.3,
+#	so that improvement coefficients can vary from one specific
+#	setup to another.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+        $SIZE_T=4;
+        $g="";
+} else {
+        $SIZE_T=8;
+        $g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
+$rp="%r2";
+$a1="%r3";
+$a0="%r4";
+$b1="%r5";
+$b0="%r6";
+
+$ra="%r14";
+$sp="%r15";
+
+@T=("%r0","%r1");
+@i=("%r12","%r13");
+
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
+($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
+
+$code.=<<___;
+.text
+
+.type	_mul_1x1,\@function
+.align	16
+_mul_1x1:
+	lgr	$a1,$a
+	sllg	$a2,$a,1
+	sllg	$a4,$a,2
+	sllg	$a8,$a,3
+
+	srag	$lo,$a1,63			# broadcast 63rd bit
+	nihh	$a1,0x1fff
+	srag	@i[0],$a2,63			# broadcast 62nd bit
+	nihh	$a2,0x3fff
+	srag	@i[1],$a4,63			# broadcast 61st bit
+	nihh	$a4,0x7fff
+	ngr	$lo,$b
+	ngr	@i[0],$b
+	ngr	@i[1],$b
+
+	lghi	@T[0],0
+	lgr	$a12,$a1
+	stg	@T[0],`$stdframe+0*8`($sp)	# tab[0]=0
+	xgr	$a12,$a2
+	stg	$a1,`$stdframe+1*8`($sp)	# tab[1]=a1
+	 lgr	$a48,$a4
+	stg	$a2,`$stdframe+2*8`($sp)	# tab[2]=a2
+	 xgr	$a48,$a8
+	stg	$a12,`$stdframe+3*8`($sp)	# tab[3]=a1^a2
+	 xgr	$a1,$a4
+
+	stg	$a4,`$stdframe+4*8`($sp)	# tab[4]=a4
+	xgr	$a2,$a4
+	stg	$a1,`$stdframe+5*8`($sp)	# tab[5]=a1^a4
+	xgr	$a12,$a4
+	stg	$a2,`$stdframe+6*8`($sp)	# tab[6]=a2^a4
+	 xgr	$a1,$a48
+	stg	$a12,`$stdframe+7*8`($sp)	# tab[7]=a1^a2^a4
+	 xgr	$a2,$a48
+
+	stg	$a8,`$stdframe+8*8`($sp)	# tab[8]=a8
+	xgr	$a12,$a48
+	stg	$a1,`$stdframe+9*8`($sp)	# tab[9]=a1^a8
+	 xgr	$a1,$a4
+	stg	$a2,`$stdframe+10*8`($sp)	# tab[10]=a2^a8
+	 xgr	$a2,$a4
+	stg	$a12,`$stdframe+11*8`($sp)	# tab[11]=a1^a2^a8
+
+	xgr	$a12,$a4
+	stg	$a48,`$stdframe+12*8`($sp)	# tab[12]=a4^a8
+	 srlg	$hi,$lo,1
+	stg	$a1,`$stdframe+13*8`($sp)	# tab[13]=a1^a4^a8
+	 sllg	$lo,$lo,63
+	stg	$a2,`$stdframe+14*8`($sp)	# tab[14]=a2^a4^a8
+	 srlg	@T[0],@i[0],2
+	stg	$a12,`$stdframe+15*8`($sp)	# tab[15]=a1^a2^a4^a8
+
+	lghi	$mask,`0xf<<3`
+	sllg	$a1,@i[0],62
+	 sllg	@i[0],$b,3
+	srlg	@T[1],@i[1],3
+	 ngr	@i[0],$mask
+	sllg	$a2,@i[1],61
+	 srlg	@i[1],$b,4-3
+	xgr	$hi,@T[0]
+	 ngr	@i[1],$mask
+	xgr	$lo,$a1
+	xgr	$hi,@T[1]
+	xgr	$lo,$a2
+
+	xg	$lo,$stdframe(@i[0],$sp)
+	srlg	@i[0],$b,8-3
+	ngr	@i[0],$mask
+___
+for($n=1;$n<14;$n++) {
+$code.=<<___;
+	lg	@T[1],$stdframe(@i[1],$sp)
+	srlg	@i[1],$b,`($n+2)*4`-3
+	sllg	@T[0],@T[1],`$n*4`
+	ngr	@i[1],$mask
+	srlg	@T[1],@T[1],`64-$n*4`
+	xgr	$lo,@T[0]
+	xgr	$hi,@T[1]
+___
+	push(@i,shift(@i)); push(@T,shift(@T));
+}
+$code.=<<___;
+	lg	@T[1],$stdframe(@i[1],$sp)
+	sllg	@T[0],@T[1],`$n*4`
+	srlg	@T[1],@T[1],`64-$n*4`
+	xgr	$lo,@T[0]
+	xgr	$hi,@T[1]
+
+	lg	@T[0],$stdframe(@i[0],$sp)
+	sllg	@T[1],@T[0],`($n+1)*4`
+	srlg	@T[0],@T[0],`64-($n+1)*4`
+	xgr	$lo,@T[1]
+	xgr	$hi,@T[0]
+
+	br	$ra
+.size	_mul_1x1,.-_mul_1x1
+
+.globl	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,\@function
+.align	16
+bn_GF2m_mul_2x2:
+	stm${g}	%r3,%r15,3*$SIZE_T($sp)
+
+	lghi	%r1,-$stdframe-128
+	la	%r0,0($sp)
+	la	$sp,0(%r1,$sp)			# alloca
+	st${g}	%r0,0($sp)			# back chain
+___
+if ($SIZE_T==8) {
+my @r=map("%r$_",(6..9));
+$code.=<<___;
+	bras	$ra,_mul_1x1			# a1�b1
+	stmg	$lo,$hi,16($rp)
+
+	lg	$a,`$stdframe+128+4*$SIZE_T`($sp)
+	lg	$b,`$stdframe+128+6*$SIZE_T`($sp)
+	bras	$ra,_mul_1x1			# a0�b0
+	stmg	$lo,$hi,0($rp)
+
+	lg	$a,`$stdframe+128+3*$SIZE_T`($sp)
+	lg	$b,`$stdframe+128+5*$SIZE_T`($sp)
+	xg	$a,`$stdframe+128+4*$SIZE_T`($sp)
+	xg	$b,`$stdframe+128+6*$SIZE_T`($sp)
+	bras	$ra,_mul_1x1			# (a0+a1)�(b0+b1)
+	lmg	@r[0],@r[3],0($rp)
+
+	xgr	$lo,$hi
+	xgr	$hi,@r[1]
+	xgr	$lo,@r[0]
+	xgr	$hi,@r[2]
+	xgr	$lo,@r[3]	
+	xgr	$hi,@r[3]
+	xgr	$lo,$hi
+	stg	$hi,16($rp)
+	stg	$lo,8($rp)
+___
+} else {
+$code.=<<___;
+	sllg	%r3,%r3,32
+	sllg	%r5,%r5,32
+	or	%r3,%r4
+	or	%r5,%r6
+	bras	$ra,_mul_1x1
+	rllg	$lo,$lo,32
+	rllg	$hi,$hi,32
+	stmg	$lo,$hi,0($rp)
+___
+}
+$code.=<<___;
+	lm${g}	%r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
+	br	$ra
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.string	"GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl
index f61246f5b6..9fd64e81ee 100644
--- a/src/lib/libcrypto/bn/asm/s390x-mont.pl
+++ b/src/lib/libcrypto/bn/asm/s390x-mont.pl
@@ -32,6 +32,33 @@
 # Reschedule to minimize/avoid Address Generation Interlock hazard,
 # make inner loops counter-based.
 
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
+# is achieved by swapping words after 64-bit loads, follow _dswap-s.
+# On z990 it was measured to perform 2.6-2.2 times better than
+# compiler-generated code, less for longer keys...
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$stdframe=16*$SIZE_T+4*8;
+
 $mn0="%r0";
 $num="%r1";
 
@@ -60,34 +87,44 @@ $code.=<<___;
 .globl	bn_mul_mont
 .type	bn_mul_mont,\@function
 bn_mul_mont:
-	lgf	$num,164($sp)	# pull $num
-	sla	$num,3		# $num to enumerate bytes
+	lgf	$num,`$stdframe+$SIZE_T-4`($sp)	# pull $num
+	sla	$num,`log($SIZE_T)/log(2)`	# $num to enumerate bytes
 	la	$bp,0($num,$bp)
 
-	stg	%r2,16($sp)
+	st${g}	%r2,2*$SIZE_T($sp)
 
 	cghi	$num,16		#
 	lghi	%r2,0		#
 	blr	%r14		# if($num<16) return 0;
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+	tmll	$num,4
+	bnzr	%r14		# if ($num&1) return 0;
+___
+$code.=<<___ if ($flavour !~ /3[12]/);
 	cghi	$num,96		#
 	bhr	%r14		# if($num>96) return 0;
+___
+$code.=<<___;
+	stm${g}	%r3,%r15,3*$SIZE_T($sp)
 
-	stmg	%r3,%r15,24($sp)
-
-	lghi	$rp,-160-8	# leave room for carry bit
+	lghi	$rp,-$stdframe-8	# leave room for carry bit
 	lcgr	$j,$num		# -$num
 	lgr	%r0,$sp
 	la	$rp,0($rp,$sp)
 	la	$sp,0($j,$rp)	# alloca
-	stg	%r0,0($sp)	# back chain
+	st${g}	%r0,0($sp)	# back chain
 
 	sra	$num,3		# restore $num
 	la	$bp,0($j,$bp)	# restore $bp
 	ahi	$num,-1		# adjust $num for inner loop
 	lg	$n0,0($n0)	# pull n0
+	_dswap	$n0
 
 	lg	$bi,0($bp)
+	_dswap	$bi
 	lg	$alo,0($ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[0]*bp[0]
 	lgr	$AHI,$ahi
 
@@ -95,6 +132,7 @@ bn_mul_mont:
 	msgr	$mn0,$n0
 
 	lg	$nlo,0($np)	#
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[0]*m1
 	algr	$nlo,$alo	# +="tp[0]"
 	lghi	$NHI,0
@@ -106,12 +144,14 @@ bn_mul_mont:
 .align	16
 .L1st:
 	lg	$alo,0($j,$ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[j]*bp[0]
 	algr	$alo,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$ahi
 
 	lg	$nlo,0($j,$np)
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[j]*m1
 	algr	$nlo,$NHI
 	lghi	$NHI,0
@@ -119,22 +159,24 @@ bn_mul_mont:
 	algr	$nlo,$alo
 	alcgr	$NHI,$nhi
 
-	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
+	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
 	la	$j,8($j)	# j++
 	brct	$count,.L1st
 
 	algr	$NHI,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$AHI	# upmost overflow bit
-	stg	$NHI,160-8($j,$sp)
-	stg	$AHI,160($j,$sp)
+	stg	$NHI,$stdframe-8($j,$sp)
+	stg	$AHI,$stdframe($j,$sp)
 	la	$bp,8($bp)	# bp++
 
 .Louter:
 	lg	$bi,0($bp)	# bp[i]
+	_dswap	$bi
 	lg	$alo,0($ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[0]*bp[i]
-	alg	$alo,160($sp)	# +=tp[0]
+	alg	$alo,$stdframe($sp)	# +=tp[0]
 	lghi	$AHI,0
 	alcgr	$AHI,$ahi
 
@@ -142,6 +184,7 @@ bn_mul_mont:
 	msgr	$mn0,$n0	# tp[0]*n0
 
 	lg	$nlo,0($np)	# np[0]
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[0]*m1
 	algr	$nlo,$alo	# +="tp[0]"
 	lghi	$NHI,0
@@ -153,14 +196,16 @@ bn_mul_mont:
 .align	16
 .Linner:
 	lg	$alo,0($j,$ap)
+	_dswap	$alo
 	mlgr	$ahi,$bi	# ap[j]*bp[i]
 	algr	$alo,$AHI
 	lghi	$AHI,0
 	alcgr	$ahi,$AHI
-	alg	$alo,160($j,$sp)# +=tp[j]
+	alg	$alo,$stdframe($j,$sp)# +=tp[j]
 	alcgr	$AHI,$ahi
 
 	lg	$nlo,0($j,$np)
+	_dswap	$nlo
 	mlgr	$nhi,$mn0	# np[j]*m1
 	algr	$nlo,$NHI
 	lghi	$NHI,0
@@ -168,31 +213,33 @@ bn_mul_mont:
 	algr	$nlo,$alo	# +="tp[j]"
 	alcgr	$NHI,$nhi
 
-	stg	$nlo,160-8($j,$sp)	# tp[j-1]=
+	stg	$nlo,$stdframe-8($j,$sp)	# tp[j-1]=
 	la	$j,8($j)	# j++
 	brct	$count,.Linner
 
 	algr	$NHI,$AHI
 	lghi	$AHI,0
 	alcgr	$AHI,$AHI
-	alg	$NHI,160($j,$sp)# accumulate previous upmost overflow bit
+	alg	$NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
 	lghi	$ahi,0
 	alcgr	$AHI,$ahi	# new upmost overflow bit
-	stg	$NHI,160-8($j,$sp)
-	stg	$AHI,160($j,$sp)
+	stg	$NHI,$stdframe-8($j,$sp)
+	stg	$AHI,$stdframe($j,$sp)
 
 	la	$bp,8($bp)	# bp++
-	clg	$bp,160+8+32($j,$sp)	# compare to &bp[num]
+	cl${g}	$bp,`$stdframe+8+4*$SIZE_T`($j,$sp)	# compare to &bp[num]
 	jne	.Louter
 
-	lg	$rp,160+8+16($j,$sp)	# reincarnate rp
-	la	$ap,160($sp)
+	l${g}	$rp,`$stdframe+8+2*$SIZE_T`($j,$sp)	# reincarnate rp
+	la	$ap,$stdframe($sp)
 	ahi	$num,1		# restore $num, incidentally clears "borrow"
 
 	la	$j,0(%r0)
 	lr	$count,$num
 .Lsub:	lg	$alo,0($j,$ap)
-	slbg	$alo,0($j,$np)
+	lg	$nlo,0($j,$np)
+	_dswap	$nlo
+	slbgr	$alo,$nlo
 	stg	$alo,0($j,$rp)
 	la	$j,8($j)
 	brct	$count,.Lsub
@@ -207,19 +254,24 @@ bn_mul_mont:
 
 	la	$j,0(%r0)
 	lgr	$count,$num
-.Lcopy:	lg	$alo,0($j,$ap)	# copy or in-place refresh
-	stg	$j,160($j,$sp)	# zap tp
+.Lcopy:	lg	$alo,0($j,$ap)		# copy or in-place refresh
+	_dswap	$alo
+	stg	$j,$stdframe($j,$sp)	# zap tp
 	stg	$alo,0($j,$rp)
 	la	$j,8($j)
 	brct	$count,.Lcopy
 
-	la	%r1,160+8+48($j,$sp)
-	lmg	%r6,%r15,0(%r1)
+	la	%r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
+	lm${g}	%r6,%r15,0(%r1)
 	lghi	%r2,1		# signal "processed"
 	br	%r14
 .size	bn_mul_mont,.-bn_mul_mont
 .string	"Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 
-print $code;
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+	s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
+	print $_,"\n";
+}
 close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/x86-gf2m.pl b/src/lib/libcrypto/bn/asm/x86-gf2m.pl
new file mode 100644
index 0000000000..808a1e5969
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86-gf2m.pl
@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... Except that it has three code paths: pure integer
+# code suitable for any x86 CPU, MMX code suitable for PIII and later
+# and PCLMULQDQ suitable for Westmere and later. Improvement varies
+# from one benchmark and �-arch to another. Below are interval values
+# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
+# code:
+#
+# PIII		16%-30%
+# P4		12%-12%
+# Opteron	18%-40%
+# Core2		19%-44%
+# Atom		38%-64%
+# Westmere	53%-121%(PCLMULQDQ)/20%-32%(MMX)
+# Sandy Bridge	72%-127%(PCLMULQDQ)/27%-23%(MMX)
+#
+# Note that above improvement coefficients are not coefficients for
+# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result
+# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark
+# is more and more dominated by other subroutines, most notably by
+# BN_GF2m_mod[_mul]_arr...
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+$a="eax";
+$b="ebx";
+($a1,$a2,$a4)=("ecx","edx","ebp");
+
+$R="mm0";
+@T=("mm1","mm2");
+($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
+@i=("esi","edi");
+
+					if (!$x86only) {
+&function_begin_B("_mul_1x1_mmx");
+	&sub	("esp",32+4);
+	 &mov	($a1,$a);
+	 &lea	($a2,&DWP(0,$a,$a));
+	 &and	($a1,0x3fffffff);
+	 &lea	($a4,&DWP(0,$a2,$a2));
+	 &mov	(&DWP(0*4,"esp"),0);
+	 &and	($a2,0x7fffffff);
+	&movd	($A,$a);
+	&movd	($B,$b);
+	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
+	 &xor	($a1,$a2);		# a1^a2
+	&pxor	($B31,$B31);
+	&pxor	($B30,$B30);
+	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
+	 &xor	($a2,$a4);		# a2^a4
+	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
+	&pcmpgtd($B31,$A);		# broadcast 31st bit
+	&paddd	($A,$A);		# $A<<=1
+	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
+	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
+	 &xor	($a4,$a2);		# a2=a4^a2^a4
+	&pand	($B31,$B);
+	&pcmpgtd($B30,$A);		# broadcast 30th bit
+	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
+	 &xor	($a4,$a1);		# a1^a2^a4
+	&psllq	($B31,31);
+	&pand	($B30,$B);
+	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
+	&mov	(@i[0],0x7);
+	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
+	 &mov	($a4,@i[0]);
+	&and	(@i[0],$b);
+	&shr	($b,3);
+	&mov	(@i[1],$a4);
+	&psllq	($B30,30);
+	&and	(@i[1],$b);
+	&shr	($b,3);
+	&movd	($R,&DWP(0,"esp",@i[0],4));
+	&mov	(@i[0],$a4);
+	&and	(@i[0],$b);
+	&shr	($b,3);
+	for($n=1;$n<9;$n++) {
+		&movd	(@T[1],&DWP(0,"esp",@i[1],4));
+		&mov	(@i[1],$a4);
+		&psllq	(@T[1],3*$n);
+		&and	(@i[1],$b);
+		&shr	($b,3);
+		&pxor	($R,@T[1]);
+
+		push(@i,shift(@i)); push(@T,shift(@T));
+	}
+	&movd	(@T[1],&DWP(0,"esp",@i[1],4));
+	&pxor	($R,$B30);
+	&psllq	(@T[1],3*$n++);
+	&pxor	($R,@T[1]);
+
+	&movd	(@T[0],&DWP(0,"esp",@i[0],4));
+	&pxor	($R,$B31);
+	&psllq	(@T[0],3*$n);
+	&add	("esp",32+4);
+	&pxor	($R,@T[0]);
+	&ret	();
+&function_end_B("_mul_1x1_mmx");
+					}
+
+($lo,$hi)=("eax","edx");
+@T=("ecx","ebp");
+
+&function_begin_B("_mul_1x1_ialu");
+	&sub	("esp",32+4);
+	 &mov	($a1,$a);
+	 &lea	($a2,&DWP(0,$a,$a));
+	 &lea	($a4,&DWP(0,"",$a,4));
+	 &and	($a1,0x3fffffff);
+	&lea	(@i[1],&DWP(0,$lo,$lo));
+	&sar	($lo,31);		# broadcast 31st bit
+	 &mov	(&DWP(0*4,"esp"),0);
+	 &and	($a2,0x7fffffff);
+	 &mov	(&DWP(1*4,"esp"),$a1);	# a1
+	 &xor	($a1,$a2);		# a1^a2
+	 &mov	(&DWP(2*4,"esp"),$a2);	# a2
+	 &xor	($a2,$a4);		# a2^a4
+	 &mov	(&DWP(3*4,"esp"),$a1);	# a1^a2
+	 &xor	($a1,$a2);		# a1^a4=a1^a2^a2^a4
+	 &mov	(&DWP(4*4,"esp"),$a4);	# a4
+	 &xor	($a4,$a2);		# a2=a4^a2^a4
+	 &mov	(&DWP(5*4,"esp"),$a1);	# a1^a4
+	 &xor	($a4,$a1);		# a1^a2^a4
+	&sar	(@i[1],31);		# broardcast 30th bit
+	&and	($lo,$b);
+	 &mov	(&DWP(6*4,"esp"),$a2);	# a2^a4
+	&and	(@i[1],$b);
+	 &mov	(&DWP(7*4,"esp"),$a4);	# a1^a2^a4
+	&mov	($hi,$lo);
+	&shl	($lo,31);
+	&mov	(@T[0],@i[1]);
+	&shr	($hi,1);
+
+	 &mov	(@i[0],0x7);
+	&shl	(@i[1],30);
+	 &and	(@i[0],$b);
+	&shr	(@T[0],2);
+	&xor	($lo,@i[1]);
+
+	&shr	($b,3);
+	&mov	(@i[1],0x7);		# 5-byte instruction!?
+	&and	(@i[1],$b);
+	&shr	($b,3);
+	 &xor	($hi,@T[0]);
+	&xor	($lo,&DWP(0,"esp",@i[0],4));
+	&mov	(@i[0],0x7);
+	&and	(@i[0],$b);
+	&shr	($b,3);
+	for($n=1;$n<9;$n++) {
+		&mov	(@T[1],&DWP(0,"esp",@i[1],4));
+		&mov	(@i[1],0x7);
+		&mov	(@T[0],@T[1]);
+		&shl	(@T[1],3*$n);
+		&and	(@i[1],$b);
+		&shr	(@T[0],32-3*$n);
+		&xor	($lo,@T[1]);
+		&shr	($b,3);
+		&xor	($hi,@T[0]);
+
+		push(@i,shift(@i)); push(@T,shift(@T));
+	}
+	&mov	(@T[1],&DWP(0,"esp",@i[1],4));
+	&mov	(@T[0],@T[1]);
+	&shl	(@T[1],3*$n);
+	&mov	(@i[1],&DWP(0,"esp",@i[0],4));
+	&shr	(@T[0],32-3*$n);	$n++;
+	&mov	(@i[0],@i[1]);
+	&xor	($lo,@T[1]);
+	&shl	(@i[1],3*$n);
+	&xor	($hi,@T[0]);
+	&shr	(@i[0],32-3*$n);
+	&xor	($lo,@i[1]);
+	&xor	($hi,@i[0]);
+
+	&add	("esp",32+4);
+	&ret	();
+&function_end_B("_mul_1x1_ialu");
+
+# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+&function_begin_B("bn_GF2m_mul_2x2");
+if (!$x86only) {
+	&picmeup("edx","OPENSSL_ia32cap_P");
+	&mov	("eax",&DWP(0,"edx"));
+	&mov	("edx",&DWP(4,"edx"));
+	&test	("eax",1<<23);		# check MMX bit
+	&jz	(&label("ialu"));
+if ($sse2) {
+	&test	("eax",1<<24);		# check FXSR bit
+	&jz	(&label("mmx"));
+	&test	("edx",1<<1);		# check PCLMULQDQ bit
+	&jz	(&label("mmx"));
+
+	&movups		("xmm0",&QWP(8,"esp"));
+	&shufps		("xmm0","xmm0",0b10110001);
+	&pclmulqdq	("xmm0","xmm0",1);
+	&mov		("eax",&DWP(4,"esp"));
+	&movups		(&QWP(0,"eax"),"xmm0");
+	&ret	();
+
+&set_label("mmx",16);
+}
+	&push	("ebp");
+	&push	("ebx");
+	&push	("esi");
+	&push	("edi");
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&call	("_mul_1x1_mmx");	# a1�b1
+	&movq	("mm7",$R);
+
+	&mov	($a,&wparam(2));
+	&mov	($b,&wparam(4));
+	&call	("_mul_1x1_mmx");	# a0�b0
+	&movq	("mm6",$R);
+
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&xor	($a,&wparam(2));
+	&xor	($b,&wparam(4));
+	&call	("_mul_1x1_mmx");	# (a0+a1)�(b0+b1)
+	&pxor	($R,"mm7");
+	&mov	($a,&wparam(0));
+	&pxor	($R,"mm6");		# (a0+a1)�(b0+b1)-a1�b1-a0�b0
+
+	&movq	($A,$R);
+	&psllq	($R,32);
+	&pop	("edi");
+	&psrlq	($A,32);
+	&pop	("esi");
+	&pxor	($R,"mm6");
+	&pop	("ebx");
+	&pxor	($A,"mm7");
+	&movq	(&QWP(0,$a),$R);
+	&pop	("ebp");
+	&movq	(&QWP(8,$a),$A);
+	&emms	();
+	&ret	();
+&set_label("ialu",16);
+}
+	&push	("ebp");
+	&push	("ebx");
+	&push	("esi");
+	&push	("edi");
+	&stack_push(4+1);
+
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&call	("_mul_1x1_ialu");	# a1�b1
+	&mov	(&DWP(8,"esp"),$lo);
+	&mov	(&DWP(12,"esp"),$hi);
+
+	&mov	($a,&wparam(2));
+	&mov	($b,&wparam(4));
+	&call	("_mul_1x1_ialu");	# a0�b0
+	&mov	(&DWP(0,"esp"),$lo);
+	&mov	(&DWP(4,"esp"),$hi);
+
+	&mov	($a,&wparam(1));
+	&mov	($b,&wparam(3));
+	&xor	($a,&wparam(2));
+	&xor	($b,&wparam(4));
+	&call	("_mul_1x1_ialu");	# (a0+a1)�(b0+b1)
+
+	&mov	("ebp",&wparam(0));
+		 @r=("ebx","ecx","edi","esi");
+	&mov	(@r[0],&DWP(0,"esp"));
+	&mov	(@r[1],&DWP(4,"esp"));
+	&mov	(@r[2],&DWP(8,"esp"));
+	&mov	(@r[3],&DWP(12,"esp"));
+
+	&xor	($lo,$hi);
+	&xor	($hi,@r[1]);
+	&xor	($lo,@r[0]);
+	&mov	(&DWP(0,"ebp"),@r[0]);
+	&xor	($hi,@r[2]);
+	&mov	(&DWP(12,"ebp"),@r[3]);
+	&xor	($lo,@r[3]);
+	&stack_pop(4+1);
+	&xor	($hi,@r[3]);
+	&pop	("edi");
+	&xor	($lo,$hi);
+	&pop	("esi");
+	&mov	(&DWP(8,"ebp"),$hi);
+	&pop	("ebx");
+	&mov	(&DWP(4,"ebp"),$lo);
+	&pop	("ebp");
+	&ret	();
+&function_end_B("bn_GF2m_mul_2x2");
+
+&asciz	("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
diff --git a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl
new file mode 100644
index 0000000000..1658acbbdd
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl
@@ -0,0 +1,389 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... Except that it has two code paths: code suitable
+# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
+# later. Improvement varies from one benchmark and �-arch to another.
+# Vanilla code path is at most 20% faster than compiler-generated code
+# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
+# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
+# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
+# all CPU time is burnt in it...
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+($lo,$hi)=("%rax","%rdx");	$a=$lo;
+($i0,$i1)=("%rsi","%rdi");
+($t0,$t1)=("%rbx","%rcx");
+($b,$mask)=("%rbp","%r8");
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
+($R,$Tx)=("%xmm0","%xmm1");
+
+$code.=<<___;
+.text
+
+.type	_mul_1x1,\@abi-omnipotent
+.align	16
+_mul_1x1:
+	sub	\$128+8,%rsp
+	mov	\$-1,$a1
+	lea	($a,$a),$i0
+	shr	\$3,$a1
+	lea	(,$a,4),$i1
+	and	$a,$a1			# a1=a&0x1fffffffffffffff
+	lea	(,$a,8),$a8
+	sar	\$63,$a			# broadcast 63rd bit
+	lea	($a1,$a1),$a2
+	sar	\$63,$i0		# broadcast 62nd bit
+	lea	(,$a1,4),$a4
+	and	$b,$a
+	sar	\$63,$i1		# boardcast 61st bit
+	mov	$a,$hi			# $a is $lo
+	shl	\$63,$lo
+	and	$b,$i0
+	shr	\$1,$hi
+	mov	$i0,$t1
+	shl	\$62,$i0
+	and	$b,$i1
+	shr	\$2,$t1
+	xor	$i0,$lo
+	mov	$i1,$t0
+	shl	\$61,$i1
+	xor	$t1,$hi
+	shr	\$3,$t0
+	xor	$i1,$lo
+	xor	$t0,$hi
+
+	mov	$a1,$a12
+	movq	\$0,0(%rsp)		# tab[0]=0
+	xor	$a2,$a12		# a1^a2
+	mov	$a1,8(%rsp)		# tab[1]=a1
+	 mov	$a4,$a48
+	mov	$a2,16(%rsp)		# tab[2]=a2
+	 xor	$a8,$a48		# a4^a8
+	mov	$a12,24(%rsp)		# tab[3]=a1^a2
+
+	xor	$a4,$a1
+	mov	$a4,32(%rsp)		# tab[4]=a4
+	xor	$a4,$a2
+	mov	$a1,40(%rsp)		# tab[5]=a1^a4
+	xor	$a4,$a12
+	mov	$a2,48(%rsp)		# tab[6]=a2^a4
+	 xor	$a48,$a1		# a1^a4^a4^a8=a1^a8
+	mov	$a12,56(%rsp)		# tab[7]=a1^a2^a4
+	 xor	$a48,$a2		# a2^a4^a4^a8=a1^a8
+
+	mov	$a8,64(%rsp)		# tab[8]=a8
+	xor	$a48,$a12		# a1^a2^a4^a4^a8=a1^a2^a8
+	mov	$a1,72(%rsp)		# tab[9]=a1^a8
+	 xor	$a4,$a1			# a1^a8^a4
+	mov	$a2,80(%rsp)		# tab[10]=a2^a8
+	 xor	$a4,$a2			# a2^a8^a4
+	mov	$a12,88(%rsp)		# tab[11]=a1^a2^a8
+
+	xor	$a4,$a12		# a1^a2^a8^a4
+	mov	$a48,96(%rsp)		# tab[12]=a4^a8
+	 mov	$mask,$i0
+	mov	$a1,104(%rsp)		# tab[13]=a1^a4^a8
+	 and	$b,$i0
+	mov	$a2,112(%rsp)		# tab[14]=a2^a4^a8
+	 shr	\$4,$b
+	mov	$a12,120(%rsp)		# tab[15]=a1^a2^a4^a8
+	 mov	$mask,$i1
+	 and	$b,$i1
+	 shr	\$4,$b
+
+	movq	(%rsp,$i0,8),$R		# half of calculations is done in SSE2
+	mov	$mask,$i0
+	and	$b,$i0
+	shr	\$4,$b
+___
+    for ($n=1;$n<8;$n++) {
+	$code.=<<___;
+	mov	(%rsp,$i1,8),$t1
+	mov	$mask,$i1
+	mov	$t1,$t0
+	shl	\$`8*$n-4`,$t1
+	and	$b,$i1
+	 movq	(%rsp,$i0,8),$Tx
+	shr	\$`64-(8*$n-4)`,$t0
+	xor	$t1,$lo
+	 pslldq	\$$n,$Tx
+	 mov	$mask,$i0
+	shr	\$4,$b
+	xor	$t0,$hi
+	 and	$b,$i0
+	 shr	\$4,$b
+	 pxor	$Tx,$R
+___
+    }
+$code.=<<___;
+	mov	(%rsp,$i1,8),$t1
+	mov	$t1,$t0
+	shl	\$`8*$n-4`,$t1
+	movq	$R,$i0
+	shr	\$`64-(8*$n-4)`,$t0
+	xor	$t1,$lo
+	psrldq	\$8,$R
+	xor	$t0,$hi
+	movq	$R,$i1
+	xor	$i0,$lo
+	xor	$i1,$hi
+
+	add	\$128+8,%rsp
+	ret
+.Lend_mul_1x1:
+.size	_mul_1x1,.-_mul_1x1
+___
+
+($rp,$a1,$a0,$b1,$b0) = $win64?	("%rcx","%rdx","%r8", "%r9","%r10") :	# Win64 order
+				("%rdi","%rsi","%rdx","%rcx","%r8");	# Unix order
+
+$code.=<<___;
+.extern	OPENSSL_ia32cap_P
+.globl	bn_GF2m_mul_2x2
+.type	bn_GF2m_mul_2x2,\@abi-omnipotent
+.align	16
+bn_GF2m_mul_2x2:
+	mov	OPENSSL_ia32cap_P(%rip),%rax
+	bt	\$33,%rax
+	jnc	.Lvanilla_mul_2x2
+
+	movq		$a1,%xmm0
+	movq		$b1,%xmm1
+	movq		$a0,%xmm2
+___
+$code.=<<___ if ($win64);
+	movq		40(%rsp),%xmm3
+___
+$code.=<<___ if (!$win64);
+	movq		$b0,%xmm3
+___
+$code.=<<___;
+	movdqa		%xmm0,%xmm4
+	movdqa		%xmm1,%xmm5
+	pclmulqdq	\$0,%xmm1,%xmm0	# a1�b1
+	pxor		%xmm2,%xmm4
+	pxor		%xmm3,%xmm5
+	pclmulqdq	\$0,%xmm3,%xmm2	# a0�b0
+	pclmulqdq	\$0,%xmm5,%xmm4	# (a0+a1)�(b0+b1)
+	xorps		%xmm0,%xmm4
+	xorps		%xmm2,%xmm4	# (a0+a1)�(b0+b1)-a0�b0-a1�b1
+	movdqa		%xmm4,%xmm5
+	pslldq		\$8,%xmm4
+	psrldq		\$8,%xmm5
+	pxor		%xmm4,%xmm2
+	pxor		%xmm5,%xmm0
+	movdqu		%xmm2,0($rp)
+	movdqu		%xmm0,16($rp)
+	ret
+
+.align	16
+.Lvanilla_mul_2x2:
+	lea	-8*17(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	mov	`8*17+40`(%rsp),$b0
+	mov	%rdi,8*15(%rsp)
+	mov	%rsi,8*16(%rsp)
+___
+$code.=<<___;
+	mov	%r14,8*10(%rsp)
+	mov	%r13,8*11(%rsp)
+	mov	%r12,8*12(%rsp)
+	mov	%rbp,8*13(%rsp)
+	mov	%rbx,8*14(%rsp)
+.Lbody_mul_2x2:
+	mov	$rp,32(%rsp)		# save the arguments
+	mov	$a1,40(%rsp)
+	mov	$a0,48(%rsp)
+	mov	$b1,56(%rsp)
+	mov	$b0,64(%rsp)
+
+	mov	\$0xf,$mask
+	mov	$a1,$a
+	mov	$b1,$b
+	call	_mul_1x1		# a1�b1
+	mov	$lo,16(%rsp)
+	mov	$hi,24(%rsp)
+
+	mov	48(%rsp),$a
+	mov	64(%rsp),$b
+	call	_mul_1x1		# a0�b0
+	mov	$lo,0(%rsp)
+	mov	$hi,8(%rsp)
+
+	mov	40(%rsp),$a
+	mov	56(%rsp),$b
+	xor	48(%rsp),$a
+	xor	64(%rsp),$b
+	call	_mul_1x1		# (a0+a1)�(b0+b1)
+___
+	@r=("%rbx","%rcx","%rdi","%rsi");
+$code.=<<___;
+	mov	0(%rsp),@r[0]
+	mov	8(%rsp),@r[1]
+	mov	16(%rsp),@r[2]
+	mov	24(%rsp),@r[3]
+	mov	32(%rsp),%rbp
+
+	xor	$hi,$lo
+	xor	@r[1],$hi
+	xor	@r[0],$lo
+	mov	@r[0],0(%rbp)
+	xor	@r[2],$hi
+	mov	@r[3],24(%rbp)
+	xor	@r[3],$lo
+	xor	@r[3],$hi
+	xor	$hi,$lo
+	mov	$hi,16(%rbp)
+	mov	$lo,8(%rbp)
+
+	mov	8*10(%rsp),%r14
+	mov	8*11(%rsp),%r13
+	mov	8*12(%rsp),%r12
+	mov	8*13(%rsp),%rbp
+	mov	8*14(%rsp),%rbx
+___
+$code.=<<___ if ($win64);
+	mov	8*15(%rsp),%rdi
+	mov	8*16(%rsp),%rsi
+___
+$code.=<<___;
+	lea	8*17(%rsp),%rsp
+	ret
+.Lend_mul_2x2:
+.size	bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz	"GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	152($context),%rax	# pull context->Rsp
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lbody_mul_2x2(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<"prologue" label
+	jb	.Lin_prologue
+
+	mov	8*10(%rax),%r14		# mimic epilogue
+	mov	8*11(%rax),%r13
+	mov	8*12(%rax),%r12
+	mov	8*13(%rax),%rbp
+	mov	8*14(%rax),%rbx
+	mov	8*15(%rax),%rdi
+	mov	8*16(%rax),%rsi
+
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+
+.Lin_prologue:
+	lea	8*17(%rax),%rax
+	mov	%rax,152($context)	# restore context->Rsp
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	_mul_1x1
+	.rva	.Lend_mul_1x1
+	.rva	.LSEH_info_1x1
+
+	.rva	.Lvanilla_mul_2x2
+	.rva	.Lend_mul_2x2
+	.rva	.LSEH_info_2x2
+.section	.xdata
+.align	8
+.LSEH_info_1x1:
+	.byte	0x01,0x07,0x02,0x00
+	.byte	0x07,0x01,0x11,0x00	# sub rsp,128+8
+.LSEH_info_2x2:
+	.byte	9,0,0,0
+	.rva	se_handler
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
index 3b7a6f243f..5d79b35e1c 100755
--- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl
+++ b/src/lib/libcrypto/bn/asm/x86_64-mont.pl
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 
 # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
@@ -15,6 +15,20 @@
 # respectful 50%. It remains to be seen if loop unrolling and
 # dedicated squaring routine can provide further improvement...
 
+# July 2011.
+#
+# Add dedicated squaring procedure. Performance improvement varies
+# from platform to platform, but in average it's ~5%/15%/25%/33%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# August 2011.
+#
+# Unroll and modulo-schedule inner loops in such manner that they
+# are "fallen through" for input lengths of 8, which is critical for
+# 1024-bit RSA *sign*. Average performance improvement in comparison
+# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -37,7 +51,6 @@ $n0="%r8";	# const BN_ULONG *n0,
 $num="%r9";	# int num);
 $lo0="%r10";
 $hi0="%r11";
-$bp="%r12";	# reassign $bp
 $hi1="%r13";
 $i="%r14";
 $j="%r15";
@@ -51,6 +64,16 @@ $code=<<___;
 .type	bn_mul_mont,\@function,6
 .align	16
 bn_mul_mont:
+	test	\$3,${num}d
+	jnz	.Lmul_enter
+	cmp	\$8,${num}d
+	jb	.Lmul_enter
+	cmp	$ap,$bp
+	jne	.Lmul4x_enter
+	jmp	.Lsqr4x_enter
+
+.align	16
+.Lmul_enter:
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -66,48 +89,66 @@ bn_mul_mont:
 	and	\$-1024,%rsp		# minimize TLB usage
 
 	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
-.Lprologue:
-	mov	%rdx,$bp		# $bp reassigned, remember?
-
+.Lmul_body:
+	mov	$bp,%r12		# reassign $bp
+___
+		$bp="%r12";
+$code.=<<___;
 	mov	($n0),$n0		# pull n0[0] value
+	mov	($bp),$m0		# m0=bp[0]
+	mov	($ap),%rax
 
 	xor	$i,$i			# i=0
 	xor	$j,$j			# j=0
 
-	mov	($bp),$m0		# m0=bp[0]
-	mov	($ap),%rax
+	mov	$n0,$m1
 	mulq	$m0			# ap[0]*bp[0]
 	mov	%rax,$lo0
-	mov	%rdx,$hi0
+	mov	($np),%rax
 
-	imulq	$n0,%rax		# "tp[0]"*n0
-	mov	%rax,$m1
+	imulq	$lo0,$m1		# "tp[0]"*n0
+	mov	%rdx,$hi0
 
-	mulq	($np)			# np[0]*m1
-	add	$lo0,%rax		# discarded
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
 	adc	\$0,%rdx
 	mov	%rdx,$hi1
 
 	lea	1($j),$j		# j++
+	jmp	.L1st_enter
+
+.align	16
 .L1st:
+	add	%rax,$hi1
 	mov	($ap,$j,8),%rax
-	mulq	$m0			# ap[j]*bp[0]
-	add	$hi0,%rax
 	adc	\$0,%rdx
-	mov	%rax,$lo0
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
+	mov	$lo0,$hi0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.L1st_enter:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$hi0
 	mov	($np,$j,8),%rax
-	mov	%rdx,$hi0
+	adc	\$0,%rdx
+	lea	1($j),$j		# j++
+	mov	%rdx,$lo0
 
 	mulq	$m1			# np[j]*m1
-	add	$hi1,%rax
-	lea	1($j),$j		# j++
+	cmp	$num,$j
+	jne	.L1st
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
 	adc	\$0,%rdx
-	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[0]
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
 	adc	\$0,%rdx
-	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
-	cmp	$num,$j
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
 	mov	%rdx,$hi1
-	jl	.L1st
+	mov	$lo0,$hi0
 
 	xor	%rdx,%rdx
 	add	$hi0,$hi1
@@ -116,50 +157,64 @@ bn_mul_mont:
 	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
 
 	lea	1($i),$i		# i++
-.align	4
+	jmp	.Louter
+.align	16
 .Louter:
-	xor	$j,$j			# j=0
-
 	mov	($bp,$i,8),$m0		# m0=bp[i]
-	mov	($ap),%rax		# ap[0]
+	xor	$j,$j			# j=0
+	mov	$n0,$m1
+	mov	(%rsp),$lo0
 	mulq	$m0			# ap[0]*bp[i]
-	add	(%rsp),%rax		# ap[0]*bp[i]+tp[0]
+	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
 	adc	\$0,%rdx
-	mov	%rax,$lo0
-	mov	%rdx,$hi0
 
-	imulq	$n0,%rax		# tp[0]*n0
-	mov	%rax,$m1
+	imulq	$lo0,$m1		# tp[0]*n0
+	mov	%rdx,$hi0
 
-	mulq	($np,$j,8)		# np[0]*m1
-	add	$lo0,%rax		# discarded
-	mov	8(%rsp),$lo0		# tp[1]
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
 	adc	\$0,%rdx
+	mov	8(%rsp),$lo0		# tp[1]
 	mov	%rdx,$hi1
 
 	lea	1($j),$j		# j++
-.align	4
+	jmp	.Linner_enter
+
+.align	16
 .Linner:
+	add	%rax,$hi1
 	mov	($ap,$j,8),%rax
-	mulq	$m0			# ap[j]*bp[i]
-	add	$hi0,%rax
 	adc	\$0,%rdx
-	add	%rax,$lo0		# ap[j]*bp[i]+tp[j]
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	mov	(%rsp,$j,8),$lo0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.Linner_enter:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$hi0
 	mov	($np,$j,8),%rax
 	adc	\$0,%rdx
+	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
 	mov	%rdx,$hi0
+	adc	\$0,$hi0
+	lea	1($j),$j		# j++
 
 	mulq	$m1			# np[j]*m1
-	add	$hi1,%rax
-	lea	1($j),$j		# j++
-	adc	\$0,%rdx
-	add	$lo0,%rax		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	cmp	$num,$j
+	jne	.Linner
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
 	adc	\$0,%rdx
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
 	mov	(%rsp,$j,8),$lo0
-	cmp	$num,$j
-	mov	%rax,-16(%rsp,$j,8)	# tp[j-1]
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
 	mov	%rdx,$hi1
-	jl	.Linner
 
 	xor	%rdx,%rdx
 	add	$hi0,$hi1
@@ -173,35 +228,449 @@ bn_mul_mont:
 	cmp	$num,$i
 	jl	.Louter
 
-	lea	(%rsp),$ap		# borrow ap for tp
-	lea	-1($num),$j		# j=num-1
-
-	mov	($ap),%rax		# tp[0]
 	xor	$i,$i			# i=0 and clear CF!
+	mov	(%rsp),%rax		# tp[0]
+	lea	(%rsp),$ap		# borrow ap for tp
+	mov	$num,$j			# j=num
 	jmp	.Lsub
 .align	16
 .Lsub:	sbb	($np,$i,8),%rax
 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
-	dec	$j			# doesn't affect CF!
 	mov	8($ap,$i,8),%rax	# tp[i+1]
 	lea	1($i),$i		# i++
-	jge	.Lsub
+	dec	$j			# doesnn't affect CF!
+	jnz	.Lsub
 
 	sbb	\$0,%rax		# handle upmost overflow bit
+	xor	$i,$i
 	and	%rax,$ap
 	not	%rax
 	mov	$rp,$np
 	and	%rax,$np
-	lea	-1($num),$j
+	mov	$num,$j			# j=num
 	or	$np,$ap			# ap=borrow?tp:rp
 .align	16
 .Lcopy:					# copy or in-place refresh
+	mov	($ap,$i,8),%rax
+	mov	$i,(%rsp,$i,8)		# zap temporary vector
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+	lea	1($i),$i
+	sub	\$1,$j
+	jnz	.Lcopy
+
+	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+	mov	\$1,%rax
+	mov	(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lmul_epilogue:
+	ret
+.size	bn_mul_mont,.-bn_mul_mont
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type	bn_mul4x_mont,\@function,6
+.align	16
+bn_mul4x_mont:
+.Lmul4x_enter:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	${num}d,${num}d
+	lea	4($num),%r10
+	mov	%rsp,%r11
+	neg	%r10
+	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.Lmul4x_body:
+	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
+	mov	%rdx,%r12		# reassign $bp
+___
+		$bp="%r12";
+$code.=<<___;
+	mov	($n0),$n0		# pull n0[0] value
+	mov	($bp),$m0		# m0=bp[0]
+	mov	($ap),%rax
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$A[0]
+	mov	($np),%rax
+
+	imulq	$A[0],$m1		# "tp[0]"*n0
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0
+	add	%rax,$A[1]
+	mov	8($np),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1
+	add	%rax,$N[1]
+	mov	16($ap),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	lea	4($j),$j		# j++
+	adc	\$0,%rdx
+	mov	$N[1],(%rsp)
+	mov	%rdx,$N[0]
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
 	mov	($ap,$j,8),%rax
-	mov	%rax,($rp,$j,8)		# rp[i]=tp[i]
-	mov	$i,(%rsp,$j,8)		# zap temporary vector
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	8($np,$j,8),%rax
+	adc	\$0,%rdx
+	lea	4($j),$j		# j++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	-16($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+	cmp	$num,$j
+	jl	.L1st4x
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	mov	$N[0],-8(%rsp,$j,8)
+	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+.align	4
+.Louter4x:
+	mov	($bp,$i,8),$m0		# m0=bp[i]
+	xor	$j,$j			# j=0
+	mov	(%rsp),$A[0]
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[i]
+	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
+	adc	\$0,%rdx
+
+	imulq	$A[0],$m1		# tp[0]*n0
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# "$N[0]", discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	8($np),%rax
+	adc	\$0,%rdx
+	add	8(%rsp),$A[1]		# +tp[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	16($ap),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	lea	4($j),$j		# j+=2
+	adc	\$0,%rdx
+	mov	$N[1],(%rsp)		# tp[j-1]
+	mov	%rdx,$N[0]
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	lea	4($j),$j		# j++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	-16($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+	cmp	$num,$j
+	jl	.Linner4x
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	lea	1($i),$i		# i++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
+	adc	\$0,$N[1]
+	mov	$N[0],-8(%rsp,$j,8)
+	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
+
+	cmp	$num,$i
+	jl	.Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+	mov	16(%rsp,$num,8),$rp	# restore $rp
+	mov	0(%rsp),@ri[0]		# tp[0]
+	pxor	%xmm0,%xmm0
+	mov	8(%rsp),@ri[1]		# tp[1]
+	shr	\$2,$num		# num/=4
+	lea	(%rsp),$ap		# borrow ap for tp
+	xor	$i,$i			# i=0 and clear CF!
+
+	sub	0($np),@ri[0]
+	mov	16($ap),@ri[2]		# tp[2]
+	mov	24($ap),@ri[3]		# tp[3]
+	sbb	8($np),@ri[1]
+	lea	-1($num),$j		# j=num/4-1
+	jmp	.Lsub4x
+.align	16
+.Lsub4x:
+	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	16($np,$i,8),@ri[2]
+	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
+	mov	40($ap,$i,8),@ri[1]
+	sbb	24($np,$i,8),@ri[3]
+	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	32($np,$i,8),@ri[0]
+	mov	48($ap,$i,8),@ri[2]
+	mov	56($ap,$i,8),@ri[3]
+	sbb	40($np,$i,8),@ri[1]
+	lea	4($i),$i		# i++
+	dec	$j			# doesnn't affect CF!
+	jnz	.Lsub4x
+
+	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	32($ap,$i,8),@ri[0]	# load overflow bit
+	sbb	16($np,$i,8),@ri[2]
+	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	24($np,$i,8),@ri[3]
+	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
+
+	sbb	\$0,@ri[0]		# handle upmost overflow bit
+	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	xor	$i,$i			# i=0
+	and	@ri[0],$ap
+	not	@ri[0]
+	mov	$rp,$np
+	and	@ri[0],$np
+	lea	-1($num),$j
+	or	$np,$ap			# ap=borrow?tp:rp
+
+	movdqu	($ap),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,($rp)
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:					# copy or in-place refresh
+	movdqu	16($ap,$i),%xmm2
+	movdqu	32($ap,$i),%xmm1
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
+	movdqa	%xmm0,32(%rsp,$i)
+	movdqu	%xmm1,32($rp,$i)
+	lea	32($i),$i
 	dec	$j
-	jge	.Lcopy
+	jnz	.Lcopy4x
 
+	shl	\$2,$num
+	movdqu	16($ap,$i),%xmm2
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
 	mov	\$1,%rax
 	mov	(%rsi),%r15
@@ -211,9 +680,823 @@ bn_mul_mont:
 	mov	32(%rsi),%rbp
 	mov	40(%rsi),%rbx
 	lea	48(%rsi),%rsp
-.Lepilogue:
+.Lmul4x_epilogue:
 	ret
-.size	bn_mul_mont,.-bn_mul_mont
+.size	bn_mul4x_mont,.-bn_mul4x_mont
+___
+}}}
+{{{
+######################################################################
+# void bn_sqr4x_mont(
+my $rptr="%rdi";	# const BN_ULONG *rptr,
+my $aptr="%rsi";	# const BN_ULONG *aptr,
+my $bptr="%rdx";	# not used
+my $nptr="%rcx";	# const BN_ULONG *nptr,
+my $n0  ="%r8";		# const BN_ULONG *n0);
+my $num ="%r9";		# int num, has to be divisible by 4 and
+			# not less than 8
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.type	bn_sqr4x_mont,\@function,6
+.align	16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	shl	\$3,${num}d		# convert $num to bytes
+	xor	%r10,%r10
+	mov	%rsp,%r11		# put aside %rsp
+	sub	$num,%r10		# -$num
+	mov	($n0),$n0		# *n0
+	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
+	and	\$-1024,%rsp		# minimize TLB usage
+	##############################################################
+	# Stack layout
+	#
+	# +0	saved $num, used in reduction section
+	# +8	&t[2*$num], used in reduction section
+	# +32	saved $rptr
+	# +40	saved $nptr
+	# +48	saved *n0
+	# +56	saved %rsp
+	# +64	t[2*$num]
+	#
+	mov	$rptr,32(%rsp)		# save $rptr
+	mov	$nptr,40(%rsp)
+	mov	$n0,  48(%rsp)
+	mov	%r11, 56(%rsp)		# save original %rsp
+.Lsqr4x_body:
+	##############################################################
+	# Squaring part:
+	#
+	# a) multiply-n-add everything but a[i]*a[i];
+	# b) shift result of a) by 1 to the left and accumulate
+	#    a[i]*a[i] products;
+	#
+	lea	32(%r10),$i		# $i=-($num-32)
+	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
+
+	mov	$num,$j			# $j=$num
+
+					# comments apply to $num==8 case
+	mov	-32($aptr,$i),$a0	# a[0]
+	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
+	mov	-24($aptr,$i),%rax	# a[1]
+	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
+	mov	-16($aptr,$i),$ai	# a[2]
+	mov	%rax,$a1
+
+	mul	$a0			# a[1]*a[0]
+	mov	%rax,$A0[0]		# a[1]*a[0]
+	 mov	$ai,%rax		# a[2]
+	mov	%rdx,$A0[1]
+	mov	$A0[0],-24($tptr,$i)	# t[1]
+
+	xor	$A0[0],$A0[0]
+	mul	$a0			# a[2]*a[0]
+	add	%rax,$A0[1]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[0]
+	mov	$A0[1],-16($tptr,$i)	# t[2]
+
+	lea	-16($i),$j		# j=-16
+
+
+	 mov	8($aptr,$j),$ai		# a[3]
+	mul	$a1			# a[2]*a[1]
+	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	mov	%rdx,$A1[1]
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	 lea	16($j),$j
+	adc	\$0,$A0[1]
+	mul	$a0			# a[3]*a[0]
+	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-8($tptr,$j)	# t[3]
+	jmp	.Lsqr4x_1st
+
+.align	16
+.Lsqr4x_1st:
+	 mov	($aptr,$j),$ai		# a[4]
+	xor	$A1[0],$A1[0]
+	mul	$a1			# a[3]*a[1]
+	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[0]
+
+	xor	$A0[0],$A0[0]
+	add	$A1[1],$A0[1]
+	adc	\$0,$A0[0]
+	mul	$a0			# a[4]*a[0]
+	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
+	 mov	$ai,%rax		# a[3]
+	adc	%rdx,$A0[0]
+	mov	$A0[1],($tptr,$j)	# t[4]
+
+
+	 mov	8($aptr,$j),$ai		# a[5]
+	xor	$A1[1],$A1[1]
+	mul	$a1			# a[4]*a[3]
+	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[1]
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	adc	\$0,$A0[1]
+	mul	$a0			# a[5]*a[2]
+	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],8($tptr,$j)	# t[5]
+
+	 mov	16($aptr,$j),$ai	# a[6]
+	xor	$A1[0],$A1[0]
+	mul	$a1			# a[5]*a[3]
+	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[0]
+
+	xor	$A0[0],$A0[0]
+	add	$A1[1],$A0[1]
+	adc	\$0,$A0[0]
+	mul	$a0			# a[6]*a[2]
+	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
+	 mov	$ai,%rax		# a[3]
+	adc	%rdx,$A0[0]
+	mov	$A0[1],16($tptr,$j)	# t[6]
+
+
+	 mov	24($aptr,$j),$ai	# a[7]
+	xor	$A1[1],$A1[1]
+	mul	$a1			# a[6]*a[5]
+	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[1]
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	 lea	32($j),$j
+	adc	\$0,$A0[1]
+	mul	$a0			# a[7]*a[4]
+	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-8($tptr,$j)	# t[7]
+
+	cmp	\$0,$j
+	jne	.Lsqr4x_1st
+
+	xor	$A1[0],$A1[0]
+	add	$A0[1],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$a1			# a[7]*a[5]
+	add	%rax,$A1[1]
+	adc	%rdx,$A1[0]
+
+	mov	$A1[1],($tptr)		# t[8]
+	lea	16($i),$i
+	mov	$A1[0],8($tptr)		# t[9]
+	jmp	.Lsqr4x_outer
+
+.align	16
+.Lsqr4x_outer:				# comments apply to $num==6 case
+	mov	-32($aptr,$i),$a0	# a[0]
+	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
+	mov	-24($aptr,$i),%rax	# a[1]
+	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
+	mov	-16($aptr,$i),$ai	# a[2]
+	mov	%rax,$a1
+
+	mov	-24($tptr,$i),$A0[0]	# t[1]
+	xor	$A0[1],$A0[1]
+	mul	$a0			# a[1]*a[0]
+	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
+	 mov	$ai,%rax		# a[2]
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-24($tptr,$i)	# t[1]
+
+	xor	$A0[0],$A0[0]
+	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
+	adc	\$0,$A0[0]
+	mul	$a0			# a[2]*a[0]
+	add	%rax,$A0[1]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[0]
+	mov	$A0[1],-16($tptr,$i)	# t[2]
+
+	lea	-16($i),$j		# j=-16
+	xor	$A1[0],$A1[0]
+
+
+	 mov	8($aptr,$j),$ai		# a[3]
+	xor	$A1[1],$A1[1]
+	add	8($tptr,$j),$A1[0]
+	adc	\$0,$A1[1]
+	mul	$a1			# a[2]*a[1]
+	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[1]
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	adc	\$0,$A0[1]
+	mul	$a0			# a[3]*a[0]
+	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],8($tptr,$j)	# t[3]
+
+	lea	16($j),$j
+	jmp	.Lsqr4x_inner
+
+.align	16
+.Lsqr4x_inner:
+	 mov	($aptr,$j),$ai		# a[4]
+	xor	$A1[0],$A1[0]
+	add	($tptr,$j),$A1[1]
+	adc	\$0,$A1[0]
+	mul	$a1			# a[3]*a[1]
+	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[0]
+
+	xor	$A0[0],$A0[0]
+	add	$A1[1],$A0[1]
+	adc	\$0,$A0[0]
+	mul	$a0			# a[4]*a[0]
+	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
+	 mov	$ai,%rax		# a[3]
+	adc	%rdx,$A0[0]
+	mov	$A0[1],($tptr,$j)	# t[4]
+
+	 mov	8($aptr,$j),$ai		# a[5]
+	xor	$A1[1],$A1[1]
+	add	8($tptr,$j),$A1[0]
+	adc	\$0,$A1[1]
+	mul	$a1			# a[4]*a[3]
+	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
+	 mov	$ai,%rax
+	adc	%rdx,$A1[1]
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	lea	16($j),$j		# j++
+	adc	\$0,$A0[1]
+	mul	$a0			# a[5]*a[2]
+	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
+
+	cmp	\$0,$j
+	jne	.Lsqr4x_inner
+
+	xor	$A1[0],$A1[0]
+	add	$A0[1],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$a1			# a[5]*a[3]
+	add	%rax,$A1[1]
+	adc	%rdx,$A1[0]
+
+	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
+	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below
+
+	add	\$16,$i
+	jnz	.Lsqr4x_outer
+
+					# comments apply to $num==4 case
+	mov	-32($aptr),$a0		# a[0]
+	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
+	mov	-24($aptr),%rax		# a[1]
+	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
+	mov	-16($aptr),$ai		# a[2]
+	mov	%rax,$a1
+
+	xor	$A0[1],$A0[1]
+	mul	$a0			# a[1]*a[0]
+	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
+	 mov	$ai,%rax		# a[2]
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-24($tptr)	# t[1]
+
+	xor	$A0[0],$A0[0]
+	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
+	adc	\$0,$A0[0]
+	mul	$a0			# a[2]*a[0]
+	add	%rax,$A0[1]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[0]
+	mov	$A0[1],-16($tptr)	# t[2]
+
+	 mov	-8($aptr),$ai		# a[3]
+	mul	$a1			# a[2]*a[1]
+	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
+	 mov	$ai,%rax
+	adc	\$0,%rdx
+
+	xor	$A0[1],$A0[1]
+	add	$A1[0],$A0[0]
+	 mov	%rdx,$A1[1]
+	adc	\$0,$A0[1]
+	mul	$a0			# a[3]*a[0]
+	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	adc	%rdx,$A0[1]
+	mov	$A0[0],-8($tptr)	# t[3]
+
+	xor	$A1[0],$A1[0]
+	add	$A0[1],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$a1			# a[3]*a[1]
+	add	%rax,$A1[1]
+	 mov	-16($aptr),%rax		# a[2]
+	adc	%rdx,$A1[0]
+
+	mov	$A1[1],($tptr)		# t[4]
+	mov	$A1[0],8($tptr)		# t[5]
+
+	mul	$ai			# a[2]*a[3]
+___
+{
+my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
+$code.=<<___;
+	 add	\$16,$i
+	 xor	$shift,$shift
+	 sub	$num,$i			# $i=16-$num
+	 xor	$carry,$carry
+
+	add	$A1[0],%rax		# t[5]
+	adc	\$0,%rdx
+	mov	%rax,8($tptr)		# t[5]
+	mov	%rdx,16($tptr)		# t[6]
+	mov	$carry,24($tptr)	# t[7]
+
+	 mov	-16($aptr,$i),%rax	# a[0]
+	lea	64(%rsp,$num,2),$tptr
+	 xor	$A0[0],$A0[0]		# t[0]
+	 mov	-24($tptr,$i,2),$A0[1]	# t[1]
+
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[0],-32($tptr,$i,2)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
+	 mov	$S[1],-24($tptr,$i,2)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[2]
+	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[2],-16($tptr,$i,2)
+	adc	%rdx,$S[3]
+	lea	16($i),$i
+	mov	$S[3],-40($tptr,$i,2)
+	sbb	$carry,$carry		# mov cf,$carry
+	jmp	.Lsqr4x_shift_n_add
+
+.align	16
+.Lsqr4x_shift_n_add:
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[0],-32($tptr,$i,2)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
+	 mov	$S[1],-24($tptr,$i,2)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[2]
+	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[2],-16($tptr,$i,2)
+	adc	%rdx,$S[3]
+
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	 mov	$S[3],-8($tptr,$i,2)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[0],0($tptr,$i,2)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
+	 mov	$S[1],8($tptr,$i,2)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	 mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[2]
+	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[2],16($tptr,$i,2)
+	adc	%rdx,$S[3]
+	mov	$S[3],24($tptr,$i,2)
+	sbb	$carry,$carry		# mov cf,$carry
+	add	\$32,$i
+	jnz	.Lsqr4x_shift_n_add
+
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
+	mov	$S[0],-32($tptr)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
+	 mov	$S[1],-24($tptr)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	adc	%rax,$S[2]
+	adc	%rdx,$S[3]
+	mov	$S[2],-16($tptr)
+	mov	$S[3],-8($tptr)
+___
+}
+##############################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
+{
+my ($topbit,$nptr)=("%rbp",$aptr);
+my ($m0,$m1)=($a0,$a1);
+my @Ni=("%rbx","%r9");
+$code.=<<___;
+	mov	40(%rsp),$nptr		# restore $nptr
+	mov	48(%rsp),$n0		# restore *n0
+	xor	$j,$j
+	mov	$num,0(%rsp)		# save $num
+	sub	$num,$j			# $j=-$num
+	 mov	64(%rsp),$A0[0]		# t[0]		# modsched #
+	 mov	$n0,$m0			#		# modsched #
+	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
+	lea	64(%rsp,$num),$tptr	# end of t[] window
+	mov	%rax,8(%rsp)		# save end of t[] buffer
+	lea	($nptr,$num),$nptr	# end of n[] buffer
+	xor	$topbit,$topbit		# $topbit=0
+
+	mov	0($nptr,$j),%rax	# n[0]		# modsched #
+	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
+	 imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
+	 mov	%rax,$Ni[0]		#		# modsched #
+	jmp	.Lsqr4x_mont_outer
+
+.align	16
+.Lsqr4x_mont_outer:
+	xor	$A0[1],$A0[1]
+	mul	$m0			# n[0]*m0
+	add	%rax,$A0[0]		# n[0]*m0+t[0]
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A0[1]
+	mov	$n0,$m1
+
+	xor	$A0[0],$A0[0]
+	add	8($tptr,$j),$A0[1]
+	adc	\$0,$A0[0]
+	mul	$m0			# n[1]*m0
+	add	%rax,$A0[1]		# n[1]*m0+t[1]
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A0[0]
+
+	imulq	$A0[1],$m1
+
+	mov	16($nptr,$j),$Ni[0]	# n[2]
+	xor	$A1[1],$A1[1]
+	add	$A0[1],$A1[0]
+	adc	\$0,$A1[1]
+	mul	$m1			# n[0]*m1
+	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A1[1]
+	mov	$A1[0],8($tptr,$j)	# "t[1]"
+
+	xor	$A0[1],$A0[1]
+	add	16($tptr,$j),$A0[0]
+	adc	\$0,$A0[1]
+	mul	$m0			# n[2]*m0
+	add	%rax,$A0[0]		# n[2]*m0+t[2]
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A0[1]
+
+	mov	24($nptr,$j),$Ni[1]	# n[3]
+	xor	$A1[0],$A1[0]
+	add	$A0[0],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$m1			# n[1]*m1
+	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A1[0]
+	mov	$A1[1],16($tptr,$j)	# "t[2]"
+
+	xor	$A0[0],$A0[0]
+	add	24($tptr,$j),$A0[1]
+	lea	32($j),$j
+	adc	\$0,$A0[0]
+	mul	$m0			# n[3]*m0
+	add	%rax,$A0[1]		# n[3]*m0+t[3]
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A0[0]
+	jmp	.Lsqr4x_mont_inner
+
+.align	16
+.Lsqr4x_mont_inner:
+	mov	($nptr,$j),$Ni[0]	# n[4]
+	xor	$A1[1],$A1[1]
+	add	$A0[1],$A1[0]
+	adc	\$0,$A1[1]
+	mul	$m1			# n[2]*m1
+	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A1[1]
+	mov	$A1[0],-8($tptr,$j)	# "t[3]"
+
+	xor	$A0[1],$A0[1]
+	add	($tptr,$j),$A0[0]
+	adc	\$0,$A0[1]
+	mul	$m0			# n[4]*m0
+	add	%rax,$A0[0]		# n[4]*m0+t[4]
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A0[1]
+
+	mov	8($nptr,$j),$Ni[1]	# n[5]
+	xor	$A1[0],$A1[0]
+	add	$A0[0],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$m1			# n[3]*m1
+	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A1[0]
+	mov	$A1[1],($tptr,$j)	# "t[4]"
+
+	xor	$A0[0],$A0[0]
+	add	8($tptr,$j),$A0[1]
+	adc	\$0,$A0[0]
+	mul	$m0			# n[5]*m0
+	add	%rax,$A0[1]		# n[5]*m0+t[5]
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A0[0]
+
+
+	mov	16($nptr,$j),$Ni[0]	# n[6]
+	xor	$A1[1],$A1[1]
+	add	$A0[1],$A1[0]
+	adc	\$0,$A1[1]
+	mul	$m1			# n[4]*m1
+	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A1[1]
+	mov	$A1[0],8($tptr,$j)	# "t[5]"
+
+	xor	$A0[1],$A0[1]
+	add	16($tptr,$j),$A0[0]
+	adc	\$0,$A0[1]
+	mul	$m0			# n[6]*m0
+	add	%rax,$A0[0]		# n[6]*m0+t[6]
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A0[1]
+
+	mov	24($nptr,$j),$Ni[1]	# n[7]
+	xor	$A1[0],$A1[0]
+	add	$A0[0],$A1[1]
+	adc	\$0,$A1[0]
+	mul	$m1			# n[5]*m1
+	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
+	 mov	$Ni[1],%rax
+	adc	%rdx,$A1[0]
+	mov	$A1[1],16($tptr,$j)	# "t[6]"
+
+	xor	$A0[0],$A0[0]
+	add	24($tptr,$j),$A0[1]
+	lea	32($j),$j
+	adc	\$0,$A0[0]
+	mul	$m0			# n[7]*m0
+	add	%rax,$A0[1]		# n[7]*m0+t[7]
+	 mov	$Ni[0],%rax
+	adc	%rdx,$A0[0]
+	cmp	\$0,$j
+	jne	.Lsqr4x_mont_inner
+
+	 sub	0(%rsp),$j		# $j=-$num	# modsched #
+	 mov	$n0,$m0			#		# modsched #
+
+	xor	$A1[1],$A1[1]
+	add	$A0[1],$A1[0]
+	adc	\$0,$A1[1]
+	mul	$m1			# n[6]*m1
+	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
+	mov	$Ni[1],%rax
+	adc	%rdx,$A1[1]
+	mov	$A1[0],-8($tptr)	# "t[7]"
+
+	xor	$A0[1],$A0[1]
+	add	($tptr),$A0[0]		# +t[8]
+	adc	\$0,$A0[1]
+	 mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
+	add	$topbit,$A0[0]
+	adc	\$0,$A0[1]
+
+	 imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
+	xor	$A1[0],$A1[0]
+	 mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
+	add	$A0[0],$A1[1]
+	 mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
+	adc	\$0,$A1[0]
+	mul	$m1			# n[7]*m1
+	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
+	 mov	$Ni[0],%rax		#		# modsched #
+	adc	%rdx,$A1[0]
+	mov	$A1[1],($tptr)		# "t[8]"
+
+	xor	$topbit,$topbit
+	add	8($tptr),$A1[0]		# +t[9]
+	adc	$topbit,$topbit
+	add	$A0[1],$A1[0]
+	lea	16($tptr),$tptr		# "t[$num]>>128"
+	adc	\$0,$topbit
+	mov	$A1[0],-8($tptr)	# "t[9]"
+	cmp	8(%rsp),$tptr		# are we done?
+	jb	.Lsqr4x_mont_outer
+
+	mov	0(%rsp),$num		# restore $num
+	mov	$topbit,($tptr)		# save $topbit
+___
+}
+##############################################################
+# Post-condition, 4x unrolled copy from bn_mul_mont
+#
+{
+my ($tptr,$nptr)=("%rbx",$aptr);
+my @ri=("%rax","%rdx","%r10","%r11");
+$code.=<<___;
+	mov	64(%rsp,$num),@ri[0]	# tp[0]
+	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
+	mov	40(%rsp),$nptr		# restore $nptr
+	shr	\$5,$num		# num/4
+	mov	8($tptr),@ri[1]		# t[1]
+	xor	$i,$i			# i=0 and clear CF!
+
+	mov	32(%rsp),$rptr		# restore $rptr
+	sub	0($nptr),@ri[0]
+	mov	16($tptr),@ri[2]	# t[2]
+	mov	24($tptr),@ri[3]	# t[3]
+	sbb	8($nptr),@ri[1]
+	lea	-1($num),$j		# j=num/4-1
+	jmp	.Lsqr4x_sub
+.align	16
+.Lsqr4x_sub:
+	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	16($nptr,$i,8),@ri[2]
+	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
+	mov	40($tptr,$i,8),@ri[1]
+	sbb	24($nptr,$i,8),@ri[3]
+	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	32($nptr,$i,8),@ri[0]
+	mov	48($tptr,$i,8),@ri[2]
+	mov	56($tptr,$i,8),@ri[3]
+	sbb	40($nptr,$i,8),@ri[1]
+	lea	4($i),$i		# i++
+	dec	$j			# doesn't affect CF!
+	jnz	.Lsqr4x_sub
+
+	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
+	sbb	16($nptr,$i,8),@ri[2]
+	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	24($nptr,$i,8),@ri[3]
+	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+
+	sbb	\$0,@ri[0]		# handle upmost overflow bit
+	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
+	xor	$i,$i			# i=0
+	and	@ri[0],$tptr
+	not	@ri[0]
+	mov	$rptr,$nptr
+	and	@ri[0],$nptr
+	lea	-1($num),$j
+	or	$nptr,$tptr		# tp=borrow?tp:rp
+
+	pxor	%xmm0,%xmm0
+	lea	64(%rsp,$num,8),$nptr
+	movdqu	($tptr),%xmm1
+	lea	($nptr,$num,8),$nptr
+	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
+	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
+	movdqu	%xmm1,($rptr)
+	jmp	.Lsqr4x_copy
+.align	16
+.Lsqr4x_copy:				# copy or in-place refresh
+	movdqu	16($tptr,$i),%xmm2
+	movdqu	32($tptr,$i),%xmm1
+	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
+	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
+	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
+	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
+	movdqu	%xmm2,16($rptr,$i)
+	movdqu	%xmm1,32($rptr,$i)
+	lea	32($i),$i
+	dec	$j
+	jnz	.Lsqr4x_copy
+
+	movdqu	16($tptr,$i),%xmm2
+	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
+	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
+	movdqu	%xmm2,16($rptr,$i)
+___
+}
+$code.=<<___;
+	mov	56(%rsp),%rsi		# restore %rsp
+	mov	\$1,%rax
+	mov	0(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lsqr4x_epilogue:
+	ret
+.size	bn_sqr4x_mont,.-bn_sqr4x_mont
+___
+}}}
+$code.=<<___;
 .asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 .align	16
 ___
@@ -228,9 +1511,9 @@ $disp="%r9";
 
 $code.=<<___;
 .extern	__imp_RtlVirtualUnwind
-.type	se_handler,\@abi-omnipotent
+.type	mul_handler,\@abi-omnipotent
 .align	16
-se_handler:
+mul_handler:
 	push	%rsi
 	push	%rdi
 	push	%rbx
@@ -245,15 +1528,20 @@ se_handler:
 	mov	120($context),%rax	# pull context->Rax
 	mov	248($context),%rbx	# pull context->Rip
 
-	lea	.Lprologue(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip<.Lprologue
-	jb	.Lin_prologue
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail
 
 	mov	152($context),%rax	# pull context->Rsp
 
-	lea	.Lepilogue(%rip),%r10
-	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
-	jae	.Lin_prologue
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
 
 	mov	192($context),%r10	# pull $num
 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
@@ -272,7 +1560,53 @@ se_handler:
 	mov	%r14,232($context)	# restore context->R14
 	mov	%r15,240($context)	# restore context->R15
 
-.Lin_prologue:
+	jmp	.Lcommon_seh_tail
+.size	mul_handler,.-mul_handler
+
+.type	sqr_handler,\@abi-omnipotent
+.align	16
+sqr_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lsqr4x_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lsqr4x_epilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
+	jae	.Lcommon_seh_tail
+
+	mov	56(%rax),%rax		# pull saved stack pointer
+	lea	48(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:
 	mov	8(%rax),%rdi
 	mov	16(%rax),%rsi
 	mov	%rax,152($context)	# restore context->Rsp
@@ -310,7 +1644,7 @@ se_handler:
 	pop	%rdi
 	pop	%rsi
 	ret
-.size	se_handler,.-se_handler
+.size	sqr_handler,.-sqr_handler
 
 .section	.pdata
 .align	4
@@ -318,11 +1652,27 @@ se_handler:
 	.rva	.LSEH_end_bn_mul_mont
 	.rva	.LSEH_info_bn_mul_mont
 
+	.rva	.LSEH_begin_bn_mul4x_mont
+	.rva	.LSEH_end_bn_mul4x_mont
+	.rva	.LSEH_info_bn_mul4x_mont
+
+	.rva	.LSEH_begin_bn_sqr4x_mont
+	.rva	.LSEH_end_bn_sqr4x_mont
+	.rva	.LSEH_info_bn_sqr4x_mont
+
 .section	.xdata
 .align	8
 .LSEH_info_bn_mul_mont:
 	.byte	9,0,0,0
-	.rva	se_handler
+	.rva	mul_handler
+	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
+.LSEH_info_bn_mul4x_mont:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
+.LSEH_info_bn_sqr4x_mont:
+	.byte	9,0,0,0
+	.rva	sqr_handler
 ___
 }
 
diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
new file mode 100755
index 0000000000..057cda28aa
--- /dev/null
+++ b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl
@@ -0,0 +1,1070 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# August 2011.
+#
+# Companion to x86_64-mont.pl that optimizes cache-timing attack
+# countermeasures. The subroutines are produced by replacing bp[i]
+# references in their x86_64-mont.pl counterparts with cache-neutral
+# references to powers table computed in BN_mod_exp_mont_consttime.
+# In addition subroutine that scatters elements of the powers table
+# is implemented, so that scatter-/gathering can be tuned without
+# bn_exp.c modifications.
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# int bn_mul_mont_gather5(
+$rp="%rdi";	# BN_ULONG *rp,
+$ap="%rsi";	# const BN_ULONG *ap,
+$bp="%rdx";	# const BN_ULONG *bp,
+$np="%rcx";	# const BN_ULONG *np,
+$n0="%r8";	# const BN_ULONG *n0,
+$num="%r9";	# int num,
+		# int idx);	# 0 to 2^5-1, "index" in $bp holding
+				# pre-computed powers of a', interlaced
+				# in such manner that b[0] is $bp[idx],
+				# b[1] is [2^5+idx], etc.
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl	bn_mul_mont_gather5
+.type	bn_mul_mont_gather5,\@function,6
+.align	64
+bn_mul_mont_gather5:
+	test	\$3,${num}d
+	jnz	.Lmul_enter
+	cmp	\$8,${num}d
+	jb	.Lmul_enter
+	jmp	.Lmul4x_enter
+
+.align	16
+.Lmul_enter:
+	mov	${num}d,${num}d
+	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0x28(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+.Lmul_alloca:
+___
+$code.=<<___;
+	mov	%rsp,%rax
+	lea	2($num),%r11
+	neg	%r11
+	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.Lmul_body:
+	mov	$bp,%r12		# reassign $bp
+___
+		$bp="%r12";
+		$STRIDE=2**5*8;		# 5 is "window size"
+		$N=$STRIDE/4;		# should match cache line size
+$code.=<<___;
+	mov	%r10,%r11
+	shr	\$`log($N/8)/log(2)`,%r10
+	and	\$`$N/8-1`,%r11
+	not	%r10
+	lea	.Lmagic_masks(%rip),%rax
+	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
+	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
+	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
+	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
+	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
+	movq	24(%rax,%r10,8),%xmm7
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	movq	%xmm0,$m0		# m0=bp[0]
+
+	mov	($n0),$n0		# pull n0[0] value
+	mov	($ap),%rax
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$lo0
+	mov	($np),%rax
+
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	$lo0,$m1		# "tp[0]"*n0
+	mov	%rdx,$hi0
+
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	add	%rax,$hi1
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
+	mov	$lo0,$hi0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.L1st_enter:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$hi0
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	lea	1($j),$j		# j++
+	mov	%rdx,$lo0
+
+	mulq	$m1			# np[j]*m1
+	cmp	$num,$j
+	jne	.L1st
+
+	movq	%xmm0,$m0		# bp[1]
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+	mov	$lo0,$hi0
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+	jmp	.Louter
+.align	16
+.Louter:
+	xor	$j,$j			# j=0
+	mov	$n0,$m1
+	mov	(%rsp),$lo0
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+
+	mulq	$m0			# ap[0]*bp[i]
+	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
+	adc	\$0,%rdx
+
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	$lo0,$m1		# tp[0]*n0
+	mov	%rdx,$hi0
+
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	8(%rsp),$lo0		# tp[1]
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	add	%rax,$hi1
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	mov	(%rsp,$j,8),$lo0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.Linner_enter:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$hi0
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
+	mov	%rdx,$hi0
+	adc	\$0,$hi0
+	lea	1($j),$j		# j++
+
+	mulq	$m1			# np[j]*m1
+	cmp	$num,$j
+	jne	.Linner
+
+	movq	%xmm0,$m0		# bp[i+1]
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	mov	(%rsp,$j,8),$lo0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# pull upmost overflow bit
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+	cmp	$num,$i
+	jl	.Louter
+
+	xor	$i,$i			# i=0 and clear CF!
+	mov	(%rsp),%rax		# tp[0]
+	lea	(%rsp),$ap		# borrow ap for tp
+	mov	$num,$j			# j=num
+	jmp	.Lsub
+.align	16
+.Lsub:	sbb	($np,$i,8),%rax
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
+	mov	8($ap,$i,8),%rax	# tp[i+1]
+	lea	1($i),$i		# i++
+	dec	$j			# doesnn't affect CF!
+	jnz	.Lsub
+
+	sbb	\$0,%rax		# handle upmost overflow bit
+	xor	$i,$i
+	and	%rax,$ap
+	not	%rax
+	mov	$rp,$np
+	and	%rax,$np
+	mov	$num,$j			# j=num
+	or	$np,$ap			# ap=borrow?tp:rp
+.align	16
+.Lcopy:					# copy or in-place refresh
+	mov	($ap,$i,8),%rax
+	mov	$i,(%rsp,$i,8)		# zap temporary vector
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+	lea	1($i),$i
+	sub	\$1,$j
+	jnz	.Lcopy
+
+	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+	mov	\$1,%rax
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsi),%xmm6
+	movaps	0x10(%rsi),%xmm7
+	lea	0x28(%rsi),%rsi
+___
+$code.=<<___;
+	mov	(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lmul_epilogue:
+	ret
+.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type	bn_mul4x_mont_gather5,\@function,6
+.align	16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+	mov	${num}d,${num}d
+	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0x28(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+.Lmul4x_alloca:
+___
+$code.=<<___;
+	mov	%rsp,%rax
+	lea	4($num),%r11
+	neg	%r11
+	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+4))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.Lmul4x_body:
+	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
+	mov	%rdx,%r12		# reassign $bp
+___
+		$bp="%r12";
+		$STRIDE=2**5*8;		# 5 is "window size"
+		$N=$STRIDE/4;		# should match cache line size
+$code.=<<___;
+	mov	%r10,%r11
+	shr	\$`log($N/8)/log(2)`,%r10
+	and	\$`$N/8-1`,%r11
+	not	%r10
+	lea	.Lmagic_masks(%rip),%rax
+	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
+	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
+	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
+	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
+	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
+	movq	24(%rax,%r10,8),%xmm7
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	movq	%xmm0,$m0		# m0=bp[0]
+	mov	($n0),$n0		# pull n0[0] value
+	mov	($ap),%rax
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$A[0]
+	mov	($np),%rax
+
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	$A[0],$m1		# "tp[0]"*n0
+	mov	%rdx,$A[1]
+
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0
+	add	%rax,$A[1]
+	mov	8($np),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1
+	add	%rax,$N[1]
+	mov	16($ap),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	lea	4($j),$j		# j++
+	adc	\$0,%rdx
+	mov	$N[1],(%rsp)
+	mov	%rdx,$N[0]
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	8($np,$j,8),%rax
+	adc	\$0,%rdx
+	lea	4($j),$j		# j++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	-16($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+	cmp	$num,$j
+	jl	.L1st4x
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	movq	%xmm0,$m0		# bp[1]
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	mov	$N[0],-8(%rsp,$j,8)
+	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+.align	4
+.Louter4x:
+	xor	$j,$j			# j=0
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+
+	mov	(%rsp),$A[0]
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[i]
+	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
+	adc	\$0,%rdx
+
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	$A[0],$m1		# tp[0]*n0
+	mov	%rdx,$A[1]
+
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# "$N[0]", discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	8($np),%rax
+	adc	\$0,%rdx
+	add	8(%rsp),$A[1]		# +tp[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	16($ap),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	lea	4($j),$j		# j+=2
+	adc	\$0,%rdx
+	mov	%rdx,$N[0]
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	lea	4($j),$j		# j++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	-16($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[0],-40(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+	cmp	$num,$j
+	jl	.Linner4x
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	lea	1($i),$i		# i++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	movq	%xmm0,$m0		# bp[i+1]
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
+	adc	\$0,$N[1]
+	mov	$N[0],-8(%rsp,$j,8)
+	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
+
+	cmp	$num,$i
+	jl	.Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+	mov	16(%rsp,$num,8),$rp	# restore $rp
+	mov	0(%rsp),@ri[0]		# tp[0]
+	pxor	%xmm0,%xmm0
+	mov	8(%rsp),@ri[1]		# tp[1]
+	shr	\$2,$num		# num/=4
+	lea	(%rsp),$ap		# borrow ap for tp
+	xor	$i,$i			# i=0 and clear CF!
+
+	sub	0($np),@ri[0]
+	mov	16($ap),@ri[2]		# tp[2]
+	mov	24($ap),@ri[3]		# tp[3]
+	sbb	8($np),@ri[1]
+	lea	-1($num),$j		# j=num/4-1
+	jmp	.Lsub4x
+.align	16
+.Lsub4x:
+	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	16($np,$i,8),@ri[2]
+	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
+	mov	40($ap,$i,8),@ri[1]
+	sbb	24($np,$i,8),@ri[3]
+	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	32($np,$i,8),@ri[0]
+	mov	48($ap,$i,8),@ri[2]
+	mov	56($ap,$i,8),@ri[3]
+	sbb	40($np,$i,8),@ri[1]
+	lea	4($i),$i		# i++
+	dec	$j			# doesnn't affect CF!
+	jnz	.Lsub4x
+
+	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	32($ap,$i,8),@ri[0]	# load overflow bit
+	sbb	16($np,$i,8),@ri[2]
+	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	24($np,$i,8),@ri[3]
+	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
+
+	sbb	\$0,@ri[0]		# handle upmost overflow bit
+	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	xor	$i,$i			# i=0
+	and	@ri[0],$ap
+	not	@ri[0]
+	mov	$rp,$np
+	and	@ri[0],$np
+	lea	-1($num),$j
+	or	$np,$ap			# ap=borrow?tp:rp
+
+	movdqu	($ap),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,($rp)
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:					# copy or in-place refresh
+	movdqu	16($ap,$i),%xmm2
+	movdqu	32($ap,$i),%xmm1
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
+	movdqa	%xmm0,32(%rsp,$i)
+	movdqu	%xmm1,32($rp,$i)
+	lea	32($i),$i
+	dec	$j
+	jnz	.Lcopy4x
+
+	shl	\$2,$num
+	movdqu	16($ap,$i),%xmm2
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
+	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+	mov	\$1,%rax
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsi),%xmm6
+	movaps	0x10(%rsi),%xmm7
+	lea	0x28(%rsi),%rsi
+___
+$code.=<<___;
+	mov	(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lmul4x_epilogue:
+	ret
+.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+___
+}}}
+
+{
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+				("%rdi","%rsi","%rdx","%rcx"); # Unix order
+my $out=$inp;
+my $STRIDE=2**5*8;
+my $N=$STRIDE/4;
+
+$code.=<<___;
+.globl	bn_scatter5
+.type	bn_scatter5,\@abi-omnipotent
+.align	16
+bn_scatter5:
+	cmp	\$0, $num
+	jz	.Lscatter_epilogue
+	lea	($tbl,$idx,8),$tbl
+.Lscatter:
+	mov	($inp),%rax
+	lea	8($inp),$inp
+	mov	%rax,($tbl)
+	lea	32*8($tbl),$tbl
+	sub	\$1,$num
+	jnz	.Lscatter
+.Lscatter_epilogue:
+	ret
+.size	bn_scatter5,.-bn_scatter5
+
+.globl	bn_gather5
+.type	bn_gather5,\@abi-omnipotent
+.align	16
+bn_gather5:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_bn_gather5:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
+	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
+	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
+___
+$code.=<<___;
+	mov	$idx,%r11
+	shr	\$`log($N/8)/log(2)`,$idx
+	and	\$`$N/8-1`,%r11
+	not	$idx
+	lea	.Lmagic_masks(%rip),%rax
+	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
+	lea	96($tbl,%r11,8),$tbl	# pointer within 1st cache line
+	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
+	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
+	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
+	movq	24(%rax,$idx,8),%xmm7
+	jmp	.Lgather
+.align	16
+.Lgather:
+	movq	`0*$STRIDE/4-96`($tbl),%xmm0
+	movq	`1*$STRIDE/4-96`($tbl),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($tbl),%xmm2
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-96`($tbl),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	lea	$STRIDE($tbl),$tbl
+	por	%xmm3,%xmm0
+
+	movq	%xmm0,($out)		# m0=bp[0]
+	lea	8($out),$out
+	sub	\$1,$num
+	jnz	.Lgather
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+	lea	0x28(%rsp),%rsp
+___
+$code.=<<___;
+	ret
+.LSEH_end_bn_gather5:
+.size	bn_gather5,.-bn_gather5
+___
+}
+$code.=<<___;
+.align	64
+.Lmagic_masks:
+	.long	0,0, 0,0, 0,0, -1,-1
+	.long	0,0, 0,0, 0,0,  0,0
+.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	mul_handler,\@abi-omnipotent
+.align	16
+mul_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail
+
+	lea	`40+48`(%rax),%rax
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# end of alloca label
+	cmp	%r10,%rbx		# context->Rip<end of alloca label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	8(%r11),%r10d		# HandlerData[2]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	192($context),%r10	# pull $num
+	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
+
+	movaps	(%rax),%xmm0
+	movaps	16(%rax),%xmm1
+	lea	`40+48`(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+	movups	%xmm0,512($context)	# restore context->Xmm6
+	movups	%xmm1,528($context)	# restore context->Xmm7
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	mul_handler,.-mul_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_bn_mul_mont_gather5
+	.rva	.LSEH_end_bn_mul_mont_gather5
+	.rva	.LSEH_info_bn_mul_mont_gather5
+
+	.rva	.LSEH_begin_bn_mul4x_mont_gather5
+	.rva	.LSEH_end_bn_mul4x_mont_gather5
+	.rva	.LSEH_info_bn_mul4x_mont_gather5
+
+	.rva	.LSEH_begin_bn_gather5
+	.rva	.LSEH_end_bn_gather5
+	.rva	.LSEH_info_bn_gather5
+
+.section	.xdata
+.align	8
+.LSEH_info_bn_mul_mont_gather5:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
+.align	8
+.LSEH_info_bn_mul4x_mont_gather5:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
+.align	8
+.LSEH_info_bn_gather5:
+        .byte   0x01,0x0d,0x05,0x00
+        .byte   0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
+        .byte   0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
+        .byte   0x04,0x42,0x00,0x00	#sub	rsp,0x28
+.align	8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/bn/bn.h b/src/lib/libcrypto/bn/bn.h
index a0bc47837d..f34248ec4f 100644
--- a/src/lib/libcrypto/bn/bn.h
+++ b/src/lib/libcrypto/bn/bn.h
@@ -558,6 +558,17 @@ int	BN_is_prime_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx, BN_GENCB *cb);
 int	BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx,
 		int do_trial_division, BN_GENCB *cb);
 
+int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx);
+
+int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+			const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2,
+			const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb);
+int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2,
+			BIGNUM *Xp1, BIGNUM *Xp2,
+			const BIGNUM *Xp,
+			const BIGNUM *e, BN_CTX *ctx,
+			BN_GENCB *cb);
+
 BN_MONT_CTX *BN_MONT_CTX_new(void );
 void BN_MONT_CTX_init(BN_MONT_CTX *ctx);
 int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b,
@@ -612,6 +623,8 @@ int	BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
 int	BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
 	BN_RECP_CTX *recp, BN_CTX *ctx);
 
+#ifndef OPENSSL_NO_EC2M
+
 /* Functions for arithmetic over binary polynomials represented by BIGNUMs. 
  *
  * The BIGNUM::neg property of BIGNUMs representing binary polynomials is
@@ -663,6 +676,8 @@ int	BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a,
 int	BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max);
 int	BN_GF2m_arr2poly(const int p[], BIGNUM *a);
 
+#endif
+
 /* faster mod functions for the 'NIST primes' 
  * 0 <= a < p^2 */
 int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
diff --git a/src/lib/libcrypto/bn/bn_div.c b/src/lib/libcrypto/bn/bn_div.c
index 802a43d642..52b3304293 100644
--- a/src/lib/libcrypto/bn/bn_div.c
+++ b/src/lib/libcrypto/bn/bn_div.c
@@ -169,15 +169,13 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d,
 #endif /* OPENSSL_NO_ASM */
 
 
-/* BN_div[_no_branch] computes  dv := num / divisor,  rounding towards
+/* BN_div computes  dv := num / divisor,  rounding towards
  * zero, and sets up rm  such that  dv*divisor + rm = num  holds.
  * Thus:
  *     dv->neg == num->neg ^ divisor->neg  (unless the result is zero)
  *     rm->neg == num->neg                 (unless the remainder is zero)
  * If 'dv' or 'rm' is NULL, the respective value is not returned.
  */
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
-        const BIGNUM *divisor, BN_CTX *ctx);
 int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 	   BN_CTX *ctx)
 	{
@@ -186,6 +184,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 	BN_ULONG *resp,*wnump;
 	BN_ULONG d0,d1;
 	int num_n,div_n;
+	int no_branch=0;
 
 	/* Invalid zero-padding would have particularly bad consequences
 	 * in the case of 'num', so don't just rely on bn_check_top() for this one
@@ -200,7 +199,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 
 	if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0))
 		{
-		return BN_div_no_branch(dv, rm, num, divisor, ctx);
+		no_branch=1;
 		}
 
 	bn_check_top(dv);
@@ -214,7 +213,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 		return(0);
 		}
 
-	if (BN_ucmp(num,divisor) < 0)
+	if (!no_branch && BN_ucmp(num,divisor) < 0)
 		{
 		if (rm != NULL)
 			{ if (BN_copy(rm,num) == NULL) return(0); }
@@ -239,242 +238,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
 	norm_shift+=BN_BITS2;
 	if (!(BN_lshift(snum,num,norm_shift))) goto err;
 	snum->neg=0;
-	div_n=sdiv->top;
-	num_n=snum->top;
-	loop=num_n-div_n;
-	/* Lets setup a 'window' into snum
-	 * This is the part that corresponds to the current
-	 * 'area' being divided */
-	wnum.neg   = 0;
-	wnum.d     = &(snum->d[loop]);
-	wnum.top   = div_n;
-	/* only needed when BN_ucmp messes up the values between top and max */
-	wnum.dmax  = snum->dmax - loop; /* so we don't step out of bounds */
-
-	/* Get the top 2 words of sdiv */
-	/* div_n=sdiv->top; */
-	d0=sdiv->d[div_n-1];
-	d1=(div_n == 1)?0:sdiv->d[div_n-2];
-
-	/* pointer to the 'top' of snum */
-	wnump= &(snum->d[num_n-1]);
-
-	/* Setup to 'res' */
-	res->neg= (num->neg^divisor->neg);
-	if (!bn_wexpand(res,(loop+1))) goto err;
-	res->top=loop;
-	resp= &(res->d[loop-1]);
-
-	/* space for temp */
-	if (!bn_wexpand(tmp,(div_n+1))) goto err;
 
-	if (BN_ucmp(&wnum,sdiv) >= 0)
+	if (no_branch)
 		{
-		/* If BN_DEBUG_RAND is defined BN_ucmp changes (via
-		 * bn_pollute) the const bignum arguments =>
-		 * clean the values between top and max again */
-		bn_clear_top2max(&wnum);
-		bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
-		*resp=1;
-		}
-	else
-		res->top--;
-	/* if res->top == 0 then clear the neg value otherwise decrease
-	 * the resp pointer */
-	if (res->top == 0)
-		res->neg = 0;
-	else
-		resp--;
-
-	for (i=0; i<loop-1; i++, wnump--, resp--)
-		{
-		BN_ULONG q,l0;
-		/* the first part of the loop uses the top two words of
-		 * snum and sdiv to calculate a BN_ULONG q such that
-		 * | wnum - sdiv * q | < sdiv */
-#if defined(BN_DIV3W) && !defined(OPENSSL_NO_ASM)
-		BN_ULONG bn_div_3_words(BN_ULONG*,BN_ULONG,BN_ULONG);
-		q=bn_div_3_words(wnump,d1,d0);
-#else
-		BN_ULONG n0,n1,rem=0;
-
-		n0=wnump[0];
-		n1=wnump[-1];
-		if (n0 == d0)
-			q=BN_MASK2;
-		else 			/* n0 < d0 */
-			{
-#ifdef BN_LLONG
-			BN_ULLONG t2;
-
-#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(bn_div_words)
-			q=(BN_ULONG)(((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0);
-#else
-			q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
-			fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
-X) -> 0x%08X\n",
-				n0, n1, d0, q);
-#endif
-#endif
-
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
-			/*
-			 * rem doesn't have to be BN_ULLONG. The least we
-			 * know it's less that d0, isn't it?
-			 */
-			rem=(n1-q*d0)&BN_MASK2;
-#endif
-			t2=(BN_ULLONG)d1*q;
-
-			for (;;)
-				{
-				if (t2 <= ((((BN_ULLONG)rem)<<BN_BITS2)|wnump[-2]))
-					break;
-				q--;
-				rem += d0;
-				if (rem < d0) break; /* don't let rem overflow */
-				t2 -= d1;
-				}
-#else /* !BN_LLONG */
-			BN_ULONG t2l,t2h;
-
-			q=bn_div_words(n0,n1,d0);
-#ifdef BN_DEBUG_LEVITTE
-			fprintf(stderr,"DEBUG: bn_div_words(0x%08X,0x%08X,0x%08\
-X) -> 0x%08X\n",
-				n0, n1, d0, q);
-#endif
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
-			rem=(n1-q*d0)&BN_MASK2;
-#endif
-
-#if defined(BN_UMULT_LOHI)
-			BN_UMULT_LOHI(t2l,t2h,d1,q);
-#elif defined(BN_UMULT_HIGH)
-			t2l = d1 * q;
-			t2h = BN_UMULT_HIGH(d1,q);
-#else
+		/* Since we don't know whether snum is larger than sdiv,
+		 * we pad snum with enough zeroes without changing its
+		 * value. 
+		 */
+		if (snum->top <= sdiv->top+1) 
 			{
-			BN_ULONG ql, qh;
-			t2l=LBITS(d1); t2h=HBITS(d1);
-			ql =LBITS(q);  qh =HBITS(q);
-			mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+			if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
+			for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
+			snum->top = sdiv->top + 2;
 			}
-#endif
-
-			for (;;)
-				{
-				if ((t2h < rem) ||
-					((t2h == rem) && (t2l <= wnump[-2])))
-					break;
-				q--;
-				rem += d0;
-				if (rem < d0) break; /* don't let rem overflow */
-				if (t2l < d1) t2h--; t2l -= d1;
-				}
-#endif /* !BN_LLONG */
-			}
-#endif /* !BN_DIV3W */
-
-		l0=bn_mul_words(tmp->d,sdiv->d,div_n,q);
-		tmp->d[div_n]=l0;
-		wnum.d--;
-		/* ingore top values of the bignums just sub the two 
-		 * BN_ULONG arrays with bn_sub_words */
-		if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1))
+		else
 			{
-			/* Note: As we have considered only the leading
-			 * two BN_ULONGs in the calculation of q, sdiv * q
-			 * might be greater than wnum (but then (q-1) * sdiv
-			 * is less or equal than wnum)
-			 */
-			q--;
-			if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n))
-				/* we can't have an overflow here (assuming
-				 * that q != 0, but if q == 0 then tmp is
-				 * zero anyway) */
-				(*wnump)++;
+			if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
+			snum->d[snum->top] = 0;
+			snum->top ++;
 			}
-		/* store part of the result */
-		*resp = q;
-		}
-	bn_correct_top(snum);
-	if (rm != NULL)
-		{
-		/* Keep a copy of the neg flag in num because if rm==num
-		 * BN_rshift() will overwrite it.
-		 */
-		int neg = num->neg;
-		BN_rshift(rm,snum,norm_shift);
-		if (!BN_is_zero(rm))
-			rm->neg = neg;
-		bn_check_top(rm);
-		}
-	BN_CTX_end(ctx);
-	return(1);
-err:
-	bn_check_top(rm);
-	BN_CTX_end(ctx);
-	return(0);
-	}
-
-
-/* BN_div_no_branch is a special version of BN_div. It does not contain
- * branches that may leak sensitive information.
- */
-static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, 
-	const BIGNUM *divisor, BN_CTX *ctx)
-	{
-	int norm_shift,i,loop;
-	BIGNUM *tmp,wnum,*snum,*sdiv,*res;
-	BN_ULONG *resp,*wnump;
-	BN_ULONG d0,d1;
-	int num_n,div_n;
-
-	bn_check_top(dv);
-	bn_check_top(rm);
-	/* bn_check_top(num); */ /* 'num' has been checked in BN_div() */
-	bn_check_top(divisor);
-
-	if (BN_is_zero(divisor))
-		{
-		BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO);
-		return(0);
-		}
-
-	BN_CTX_start(ctx);
-	tmp=BN_CTX_get(ctx);
-	snum=BN_CTX_get(ctx);
-	sdiv=BN_CTX_get(ctx);
-	if (dv == NULL)
-		res=BN_CTX_get(ctx);
-	else	res=dv;
-	if (sdiv == NULL || res == NULL) goto err;
-
-	/* First we normalise the numbers */
-	norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2);
-	if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err;
-	sdiv->neg=0;
-	norm_shift+=BN_BITS2;
-	if (!(BN_lshift(snum,num,norm_shift))) goto err;
-	snum->neg=0;
-
-	/* Since we don't know whether snum is larger than sdiv,
-	 * we pad snum with enough zeroes without changing its
-	 * value. 
-	 */
-	if (snum->top <= sdiv->top+1) 
-		{
-		if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err;
-		for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0;
-		snum->top = sdiv->top + 2;
-		}
-	else
-		{
-		if (bn_wexpand(snum, snum->top + 1) == NULL) goto err;
-		snum->d[snum->top] = 0;
-		snum->top ++;
 		}
 
 	div_n=sdiv->top;
@@ -500,12 +282,27 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num,
 	/* Setup to 'res' */
 	res->neg= (num->neg^divisor->neg);
 	if (!bn_wexpand(res,(loop+1))) goto err;
-	res->top=loop-1;
+	res->top=loop-no_branch;
 	resp= &(res->d[loop-1]);
 
 	/* space for temp */
 	if (!bn_wexpand(tmp,(div_n+1))) goto err;
 
+	if (!no_branch)
+		{
+		if (BN_ucmp(&wnum,sdiv) >= 0)
+			{
+			/* If BN_DEBUG_RAND is defined BN_ucmp changes (via
+			 * bn_pollute) the const bignum arguments =>
+			 * clean the values between top and max again */
+			bn_clear_top2max(&wnum);
+			bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
+			*resp=1;
+			}
+		else
+			res->top--;
+		}
+
 	/* if res->top == 0 then clear the neg value otherwise decrease
 	 * the resp pointer */
 	if (res->top == 0)
@@ -638,7 +435,7 @@ X) -> 0x%08X\n",
 			rm->neg = neg;
 		bn_check_top(rm);
 		}
-	bn_correct_top(res);
+	if (no_branch)	bn_correct_top(res);
 	BN_CTX_end(ctx);
 	return(1);
 err:
@@ -646,5 +443,4 @@ err:
 	BN_CTX_end(ctx);
 	return(0);
 	}
-
 #endif
diff --git a/src/lib/libcrypto/bn/bn_exp.c b/src/lib/libcrypto/bn/bn_exp.c
index d9b6c737fc..2abf6fd678 100644
--- a/src/lib/libcrypto/bn/bn_exp.c
+++ b/src/lib/libcrypto/bn/bn_exp.c
@@ -113,6 +113,18 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
+#include <stdlib.h>
+#ifdef _WIN32
+# include <malloc.h>
+# ifndef alloca
+#  define alloca _alloca
+# endif
+#elif defined(__GNUC__)
+# ifndef alloca
+#  define alloca(s) __builtin_alloca((s))
+# endif
+#endif
+
 /* maximum precomputation table size for *variable* sliding windows */
 #define TABLE_SIZE	32
 
@@ -522,23 +534,17 @@ err:
  * as cache lines are concerned.  The following functions are used to transfer a BIGNUM
  * from/to that table. */
 
-static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width)
+static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width)
 	{
 	size_t i, j;
 
-	if (bn_wexpand(b, top) == NULL)
-		return 0;
-	while (b->top < top)
-		{
-		b->d[b->top++] = 0;
-		}
-	
+	if (top > b->top)
+		top = b->top; /* this works because 'buf' is explicitly zeroed */
 	for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width)
 		{
 		buf[j] = ((unsigned char*)b->d)[i];
 		}
 
-	bn_correct_top(b);
 	return 1;
 	}
 
@@ -561,7 +567,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
 
 /* Given a pointer value, compute the next address that is a cache line multiple. */
 #define MOD_EXP_CTIME_ALIGN(x_) \
-	((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
+	((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
 
 /* This variant of BN_mod_exp_mont() uses fixed windows and the special
  * precomputation memory layout to limit data-dependency to a minimum
@@ -572,17 +578,15 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
 int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 		    const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
 	{
-	int i,bits,ret=0,idx,window,wvalue;
+	int i,bits,ret=0,window,wvalue;
 	int top;
- 	BIGNUM *r;
-	const BIGNUM *aa;
 	BN_MONT_CTX *mont=NULL;
 
 	int numPowers;
 	unsigned char *powerbufFree=NULL;
 	int powerbufLen = 0;
 	unsigned char *powerbuf=NULL;
-	BIGNUM *computeTemp=NULL, *am=NULL;
+	BIGNUM tmp, am;
 
 	bn_check_top(a);
 	bn_check_top(p);
@@ -602,10 +606,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 		return ret;
 		}
 
- 	/* Initialize BIGNUM context and allocate intermediate result */
 	BN_CTX_start(ctx);
-	r = BN_CTX_get(ctx);
-	if (r == NULL) goto err;
 
 	/* Allocate a montgomery context if it was not supplied by the caller.
 	 * If this is not done, things will break in the montgomery part.
@@ -620,40 +621,154 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 
 	/* Get the window size to use with size of p. */
 	window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(OPENSSL_BN_ASM_MONT5)
+	if (window==6 && bits<=1024) window=5;	/* ~5% improvement of 2048-bit RSA sign */
+#endif
 
 	/* Allocate a buffer large enough to hold all of the pre-computed
-	 * powers of a.
+	 * powers of am, am itself and tmp.
 	 */
 	numPowers = 1 << window;
-	powerbufLen = sizeof(m->d[0])*top*numPowers;
+	powerbufLen = sizeof(m->d[0])*(top*numPowers +
+				((2*top)>numPowers?(2*top):numPowers));
+#ifdef alloca
+	if (powerbufLen < 3072)
+		powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+	else
+#endif
 	if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
 		goto err;
 		
 	powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
 	memset(powerbuf, 0, powerbufLen);
 
- 	/* Initialize the intermediate result. Do this early to save double conversion,
-	 * once each for a^0 and intermediate result.
-	 */
- 	if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
-	if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
+#ifdef alloca
+	if (powerbufLen < 3072)
+		powerbufFree = NULL;
+#endif
 
-	/* Initialize computeTemp as a^1 with montgomery precalcs */
-	computeTemp = BN_CTX_get(ctx);
-	am = BN_CTX_get(ctx);
-	if (computeTemp==NULL || am==NULL) goto err;
+	/* lay down tmp and am right after powers table */
+	tmp.d     = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
+	am.d      = tmp.d + top;
+	tmp.top   = am.top  = 0;
+	tmp.dmax  = am.dmax = top;
+	tmp.neg   = am.neg  = 0;
+	tmp.flags = am.flags = BN_FLG_STATIC_DATA;
+
+	/* prepare a^0 in Montgomery domain */
+#if 1
+ 	if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx))	goto err;
+#else
+	tmp.d[0] = (0-m->d[0])&BN_MASK2;	/* 2^(top*BN_BITS2) - m */
+	for (i=1;i<top;i++)
+		tmp.d[i] = (~m->d[i])&BN_MASK2;
+	tmp.top = top;
+#endif
 
+	/* prepare a^1 in Montgomery domain */
 	if (a->neg || BN_ucmp(a,m) >= 0)
 		{
-		if (!BN_mod(am,a,m,ctx))
-			goto err;
-		aa= am;
+		if (!BN_mod(&am,a,m,ctx))			goto err;
+		if (!BN_to_montgomery(&am,&am,mont,ctx))	goto err;
 		}
-	else
-		aa=a;
-	if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
-	if (!BN_copy(computeTemp, am)) goto err;
-	if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
+	else	if (!BN_to_montgomery(&am,a,mont,ctx))		goto err;
+
+#if defined(OPENSSL_BN_ASM_MONT5)
+    /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+     * specifically optimization of cache-timing attack countermeasures
+     * and pre-computation optimization. */
+
+    /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
+     * 512-bit RSA is hardly relevant, we omit it to spare size... */ 
+    if (window==5)
+	{
+	void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
+			const void *table,const BN_ULONG *np,
+			const BN_ULONG *n0,int num,int power);
+	void bn_scatter5(const BN_ULONG *inp,size_t num,
+			void *table,size_t power);
+	void bn_gather5(BN_ULONG *out,size_t num,
+			void *table,size_t power);
+
+	BN_ULONG *np=mont->N.d, *n0=mont->n0;
+
+	/* BN_to_montgomery can contaminate words above .top
+	 * [in BN_DEBUG[_DEBUG] build]... */
+	for (i=am.top; i<top; i++)	am.d[i]=0;
+	for (i=tmp.top; i<top; i++)	tmp.d[i]=0;
+
+	bn_scatter5(tmp.d,top,powerbuf,0);
+	bn_scatter5(am.d,am.top,powerbuf,1);
+	bn_mul_mont(tmp.d,am.d,am.d,np,n0,top);
+	bn_scatter5(tmp.d,top,powerbuf,2);
+
+#if 0
+	for (i=3; i<32; i++)
+		{
+		/* Calculate a^i = a^(i-1) * a */
+		bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+		bn_scatter5(tmp.d,top,powerbuf,i);
+		}
+#else
+	/* same as above, but uses squaring for 1/2 of operations */
+	for (i=4; i<32; i*=2)
+		{
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_scatter5(tmp.d,top,powerbuf,i);
+		}
+	for (i=3; i<8; i+=2)
+		{
+		int j;
+		bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+		bn_scatter5(tmp.d,top,powerbuf,i);
+		for (j=2*i; j<32; j*=2)
+			{
+			bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+			bn_scatter5(tmp.d,top,powerbuf,j);
+			}
+		}
+	for (; i<16; i+=2)
+		{
+		bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+		bn_scatter5(tmp.d,top,powerbuf,i);
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_scatter5(tmp.d,top,powerbuf,2*i);
+		}
+	for (; i<32; i+=2)
+		{
+		bn_mul_mont_gather5(tmp.d,am.d,powerbuf,np,n0,top,i-1);
+		bn_scatter5(tmp.d,top,powerbuf,i);
+		}
+#endif
+	bits--;
+	for (wvalue=0, i=bits%5; i>=0; i--,bits--)
+		wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+	bn_gather5(tmp.d,top,powerbuf,wvalue);
+
+	/* Scan the exponent one window at a time starting from the most
+	 * significant bits.
+	 */
+	while (bits >= 0)
+		{
+		for (wvalue=0, i=0; i<5; i++,bits--)
+			wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top);
+		bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue);
+		}
+
+	tmp.top=top;
+	bn_correct_top(&tmp);
+	}
+    else
+#endif
+	{
+	if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err;
+	if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am,  top, powerbuf, 1, numPowers)) goto err;
 
 	/* If the window size is greater than 1, then calculate
 	 * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
@@ -662,62 +777,54 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 	 */
 	if (window > 1)
 		{
-		for (i=2; i<numPowers; i++)
+		if (!BN_mod_mul_montgomery(&tmp,&am,&am,mont,ctx))	goto err;
+		if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 2, numPowers)) goto err;
+		for (i=3; i<numPowers; i++)
 			{
 			/* Calculate a^i = a^(i-1) * a */
-			if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
+			if (!BN_mod_mul_montgomery(&tmp,&am,&tmp,mont,ctx))
 				goto err;
-			if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
+			if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, i, numPowers)) goto err;
 			}
 		}
 
- 	/* Adjust the number of bits up to a multiple of the window size.
- 	 * If the exponent length is not a multiple of the window size, then
- 	 * this pads the most significant bits with zeros to normalize the
- 	 * scanning loop to there's no special cases.
- 	 *
- 	 * * NOTE: Making the window size a power of two less than the native
-	 * * word size ensures that the padded bits won't go past the last
- 	 * * word in the internal BIGNUM structure. Going past the end will
- 	 * * still produce the correct result, but causes a different branch
- 	 * * to be taken in the BN_is_bit_set function.
- 	 */
- 	bits = ((bits+window-1)/window)*window;
- 	idx=bits-1;	/* The top bit of the window */
-
- 	/* Scan the exponent one window at a time starting from the most
- 	 * significant bits.
- 	 */
- 	while (idx >= 0)
+	bits--;
+	for (wvalue=0, i=bits%window; i>=0; i--,bits--)
+		wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+	if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err;
+ 
+	/* Scan the exponent one window at a time starting from the most
+	 * significant bits.
+	 */
+ 	while (bits >= 0)
   		{
  		wvalue=0; /* The 'value' of the window */
  		
  		/* Scan the window, squaring the result as we go */
- 		for (i=0; i<window; i++,idx--)
+ 		for (i=0; i<window; i++,bits--)
  			{
-			if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))	goto err;
-			wvalue = (wvalue<<1)+BN_is_bit_set(p,idx);
+			if (!BN_mod_mul_montgomery(&tmp,&tmp,&tmp,mont,ctx))	goto err;
+			wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
   			}
  		
 		/* Fetch the appropriate pre-computed value from the pre-buf */
-		if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
+		if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&am, top, powerbuf, wvalue, numPowers)) goto err;
 
  		/* Multiply the result into the intermediate result */
- 		if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err;
+ 		if (!BN_mod_mul_montgomery(&tmp,&tmp,&am,mont,ctx)) goto err;
   		}
+	}
 
  	/* Convert the final result from montgomery to standard format */
-	if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
+	if (!BN_from_montgomery(rr,&tmp,mont,ctx)) goto err;
 	ret=1;
 err:
 	if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont);
 	if (powerbuf!=NULL)
 		{
 		OPENSSL_cleanse(powerbuf,powerbufLen);
-		OPENSSL_free(powerbufFree);
+		if (powerbufFree) OPENSSL_free(powerbufFree);
 		}
- 	if (am!=NULL) BN_clear(am);
- 	if (computeTemp!=NULL) BN_clear(computeTemp);
 	BN_CTX_end(ctx);
 	return(ret);
 	}
@@ -988,4 +1095,3 @@ err:
 	bn_check_top(r);
 	return(ret);
 	}
-
diff --git a/src/lib/libcrypto/bn/bn_gf2m.c b/src/lib/libcrypto/bn/bn_gf2m.c
index 432a3aa338..8a4dc20ad9 100644
--- a/src/lib/libcrypto/bn/bn_gf2m.c
+++ b/src/lib/libcrypto/bn/bn_gf2m.c
@@ -94,6 +94,8 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
+#ifndef OPENSSL_NO_EC2M
+
 /* Maximum number of iterations before BN_GF2m_mod_solve_quad_arr should fail. */
 #define MAX_ITERATIONS 50
 
@@ -122,6 +124,7 @@ static const BN_ULONG SQR_tb[16] =
     SQR_tb[(w) >>  4 & 0xF] <<  8 | SQR_tb[(w)       & 0xF]
 #endif
 
+#if !defined(OPENSSL_BN_ASM_GF2m)
 /* Product of two polynomials a, b each with degree < BN_BITS2 - 1,
  * result is a polynomial r with degree < 2 * BN_BITS - 1
  * The caller MUST ensure that the variables have the right amount
@@ -216,7 +219,9 @@ static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, c
 	r[2] ^= m1 ^ r[1] ^ r[3];  /* h0 ^= m1 ^ l1 ^ h1; */
 	r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0;  /* l1 ^= l0 ^ h0 ^ m0; */
 	}
-
+#else
+void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
+#endif 
 
 /* Add polynomials a and b and store result in r; r could be a or b, a and b 
  * could be equal; r is the bitwise XOR of a and b.
@@ -360,21 +365,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
 int	BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
 	{
 	int ret = 0;
-	const int max = BN_num_bits(p) + 1;
-	int *arr=NULL;
+	int arr[6];
 	bn_check_top(a);
 	bn_check_top(p);
-	if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
-	ret = BN_GF2m_poly2arr(p, arr, max);
-	if (!ret || ret > max)
+	ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0]));
+	if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0])))
 		{
 		BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH);
-		goto err;
+		return 0;
 		}
 	ret = BN_GF2m_mod_arr(r, a, arr);
 	bn_check_top(r);
-err:
-	if (arr) OPENSSL_free(arr);
 	return ret;
 	}
 
@@ -521,7 +522,7 @@ err:
  */
 int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 	{
-	BIGNUM *b, *c, *u, *v, *tmp;
+	BIGNUM *b, *c = NULL, *u = NULL, *v = NULL, *tmp;
 	int ret = 0;
 
 	bn_check_top(a);
@@ -529,18 +530,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 
 	BN_CTX_start(ctx);
 	
-	b = BN_CTX_get(ctx);
-	c = BN_CTX_get(ctx);
-	u = BN_CTX_get(ctx);
-	v = BN_CTX_get(ctx);
-	if (v == NULL) goto err;
+	if ((b = BN_CTX_get(ctx))==NULL) goto err;
+	if ((c = BN_CTX_get(ctx))==NULL) goto err;
+	if ((u = BN_CTX_get(ctx))==NULL) goto err;
+	if ((v = BN_CTX_get(ctx))==NULL) goto err;
 
-	if (!BN_one(b)) goto err;
 	if (!BN_GF2m_mod(u, a, p)) goto err;
-	if (!BN_copy(v, p)) goto err;
-
 	if (BN_is_zero(u)) goto err;
 
+	if (!BN_copy(v, p)) goto err;
+#if 0
+	if (!BN_one(b)) goto err;
+
 	while (1)
 		{
 		while (!BN_is_odd(u))
@@ -565,13 +566,89 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 		if (!BN_GF2m_add(u, u, v)) goto err;
 		if (!BN_GF2m_add(b, b, c)) goto err;
 		}
+#else
+	{
+	int i,	ubits = BN_num_bits(u),
+		vbits = BN_num_bits(v),	/* v is copy of p */
+		top = p->top;
+	BN_ULONG *udp,*bdp,*vdp,*cdp;
+
+	bn_wexpand(u,top);	udp = u->d;
+				for (i=u->top;i<top;i++) udp[i] = 0;
+				u->top = top;
+	bn_wexpand(b,top);	bdp = b->d;
+				bdp[0] = 1;
+				for (i=1;i<top;i++) bdp[i] = 0;
+				b->top = top;
+	bn_wexpand(c,top);	cdp = c->d;
+				for (i=0;i<top;i++) cdp[i] = 0;
+				c->top = top;
+	vdp = v->d;	/* It pays off to "cache" *->d pointers, because
+			 * it allows optimizer to be more aggressive.
+			 * But we don't have to "cache" p->d, because *p
+			 * is declared 'const'... */
+	while (1)
+		{
+		while (ubits && !(udp[0]&1))
+			{
+			BN_ULONG u0,u1,b0,b1,mask;
+
+			u0   = udp[0];
+			b0   = bdp[0];
+			mask = (BN_ULONG)0-(b0&1);
+			b0  ^= p->d[0]&mask;
+			for (i=0;i<top-1;i++)
+				{
+				u1 = udp[i+1];
+				udp[i] = ((u0>>1)|(u1<<(BN_BITS2-1)))&BN_MASK2;
+				u0 = u1;
+				b1 = bdp[i+1]^(p->d[i+1]&mask);
+				bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2;
+				b0 = b1;
+				}
+			udp[i] = u0>>1;
+			bdp[i] = b0>>1;
+			ubits--;
+			}
 
+		if (ubits<=BN_BITS2 && udp[0]==1) break;
+
+		if (ubits<vbits)
+			{
+			i = ubits; ubits = vbits; vbits = i;
+			tmp = u; u = v; v = tmp;
+			tmp = b; b = c; c = tmp;
+			udp = vdp; vdp = v->d;
+			bdp = cdp; cdp = c->d;
+			}
+		for(i=0;i<top;i++)
+			{
+			udp[i] ^= vdp[i];
+			bdp[i] ^= cdp[i];
+			}
+		if (ubits==vbits)
+			{
+			BN_ULONG ul;
+			int utop = (ubits-1)/BN_BITS2;
+
+			while ((ul=udp[utop])==0 && utop) utop--;
+			ubits = utop*BN_BITS2 + BN_num_bits_word(ul);
+			}
+		}
+	bn_correct_top(b);
+	}
+#endif
 
 	if (!BN_copy(r, b)) goto err;
 	bn_check_top(r);
 	ret = 1;
 
 err:
+#ifdef BN_DEBUG /* BN_CTX_end would complain about the expanded form */
+        bn_correct_top(c);
+        bn_correct_top(u);
+        bn_correct_top(v);
+#endif
   	BN_CTX_end(ctx);
 	return ret;
 	}
@@ -1033,3 +1110,4 @@ int BN_GF2m_arr2poly(const int p[], BIGNUM *a)
 	return 1;
 	}
 
+#endif
diff --git a/src/lib/libcrypto/bn/bn_lcl.h b/src/lib/libcrypto/bn/bn_lcl.h
index 8e5e98e3f2..eecfd8cc99 100644
--- a/src/lib/libcrypto/bn/bn_lcl.h
+++ b/src/lib/libcrypto/bn/bn_lcl.h
@@ -238,7 +238,7 @@ extern "C" {
 #  if defined(__DECC)
 #   include <c_asm.h>
 #   define BN_UMULT_HIGH(a,b)	(BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
-#  elif defined(__GNUC__)
+#  elif defined(__GNUC__) && __GNUC__>=2
 #   define BN_UMULT_HIGH(a,b)	({	\
 	register BN_ULONG ret;		\
 	asm ("umulh	%1,%2,%0"	\
@@ -247,7 +247,7 @@ extern "C" {
 	ret;			})
 #  endif	/* compiler */
 # elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
-#  if defined(__GNUC__)
+#  if defined(__GNUC__) && __GNUC__>=2
 #   define BN_UMULT_HIGH(a,b)	({	\
 	register BN_ULONG ret;		\
 	asm ("mulhdu	%0,%1,%2"	\
@@ -257,7 +257,7 @@ extern "C" {
 #  endif	/* compiler */
 # elif (defined(__x86_64) || defined(__x86_64__)) && \
        (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
-#  if defined(__GNUC__)
+#  if defined(__GNUC__) && __GNUC__>=2
 #   define BN_UMULT_HIGH(a,b)	({	\
 	register BN_ULONG ret,discard;	\
 	asm ("mulq	%3"		\
@@ -280,6 +280,19 @@ extern "C" {
 #   define BN_UMULT_HIGH(a,b)		__umulh((a),(b))
 #   define BN_UMULT_LOHI(low,high,a,b)	((low)=_umul128((a),(b),&(high)))
 #  endif
+# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
+#  if defined(__GNUC__) && __GNUC__>=2
+#   define BN_UMULT_HIGH(a,b)	({	\
+	register BN_ULONG ret;		\
+	asm ("dmultu	%1,%2"		\
+	     : "=h"(ret)		\
+	     : "r"(a), "r"(b) : "l");	\
+	ret;			})
+#   define BN_UMULT_LOHI(low,high,a,b)	\
+	asm ("dmultu	%2,%3"		\
+	     : "=l"(low),"=h"(high)	\
+	     : "r"(a), "r"(b));
+#  endif
 # endif		/* cpu */
 #endif		/* OPENSSL_NO_ASM */
 
@@ -459,6 +472,10 @@ extern "C" {
 	}
 #endif /* !BN_LLONG */
 
+#if defined(OPENSSL_DOING_MAKEDEPEND) && defined(OPENSSL_FIPS)
+#undef bn_div_words
+#endif
+
 void bn_mul_normal(BN_ULONG *r,BN_ULONG *a,int na,BN_ULONG *b,int nb);
 void bn_mul_comba8(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
 void bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b);
diff --git a/src/lib/libcrypto/bn/bn_lib.c b/src/lib/libcrypto/bn/bn_lib.c
index 5470fbe6ef..7a5676de69 100644
--- a/src/lib/libcrypto/bn/bn_lib.c
+++ b/src/lib/libcrypto/bn/bn_lib.c
@@ -139,25 +139,6 @@ const BIGNUM *BN_value_one(void)
 	return(&const_one);
 	}
 
-char *BN_options(void)
-	{
-	static int init=0;
-	static char data[16];
-
-	if (!init)
-		{
-		init++;
-#ifdef BN_LLONG
-		BIO_snprintf(data,sizeof data,"bn(%d,%d)",
-			     (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
-#else
-		BIO_snprintf(data,sizeof data,"bn(%d,%d)",
-			     (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
-#endif
-		}
-	return(data);
-	}
-
 int BN_num_bits_word(BN_ULONG l)
 	{
 	static const unsigned char bits[256]={
diff --git a/src/lib/libcrypto/bn/bn_mont.c b/src/lib/libcrypto/bn/bn_mont.c
index 1a866880f5..427b5cf4df 100644
--- a/src/lib/libcrypto/bn/bn_mont.c
+++ b/src/lib/libcrypto/bn/bn_mont.c
@@ -177,31 +177,26 @@ err:
 static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
 	{
 	BIGNUM *n;
-	BN_ULONG *ap,*np,*rp,n0,v,*nrp;
-	int al,nl,max,i,x,ri;
+	BN_ULONG *ap,*np,*rp,n0,v,carry;
+	int nl,max,i;
 
 	n= &(mont->N);
-	/* mont->ri is the size of mont->N in bits (rounded up
-	   to the word size) */
-	al=ri=mont->ri/BN_BITS2;
-
 	nl=n->top;
-	if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
+	if (nl == 0) { ret->top=0; return(1); }
 
-	max=(nl+al+1); /* allow for overflow (no?) XXX */
+	max=(2*nl); /* carry is stored separately */
 	if (bn_wexpand(r,max) == NULL) return(0);
 
 	r->neg^=n->neg;
 	np=n->d;
 	rp=r->d;
-	nrp= &(r->d[nl]);
 
 	/* clear the top words of T */
 #if 1
 	for (i=r->top; i<max; i++) /* memset? XXX */
-		r->d[i]=0;
+		rp[i]=0;
 #else
-	memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); 
+	memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); 
 #endif
 
 	r->top=max;
@@ -210,7 +205,7 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
 #ifdef BN_COUNT
 	fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
 #endif
-	for (i=0; i<nl; i++)
+	for (carry=0, i=0; i<nl; i++, rp++)
 		{
 #ifdef __TANDEM
                 {
@@ -228,61 +223,33 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
 #else
 		v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
 #endif
-		nrp++;
-		rp++;
-		if (((nrp[-1]+=v)&BN_MASK2) >= v)
-			continue;
-		else
-			{
-			if (((++nrp[0])&BN_MASK2) != 0) continue;
-			if (((++nrp[1])&BN_MASK2) != 0) continue;
-			for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
-			}
-		}
-	bn_correct_top(r);
-
-	/* mont->ri will be a multiple of the word size and below code
-	 * is kind of BN_rshift(ret,r,mont->ri) equivalent */
-	if (r->top <= ri)
-		{
-		ret->top=0;
-		return(1);
+		v = (v+carry+rp[nl])&BN_MASK2;
+		carry |= (v != rp[nl]);
+		carry &= (v <= rp[nl]);
+		rp[nl]=v;
 		}
-	al=r->top-ri;
 
-#define BRANCH_FREE 1
-#if BRANCH_FREE
-	if (bn_wexpand(ret,ri) == NULL) return(0);
-	x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
-	ret->top=x=(ri&~x)|(al&x);	/* min(ri,al) */
+	if (bn_wexpand(ret,nl) == NULL) return(0);
+	ret->top=nl;
 	ret->neg=r->neg;
 
 	rp=ret->d;
-	ap=&(r->d[ri]);
+	ap=&(r->d[nl]);
 
+#define BRANCH_FREE 1
+#if BRANCH_FREE
 	{
-	size_t m1,m2;
-
-	v=bn_sub_words(rp,ap,np,ri);
-	/* this ----------------^^ works even in al<ri case
-	 * thanks to zealous zeroing of top of the vector in the
-	 * beginning. */
+	BN_ULONG *nrp;
+	size_t m;
 
-	/* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
-	/* in other words if subtraction result is real, then
+	v=bn_sub_words(rp,ap,np,nl)-carry;
+	/* if subtraction result is real, then
 	 * trick unconditional memcpy below to perform in-place
 	 * "refresh" instead of actual copy. */
-	m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1);	/* al<ri */
-	m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1);	/* al>ri */
-	m1|=m2;			/* (al!=ri) */
-	m1|=(0-(size_t)v);	/* (al!=ri || v) */
-	m1&=~m2;		/* (al!=ri || v) && !al>ri */
-	nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m1)|((PTR_SIZE_INT)ap&m1));
-	}
+	m=(0-(size_t)v);
+	nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m));
 
-	/* 'i<ri' is chosen to eliminate dependency on input data, even
-	 * though it results in redundant copy in al<ri case. */
-	for (i=0,ri-=4; i<ri; i+=4)
+	for (i=0,nl-=4; i<nl; i+=4)
 		{
 		BN_ULONG t1,t2,t3,t4;
 		
@@ -295,40 +262,15 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
 		rp[i+2]=t3;
 		rp[i+3]=t4;
 		}
-	for (ri+=4; i<ri; i++)
+	for (nl+=4; i<nl; i++)
 		rp[i]=nrp[i], ap[i]=0;
-	bn_correct_top(r);
-	bn_correct_top(ret);
+	}
 #else
-	if (bn_wexpand(ret,al) == NULL) return(0);
-	ret->top=al;
-	ret->neg=r->neg;
-
-	rp=ret->d;
-	ap=&(r->d[ri]);
-	al-=4;
-	for (i=0; i<al; i+=4)
-		{
-		BN_ULONG t1,t2,t3,t4;
-		
-		t1=ap[i+0];
-		t2=ap[i+1];
-		t3=ap[i+2];
-		t4=ap[i+3];
-		rp[i+0]=t1;
-		rp[i+1]=t2;
-		rp[i+2]=t3;
-		rp[i+3]=t4;
-		}
-	al+=4;
-	for (; i<al; i++)
-		rp[i]=ap[i];
-
-	if (BN_ucmp(ret, &(mont->N)) >= 0)
-		{
-		if (!BN_usub(ret,ret,&(mont->N))) return(0);
-		}
+	if (bn_sub_words (rp,ap,np,nl)-carry)
+		memcpy(rp,ap,nl*sizeof(BN_ULONG));
 #endif
+	bn_correct_top(r);
+	bn_correct_top(ret);
 	bn_check_top(ret);
 
 	return(1);
diff --git a/src/lib/libcrypto/bn/bn_nist.c b/src/lib/libcrypto/bn/bn_nist.c
index c6de032696..43caee4770 100644
--- a/src/lib/libcrypto/bn/bn_nist.c
+++ b/src/lib/libcrypto/bn/bn_nist.c
@@ -319,6 +319,13 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
 						:(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l)))
 #define bn_32_set_0(to, n)		(((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0));
 #define bn_cp_32(to,n,from,m)		((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n)
+# if defined(L_ENDIAN)
+#  if defined(__arch64__)
+#   define NIST_INT64 long
+#  else
+#   define NIST_INT64 long long
+#  endif
+# endif
 #else
 #define bn_cp_64(to, n, from, m) \
 	{ \
@@ -330,13 +337,15 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top)
 	bn_32_set_0(to, (n)*2); \
 	bn_32_set_0(to, (n)*2+1); \
 	}
-#if BN_BITS2 == 32
 #define bn_cp_32(to, n, from, m)	(to)[n] = (m>=0)?((from)[m]):0;
 #define bn_32_set_0(to, n)		(to)[n] = (BN_ULONG)0;
-#endif
+# if defined(_WIN32) && !defined(__GNUC__)
+#  define NIST_INT64 __int64
+# elif defined(BN_LLONG)
+#  define NIST_INT64 long long
+# endif
 #endif /* BN_BITS2 != 64 */
 
-
 #define nist_set_192(to, from, a1, a2, a3) \
 	{ \
 	bn_cp_64(to, 0, from, (a3) - 3) \
@@ -350,9 +359,11 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	int      top = a->top, i;
 	int      carry;
 	register BN_ULONG *r_d, *a_d = a->d;
-	BN_ULONG t_d[BN_NIST_192_TOP],
-	         buf[BN_NIST_192_TOP],
-		 c_d[BN_NIST_192_TOP],
+	union	{
+		BN_ULONG	bn[BN_NIST_192_TOP];
+		unsigned int	ui[BN_NIST_192_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+		} buf;
+	BN_ULONG c_d[BN_NIST_192_TOP],
 		*res;
 	PTR_SIZE_INT mask;
 	static const BIGNUM _bignum_nist_p_192_sqr = {
@@ -385,15 +396,48 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	else
 		r_d = a_d;
 
-	nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
+	nist_cp_bn_0(buf.bn, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP);
+
+#if defined(NIST_INT64)
+	{
+	NIST_INT64		acc;	/* accumulator */
+	unsigned int		*rp=(unsigned int *)r_d;
+	const unsigned int	*bp=(const unsigned int *)buf.ui;
+
+	acc  = rp[0];	acc += bp[3*2-6];
+			acc += bp[5*2-6]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[1];	acc += bp[3*2-5];
+			acc += bp[5*2-5]; rp[1] = (unsigned int)acc; acc >>= 32;
 
-	nist_set_192(t_d, buf, 0, 3, 3);
+	acc += rp[2];	acc += bp[3*2-6];
+			acc += bp[4*2-6];
+			acc += bp[5*2-6]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[3];	acc += bp[3*2-5];
+			acc += bp[4*2-5];
+			acc += bp[5*2-5]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[4];	acc += bp[4*2-6];
+			acc += bp[5*2-6]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[5];	acc += bp[4*2-5];
+			acc += bp[5*2-5]; rp[5] = (unsigned int)acc;
+
+	carry = (int)(acc>>32);
+	}
+#else
+	{
+	BN_ULONG t_d[BN_NIST_192_TOP];
+
+	nist_set_192(t_d, buf.bn, 0, 3, 3);
 	carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
-	nist_set_192(t_d, buf, 4, 4, 0);
+	nist_set_192(t_d, buf.bn, 4, 4, 0);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
-	nist_set_192(t_d, buf, 5, 5, 5)
+	nist_set_192(t_d, buf.bn, 5, 5, 5)
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP);
-
+	}
+#endif
 	if (carry > 0)
 		carry = (int)bn_sub_words(r_d,r_d,_nist_p_192[carry-1],BN_NIST_192_TOP);
 	else
@@ -435,8 +479,7 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	int	top = a->top, i;
 	int	carry;
 	BN_ULONG *r_d, *a_d = a->d;
-	BN_ULONG t_d[BN_NIST_224_TOP],
-	         buf[BN_NIST_224_TOP],
+	BN_ULONG buf[BN_NIST_224_TOP],
 		 c_d[BN_NIST_224_TOP],
 		*res;
 	PTR_SIZE_INT mask;
@@ -474,14 +517,54 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 
 #if BN_BITS2==64
 	/* copy upper 256 bits of 448 bit number ... */
-	nist_cp_bn_0(t_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
+	nist_cp_bn_0(c_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP);
 	/* ... and right shift by 32 to obtain upper 224 bits */
-	nist_set_224(buf, t_d, 14, 13, 12, 11, 10, 9, 8);
+	nist_set_224(buf, c_d, 14, 13, 12, 11, 10, 9, 8);
 	/* truncate lower part to 224 bits too */
 	r_d[BN_NIST_224_TOP-1] &= BN_MASK2l;
 #else
 	nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP);
 #endif
+
+#if defined(NIST_INT64) && BN_BITS2!=64
+	{
+	NIST_INT64		acc;	/* accumulator */
+	unsigned int		*rp=(unsigned int *)r_d;
+	const unsigned int	*bp=(const unsigned int *)buf;
+
+	acc  = rp[0];	acc -= bp[7-7];
+			acc -= bp[11-7]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[1];	acc -= bp[8-7];
+			acc -= bp[12-7]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[2];	acc -= bp[9-7];
+			acc -= bp[13-7]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[3];	acc += bp[7-7];
+			acc += bp[11-7];
+			acc -= bp[10-7]; rp[3] = (unsigned int)acc; acc>>= 32;
+
+	acc += rp[4];	acc += bp[8-7];
+			acc += bp[12-7];
+			acc -= bp[11-7]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[5];	acc += bp[9-7];
+			acc += bp[13-7];
+			acc -= bp[12-7]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[6];	acc += bp[10-7];
+			acc -= bp[13-7]; rp[6] = (unsigned int)acc;
+
+	carry = (int)(acc>>32);
+# if BN_BITS2==64
+	rp[7] = carry;
+# endif
+	}	
+#else
+	{
+	BN_ULONG t_d[BN_NIST_224_TOP];
+
 	nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0);
 	carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP);
 	nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0);
@@ -493,6 +576,8 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 
 #if BN_BITS2==64
 	carry = (int)(r_d[BN_NIST_224_TOP-1]>>32);
+#endif
+	}
 #endif
 	u.f = bn_sub_words;
 	if (carry > 0)
@@ -548,9 +633,11 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	int	i, top = a->top;
 	int	carry = 0;
 	register BN_ULONG *a_d = a->d, *r_d;
-	BN_ULONG t_d[BN_NIST_256_TOP],
-	         buf[BN_NIST_256_TOP],
-		 c_d[BN_NIST_256_TOP],
+	union	{
+		BN_ULONG bn[BN_NIST_256_TOP];
+		unsigned int ui[BN_NIST_256_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+		} buf;
+	BN_ULONG c_d[BN_NIST_256_TOP],
 		*res;
 	PTR_SIZE_INT mask;
 	union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -584,12 +671,87 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	else
 		r_d = a_d;
 
-	nist_cp_bn_0(buf, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+	nist_cp_bn_0(buf.bn, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP);
+
+#if defined(NIST_INT64)
+	{
+	NIST_INT64		acc;	/* accumulator */
+	unsigned int		*rp=(unsigned int *)r_d;
+	const unsigned int	*bp=(const unsigned int *)buf.ui;
+
+	acc = rp[0];	acc += bp[8-8];
+			acc += bp[9-8];
+			acc -= bp[11-8];
+			acc -= bp[12-8];
+			acc -= bp[13-8];
+			acc -= bp[14-8]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[1];	acc += bp[9-8];
+			acc += bp[10-8];
+			acc -= bp[12-8];
+			acc -= bp[13-8];
+			acc -= bp[14-8];
+			acc -= bp[15-8]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[2];	acc += bp[10-8];
+			acc += bp[11-8];
+			acc -= bp[13-8];
+			acc -= bp[14-8];
+			acc -= bp[15-8]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[3];	acc += bp[11-8];
+			acc += bp[11-8];
+			acc += bp[12-8];
+			acc += bp[12-8];
+			acc += bp[13-8];
+			acc -= bp[15-8];
+			acc -= bp[8-8];
+			acc -= bp[9-8];  rp[3] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[4];	acc += bp[12-8];
+			acc += bp[12-8];
+			acc += bp[13-8];
+			acc += bp[13-8];
+			acc += bp[14-8];
+			acc -= bp[9-8];
+			acc -= bp[10-8]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[5];	acc += bp[13-8];
+			acc += bp[13-8];
+			acc += bp[14-8];
+			acc += bp[14-8];
+			acc += bp[15-8];
+			acc -= bp[10-8];
+			acc -= bp[11-8]; rp[5] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[6];	acc += bp[14-8];
+			acc += bp[14-8];
+			acc += bp[15-8];
+			acc += bp[15-8];
+			acc += bp[14-8];
+			acc += bp[13-8];
+			acc -= bp[8-8];
+			acc -= bp[9-8];  rp[6] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[7];	acc += bp[15-8];
+			acc += bp[15-8];
+			acc += bp[15-8];
+			acc += bp[8 -8];
+			acc -= bp[10-8];
+			acc -= bp[11-8];
+			acc -= bp[12-8];
+			acc -= bp[13-8]; rp[7] = (unsigned int)acc;
+
+	carry = (int)(acc>>32);
+	}
+#else
+	{
+	BN_ULONG t_d[BN_NIST_256_TOP];
 
 	/*S1*/
-	nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0);
+	nist_set_256(t_d, buf.bn, 15, 14, 13, 12, 11, 0, 0, 0);
 	/*S2*/
-	nist_set_256(c_d, buf, 0, 15, 14, 13, 12, 0, 0, 0);
+	nist_set_256(c_d, buf.bn, 0, 15, 14, 13, 12, 0, 0, 0);
 	carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP);
 	/* left shift */
 		{
@@ -607,24 +769,26 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 		}
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*S3*/
-	nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8);
+	nist_set_256(t_d, buf.bn, 15, 14, 0, 0, 0, 10, 9, 8);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*S4*/
-	nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9);
+	nist_set_256(t_d, buf.bn, 8, 13, 15, 14, 13, 11, 10, 9);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D1*/
-	nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11);
+	nist_set_256(t_d, buf.bn, 10, 8, 0, 0, 0, 13, 12, 11);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D2*/
-	nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12);
+	nist_set_256(t_d, buf.bn, 11, 9, 0, 0, 15, 14, 13, 12);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D3*/
-	nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13);
+	nist_set_256(t_d, buf.bn, 12, 0, 10, 9, 8, 15, 14, 13);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 	/*D4*/
-	nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14);
+	nist_set_256(t_d, buf.bn, 13, 0, 11, 10, 9, 0, 15, 14);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP);
 
+	}
+#endif
 	/* see BN_nist_mod_224 for explanation */
 	u.f = bn_sub_words;
 	if (carry > 0)
@@ -672,9 +836,11 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	int	i, top = a->top;
 	int	carry = 0;
 	register BN_ULONG *r_d, *a_d = a->d;
-	BN_ULONG t_d[BN_NIST_384_TOP],
-	         buf[BN_NIST_384_TOP],
-		 c_d[BN_NIST_384_TOP],
+	union	{
+		BN_ULONG bn[BN_NIST_384_TOP];
+		unsigned int ui[BN_NIST_384_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)];
+		} buf;
+	BN_ULONG c_d[BN_NIST_384_TOP],
 		*res;
 	PTR_SIZE_INT mask;
 	union { bn_addsub_f f; PTR_SIZE_INT p; } u;
@@ -709,10 +875,100 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	else
 		r_d = a_d;
 
-	nist_cp_bn_0(buf, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+	nist_cp_bn_0(buf.bn, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP);
+
+#if defined(NIST_INT64)
+	{
+	NIST_INT64		acc;	/* accumulator */
+	unsigned int		*rp=(unsigned int *)r_d;
+	const unsigned int	*bp=(const unsigned int *)buf.ui;
+
+	acc = rp[0];	acc += bp[12-12];
+			acc += bp[21-12];
+			acc += bp[20-12];
+			acc -= bp[23-12]; rp[0] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[1];	acc += bp[13-12];
+			acc += bp[22-12];
+			acc += bp[23-12];
+			acc -= bp[12-12];
+			acc -= bp[20-12]; rp[1] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[2];	acc += bp[14-12];
+			acc += bp[23-12];
+			acc -= bp[13-12];
+			acc -= bp[21-12]; rp[2] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[3];	acc += bp[15-12];
+			acc += bp[12-12];
+			acc += bp[20-12];
+			acc += bp[21-12];
+			acc -= bp[14-12];
+			acc -= bp[22-12];
+			acc -= bp[23-12]; rp[3] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[4];	acc += bp[21-12];
+			acc += bp[21-12];
+			acc += bp[16-12];
+			acc += bp[13-12];
+			acc += bp[12-12];
+			acc += bp[20-12];
+			acc += bp[22-12];
+			acc -= bp[15-12];
+			acc -= bp[23-12];
+			acc -= bp[23-12]; rp[4] = (unsigned int)acc; acc >>= 32;
+
+	acc += rp[5];	acc += bp[22-12];
+			acc += bp[22-12];
+			acc += bp[17-12];
+			acc += bp[14-12];
+			acc += bp[13-12];
+			acc += bp[21-12];
+			acc += bp[23-12];
+			acc -= bp[16-12]; rp[5] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[6];	acc += bp[23-12];
+			acc += bp[23-12];
+			acc += bp[18-12];
+			acc += bp[15-12];
+			acc += bp[14-12];
+			acc += bp[22-12];
+			acc -= bp[17-12]; rp[6] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[7];	acc += bp[19-12];
+			acc += bp[16-12];
+			acc += bp[15-12];
+			acc += bp[23-12];
+			acc -= bp[18-12]; rp[7] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[8];	acc += bp[20-12];
+			acc += bp[17-12];
+			acc += bp[16-12];
+			acc -= bp[19-12]; rp[8] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[9];	acc += bp[21-12];
+			acc += bp[18-12];
+			acc += bp[17-12];
+			acc -= bp[20-12]; rp[9] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[10];	acc += bp[22-12];
+			acc += bp[19-12];
+			acc += bp[18-12];
+			acc -= bp[21-12]; rp[10] = (unsigned int)acc; acc >>= 32;
+			
+	acc += rp[11];	acc += bp[23-12];
+			acc += bp[20-12];
+			acc += bp[19-12];
+			acc -= bp[22-12]; rp[11] = (unsigned int)acc;
+
+	carry = (int)(acc>>32);
+	}
+#else
+	{
+	BN_ULONG t_d[BN_NIST_384_TOP];
 
 	/*S1*/
-	nist_set_256(t_d, buf, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
+	nist_set_256(t_d, buf.bn, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4);
 		/* left shift */
 		{
 		register BN_ULONG *ap,t,c;
@@ -729,29 +985,31 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field,
 	carry = (int)bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), 
 		t_d, BN_NIST_256_TOP);
 	/*S2 */
-	carry += (int)bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP);
+	carry += (int)bn_add_words(r_d, r_d, buf.bn, BN_NIST_384_TOP);
 	/*S3*/
-	nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21);
+	nist_set_384(t_d,buf.bn,20,19,18,17,16,15,14,13,12,23,22,21);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*S4*/
-	nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0);
+	nist_set_384(t_d,buf.bn,19,18,17,16,15,14,13,12,20,0,23,0);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*S5*/
-	nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0);
+	nist_set_384(t_d, buf.bn,0,0,0,0,23,22,21,20,0,0,0,0);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*S6*/
-	nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20);
+	nist_set_384(t_d,buf.bn,0,0,0,0,0,0,23,22,21,0,0,20);
 	carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*D1*/
-	nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23);
+	nist_set_384(t_d,buf.bn,22,21,20,19,18,17,16,15,14,13,12,23);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*D2*/
-	nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0);
+	nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,22,21,20,0);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 	/*D3*/
-	nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0);
+	nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,23,0,0,0);
 	carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP);
 
+	}
+#endif
 	/* see BN_nist_mod_224 for explanation */
 	u.f = bn_sub_words;
 	if (carry > 0)
diff --git a/src/lib/libcrypto/bn/bn_print.c b/src/lib/libcrypto/bn/bn_print.c
index bebb466d08..1743b6a7e2 100644
--- a/src/lib/libcrypto/bn/bn_print.c
+++ b/src/lib/libcrypto/bn/bn_print.c
@@ -357,3 +357,22 @@ end:
 	return(ret);
 	}
 #endif
+
+char *BN_options(void)
+	{
+	static int init=0;
+	static char data[16];
+
+	if (!init)
+		{
+		init++;
+#ifdef BN_LLONG
+		BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+			     (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8);
+#else
+		BIO_snprintf(data,sizeof data,"bn(%d,%d)",
+			     (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8);
+#endif
+		}
+	return(data);
+	}
diff --git a/src/lib/libcrypto/bn/bn_shift.c b/src/lib/libcrypto/bn/bn_shift.c
index c4d301afc4..a6fca2c424 100644
--- a/src/lib/libcrypto/bn/bn_shift.c
+++ b/src/lib/libcrypto/bn/bn_shift.c
@@ -99,7 +99,7 @@ int BN_lshift1(BIGNUM *r, const BIGNUM *a)
 int BN_rshift1(BIGNUM *r, const BIGNUM *a)
 	{
 	BN_ULONG *ap,*rp,t,c;
-	int i;
+	int i,j;
 
 	bn_check_top(r);
 	bn_check_top(a);
@@ -109,22 +109,25 @@ int BN_rshift1(BIGNUM *r, const BIGNUM *a)
 		BN_zero(r);
 		return(1);
 		}
+	i = a->top;
+	ap= a->d;
+	j = i-(ap[i-1]==1);
 	if (a != r)
 		{
-		if (bn_wexpand(r,a->top) == NULL) return(0);
-		r->top=a->top;
+		if (bn_wexpand(r,j) == NULL) return(0);
 		r->neg=a->neg;
 		}
-	ap=a->d;
 	rp=r->d;
-	c=0;
-	for (i=a->top-1; i>=0; i--)
+	t=ap[--i];
+	c=(t&1)?BN_TBIT:0;
+	if (t>>=1) rp[i]=t;
+	while (i>0)
 		{
-		t=ap[i];
+		t=ap[--i];
 		rp[i]=((t>>1)&BN_MASK2)|c;
 		c=(t&1)?BN_TBIT:0;
 		}
-	bn_correct_top(r);
+	r->top=j;
 	bn_check_top(r);
 	return(1);
 	}
@@ -182,10 +185,11 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
 		BN_zero(r);
 		return(1);
 		}
+	i = (BN_num_bits(a)-n+(BN_BITS2-1))/BN_BITS2;
 	if (r != a)
 		{
 		r->neg=a->neg;
-		if (bn_wexpand(r,a->top-nw+1) == NULL) return(0);
+		if (bn_wexpand(r,i) == NULL) return(0);
 		}
 	else
 		{
@@ -196,7 +200,7 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
 	f= &(a->d[nw]);
 	t=r->d;
 	j=a->top-nw;
-	r->top=j;
+	r->top=i;
 
 	if (rb == 0)
 		{
@@ -212,9 +216,8 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n)
 			l= *(f++);
 			*(t++) =(tmp|(l<<lb))&BN_MASK2;
 			}
-		*(t++) =(l>>rb)&BN_MASK2;
+		if ((l = (l>>rb)&BN_MASK2)) *(t) = l;
 		}
-	bn_correct_top(r);
 	bn_check_top(r);
 	return(1);
 	}
diff --git a/src/lib/libcrypto/buffer/buf_str.c b/src/lib/libcrypto/buffer/buf_str.c
index 28dd1e401e..151f5ea971 100644
--- a/src/lib/libcrypto/buffer/buf_str.c
+++ b/src/lib/libcrypto/buffer/buf_str.c
@@ -1,56 +1,59 @@
-/* crypto/buffer/buf_str.c */
-/* ====================================================================
- * Copyright (c) 2007 The OpenSSL Project.  All rights reserved.
+/* crypto/buffer/buffer.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
  *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer. 
- *
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- *
- * 3. All advertising materials mentioning features or use of this
- *    software must display the following acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
- *
- * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
- *    endorse or promote products derived from this software without
- *    prior written permission. For written permission, please contact
- *    licensing@OpenSSL.org.
- *
- * 5. Products derived from this software may not be called "OpenSSL"
- *    nor may "OpenSSL" appear in their names without prior written
- *    permission of the OpenSSL Project.
- *
- * 6. Redistributions of any form whatsoever must retain the following
- *    acknowledgment:
- *    "This product includes software developed by the OpenSSL Project
- *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
- *
- * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
- * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
- * OF THE POSSIBILITY OF SUCH DAMAGE.
- * ====================================================================
- *
- * This product includes cryptographic software written by Eric Young
- * (eay@cryptsoft.com).  This product includes software written by Tim
- * Hudson (tjh@cryptsoft.com).
- *
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
  */
 
 #include <stdio.h>
diff --git a/src/lib/libcrypto/buffer/buffer.c b/src/lib/libcrypto/buffer/buffer.c
index 620ea8d536..d7aa79ad7f 100644
--- a/src/lib/libcrypto/buffer/buffer.c
+++ b/src/lib/libcrypto/buffer/buffer.c
@@ -60,6 +60,11 @@
 #include "cryptlib.h"
 #include <openssl/buffer.h>
 
+/* LIMIT_BEFORE_EXPANSION is the maximum n such that (n+3)/3*4 < 2**31. That
+ * function is applied in several functions in this file and this limit ensures
+ * that the result fits in an int. */
+#define LIMIT_BEFORE_EXPANSION 0x5ffffffc
+
 BUF_MEM *BUF_MEM_new(void)
 	{
 	BUF_MEM *ret;
@@ -105,6 +110,12 @@ int BUF_MEM_grow(BUF_MEM *str, size_t len)
 		str->length=len;
 		return(len);
 		}
+	/* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */
+	if (len > LIMIT_BEFORE_EXPANSION)
+		{
+		BUFerr(BUF_F_BUF_MEM_GROW,ERR_R_MALLOC_FAILURE);
+		return 0;
+		}
 	n=(len+3)/3*4;
 	if (str->data == NULL)
 		ret=OPENSSL_malloc(n);
@@ -142,6 +153,12 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len)
 		str->length=len;
 		return(len);
 		}
+	/* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */
+	if (len > LIMIT_BEFORE_EXPANSION)
+		{
+		BUFerr(BUF_F_BUF_MEM_GROW_CLEAN,ERR_R_MALLOC_FAILURE);
+		return 0;
+		}
 	n=(len+3)/3*4;
 	if (str->data == NULL)
 		ret=OPENSSL_malloc(n);
@@ -162,64 +179,6 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len)
 	return(len);
 	}
 
-char *BUF_strdup(const char *str)
-	{
-	if (str == NULL) return(NULL);
-	return BUF_strndup(str, strlen(str));
-	}
-
-char *BUF_strndup(const char *str, size_t siz)
-	{
-	char *ret;
-
-	if (str == NULL) return(NULL);
-
-	ret=OPENSSL_malloc(siz+1);
-	if (ret == NULL) 
-		{
-		BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE);
-		return(NULL);
-		}
-	BUF_strlcpy(ret,str,siz+1);
-	return(ret);
-	}
-
-void *BUF_memdup(const void *data, size_t siz)
-	{
-	void *ret;
-
-	if (data == NULL) return(NULL);
-
-	ret=OPENSSL_malloc(siz);
-	if (ret == NULL) 
-		{
-		BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE);
-		return(NULL);
-		}
-	return memcpy(ret, data, siz);
-	}	
-
-size_t BUF_strlcpy(char *dst, const char *src, size_t size)
-	{
-	size_t l = 0;
-	for(; size > 1 && *src; size--)
-		{
-		*dst++ = *src++;
-		l++;
-		}
-	if (size)
-		*dst = '\0';
-	return l + strlen(src);
-	}
-
-size_t BUF_strlcat(char *dst, const char *src, size_t size)
-	{
-	size_t l = 0;
-	for(; size > 0 && *dst; size--, dst++)
-		l++;
-	return l + BUF_strlcpy(dst, src, size);
-	}
-
 void BUF_reverse(unsigned char *out, unsigned char *in, size_t size)
 	{
 	size_t i;
diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86.pl b/src/lib/libcrypto/camellia/asm/cmll-x86.pl
index 027302ac86..c314d62312 100644
--- a/src/lib/libcrypto/camellia/asm/cmll-x86.pl
+++ b/src/lib/libcrypto/camellia/asm/cmll-x86.pl
@@ -723,11 +723,11 @@ my $bias=int(@T[0])?shift(@T):0;
 &function_end("Camellia_Ekeygen");
 
 if ($OPENSSL) {
-# int Camellia_set_key (
+# int private_Camellia_set_key (
 #		const unsigned char *userKey,
 #		int bits,
 #		CAMELLIA_KEY *key)
-&function_begin_B("Camellia_set_key");
+&function_begin_B("private_Camellia_set_key");
 	&push	("ebx");
 	&mov	("ecx",&wparam(0));	# pull arguments
 	&mov	("ebx",&wparam(1));
@@ -760,7 +760,7 @@ if ($OPENSSL) {
 &set_label("done",4);
 	&pop	("ebx");
 	&ret	();
-&function_end_B("Camellia_set_key");
+&function_end_B("private_Camellia_set_key");
 }
 
 @SBOX=(
diff --git a/src/lib/libcrypto/camellia/camellia.h b/src/lib/libcrypto/camellia/camellia.h
index cf0457dd97..67911e0adf 100644
--- a/src/lib/libcrypto/camellia/camellia.h
+++ b/src/lib/libcrypto/camellia/camellia.h
@@ -88,6 +88,10 @@ struct camellia_key_st
 	};
 typedef struct camellia_key_st CAMELLIA_KEY;
 
+#ifdef OPENSSL_FIPS
+int private_Camellia_set_key(const unsigned char *userKey, const int bits,
+	CAMELLIA_KEY *key);
+#endif
 int Camellia_set_key(const unsigned char *userKey, const int bits,
 	CAMELLIA_KEY *key);
 
diff --git a/src/lib/libcrypto/camellia/cmll_locl.h b/src/lib/libcrypto/camellia/cmll_locl.h
index 4a4d880d16..246b6ce1d8 100644
--- a/src/lib/libcrypto/camellia/cmll_locl.h
+++ b/src/lib/libcrypto/camellia/cmll_locl.h
@@ -71,7 +71,8 @@
 typedef unsigned int  u32;
 typedef unsigned char u8;
 
-int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, KEY_TABLE_TYPE keyTable);
+int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey,
+		     KEY_TABLE_TYPE keyTable);
 void Camellia_EncryptBlock_Rounds(int grandRounds, const u8 plaintext[], 
 		const KEY_TABLE_TYPE keyTable, u8 ciphertext[]);
 void Camellia_DecryptBlock_Rounds(int grandRounds, const u8 ciphertext[], 
@@ -80,4 +81,6 @@ void Camellia_EncryptBlock(int keyBitLength, const u8 plaintext[],
 		const KEY_TABLE_TYPE keyTable, u8 ciphertext[]);
 void Camellia_DecryptBlock(int keyBitLength, const u8 ciphertext[], 
 		const KEY_TABLE_TYPE keyTable, u8 plaintext[]);
+int private_Camellia_set_key(const unsigned char *userKey, const int bits,
+			     CAMELLIA_KEY *key);
 #endif /* #ifndef HEADER_CAMELLIA_LOCL_H */
diff --git a/src/lib/libcrypto/camellia/cmll_misc.c b/src/lib/libcrypto/camellia/cmll_misc.c
index f44689124b..f44d48564c 100644
--- a/src/lib/libcrypto/camellia/cmll_misc.c
+++ b/src/lib/libcrypto/camellia/cmll_misc.c
@@ -50,12 +50,13 @@
  */
  
 #include <openssl/opensslv.h>
+#include <openssl/crypto.h>
 #include <openssl/camellia.h>
 #include "cmll_locl.h"
 
 const char CAMELLIA_version[]="CAMELLIA" OPENSSL_VERSION_PTEXT;
 
-int Camellia_set_key(const unsigned char *userKey, const int bits,
+int private_Camellia_set_key(const unsigned char *userKey, const int bits,
 	CAMELLIA_KEY *key)
 	{
 	if(!userKey || !key)
diff --git a/src/lib/libcrypto/cast/c_skey.c b/src/lib/libcrypto/cast/c_skey.c
index 76e40005c9..cb6bf9fee3 100644
--- a/src/lib/libcrypto/cast/c_skey.c
+++ b/src/lib/libcrypto/cast/c_skey.c
@@ -56,6 +56,7 @@
  * [including the GNU Public Licence.]
  */
 
+#include <openssl/crypto.h>
 #include <openssl/cast.h>
 #include "cast_lcl.h"
 #include "cast_s.h"
@@ -71,8 +72,14 @@
 #define S5 CAST_S_table5
 #define S6 CAST_S_table6
 #define S7 CAST_S_table7
-
 void CAST_set_key(CAST_KEY *key, int len, const unsigned char *data)
+#ifdef OPENSSL_FIPS
+	{
+	fips_cipher_abort(CAST);
+	private_CAST_set_key(key, len, data);
+	}
+void private_CAST_set_key(CAST_KEY *key, int len, const unsigned char *data)
+#endif
 	{
 	CAST_LONG x[16];
 	CAST_LONG z[16];
diff --git a/src/lib/libcrypto/cast/cast.h b/src/lib/libcrypto/cast/cast.h
index 1a264f8143..203922ea2b 100644
--- a/src/lib/libcrypto/cast/cast.h
+++ b/src/lib/libcrypto/cast/cast.h
@@ -83,7 +83,9 @@ typedef struct cast_key_st
 	int short_key;	/* Use reduced rounds for short key */
 	} CAST_KEY;
 
- 
+#ifdef OPENSSL_FIPS 
+void private_CAST_set_key(CAST_KEY *key, int len, const unsigned char *data);
+#endif
 void CAST_set_key(CAST_KEY *key, int len, const unsigned char *data);
 void CAST_ecb_encrypt(const unsigned char *in, unsigned char *out, const CAST_KEY *key,
 		      int enc);
diff --git a/src/lib/libcrypto/cmac/cm_ameth.c b/src/lib/libcrypto/cmac/cm_ameth.c
new file mode 100644
index 0000000000..0b8e5670b0
--- /dev/null
+++ b/src/lib/libcrypto/cmac/cm_ameth.c
@@ -0,0 +1,97 @@
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "asn1_locl.h"
+
+/* CMAC "ASN1" method. This is just here to indicate the
+ * maximum CMAC output length and to free up a CMAC
+ * key.
+ */
+
+static int cmac_size(const EVP_PKEY *pkey)
+	{
+	return EVP_MAX_BLOCK_LENGTH;
+	}
+
+static void cmac_key_free(EVP_PKEY *pkey)
+	{
+	CMAC_CTX *cmctx = (CMAC_CTX *)pkey->pkey.ptr;
+	if (cmctx)
+		CMAC_CTX_free(cmctx);
+	}
+
+const EVP_PKEY_ASN1_METHOD cmac_asn1_meth = 
+	{
+	EVP_PKEY_CMAC,
+	EVP_PKEY_CMAC,
+	0,
+
+	"CMAC",
+	"OpenSSL CMAC method",
+
+	0,0,0,0,
+
+	0,0,0,
+
+	cmac_size,
+	0,
+	0,0,0,0,0,0,0,
+
+	cmac_key_free,
+	0,
+	0,0
+	};
+
diff --git a/src/lib/libcrypto/cmac/cm_pmeth.c b/src/lib/libcrypto/cmac/cm_pmeth.c
new file mode 100644
index 0000000000..072228ec7f
--- /dev/null
+++ b/src/lib/libcrypto/cmac/cm_pmeth.c
@@ -0,0 +1,224 @@
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project 2010.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include "cryptlib.h"
+#include <openssl/x509.h>
+#include <openssl/x509v3.h>
+#include <openssl/evp.h>
+#include <openssl/cmac.h>
+#include "evp_locl.h"
+
+/* The context structure and "key" is simply a CMAC_CTX */
+
+static int pkey_cmac_init(EVP_PKEY_CTX *ctx)
+	{
+	ctx->data = CMAC_CTX_new();
+	if (!ctx->data)
+		return 0;
+	ctx->keygen_info_count = 0;
+	return 1;
+	}
+
+static int pkey_cmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
+	{
+	if (!pkey_cmac_init(dst))
+		return 0;
+	if (!CMAC_CTX_copy(dst->data, src->data))
+		return 0;
+	return 1;
+	}
+
+static void pkey_cmac_cleanup(EVP_PKEY_CTX *ctx)
+	{
+	CMAC_CTX_free(ctx->data);
+	}
+
+static int pkey_cmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
+	{
+	CMAC_CTX *cmkey = CMAC_CTX_new();
+	CMAC_CTX *cmctx = ctx->data;
+	if (!cmkey)
+		return 0;
+	if (!CMAC_CTX_copy(cmkey, cmctx))
+		{
+		CMAC_CTX_free(cmkey);
+		return 0;
+		}
+	EVP_PKEY_assign(pkey, EVP_PKEY_CMAC, cmkey);
+	
+	return 1;
+	}
+
+static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count)
+	{
+	if (!CMAC_Update(ctx->pctx->data, data, count))
+		return 0;
+	return 1;
+	}
+
+static int cmac_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx)
+	{
+	EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT);
+	mctx->update = int_update;
+	return 1;
+	}
+
+static int cmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
+					EVP_MD_CTX *mctx)
+	{
+	return CMAC_Final(ctx->data, sig, siglen);
+	}
+
+static int pkey_cmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
+	{
+	CMAC_CTX *cmctx = ctx->data;
+	switch (type)
+		{
+
+		case EVP_PKEY_CTRL_SET_MAC_KEY:
+		if (!p2 || p1 < 0)
+			return 0;
+		if (!CMAC_Init(cmctx, p2, p1, NULL, NULL))
+			return 0;
+		break;
+
+		case EVP_PKEY_CTRL_CIPHER:
+		if (!CMAC_Init(cmctx, NULL, 0, p2, ctx->engine))
+			return 0;
+		break;
+
+		case EVP_PKEY_CTRL_MD:
+		if (ctx->pkey && !CMAC_CTX_copy(ctx->data,
+					(CMAC_CTX *)ctx->pkey->pkey.ptr))
+			return 0;
+		if (!CMAC_Init(cmctx, NULL, 0, NULL, NULL))
+			return 0;
+		break;
+
+		default:
+		return -2;
+
+		}
+	return 1;
+	}
+
+static int pkey_cmac_ctrl_str(EVP_PKEY_CTX *ctx,
+			const char *type, const char *value)
+	{
+	if (!value)
+		{
+		return 0;
+		}
+	if (!strcmp(type, "key"))
+		{
+		void *p = (void *)value;
+		return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY,
+								strlen(p), p);
+		}
+	if (!strcmp(type, "cipher"))
+		{
+		const EVP_CIPHER *c;
+		c = EVP_get_cipherbyname(value);
+		if (!c)
+			return 0;
+		return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_CIPHER, -1, (void *)c);
+		}
+	if (!strcmp(type, "hexkey"))
+		{
+		unsigned char *key;
+		int r;
+		long keylen;
+		key = string_to_hex(value, &keylen);
+		if (!key)
+			return 0;
+		r = pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key);
+		OPENSSL_free(key);
+		return r;
+		}
+	return -2;
+	}
+
+const EVP_PKEY_METHOD cmac_pkey_meth = 
+	{
+	EVP_PKEY_CMAC,
+	EVP_PKEY_FLAG_SIGCTX_CUSTOM,
+	pkey_cmac_init,
+	pkey_cmac_copy,
+	pkey_cmac_cleanup,
+
+	0, 0,
+
+	0,
+	pkey_cmac_keygen,
+
+	0, 0,
+
+	0, 0,
+
+	0,0,
+
+	cmac_signctx_init,
+	cmac_signctx,
+
+	0,0,
+
+	0,0,
+
+	0,0,
+
+	0,0,
+
+	pkey_cmac_ctrl,
+	pkey_cmac_ctrl_str
+
+	};
diff --git a/src/lib/libcrypto/cmac/cmac.c b/src/lib/libcrypto/cmac/cmac.c
new file mode 100644
index 0000000000..8b72b09681
--- /dev/null
+++ b/src/lib/libcrypto/cmac/cmac.c
@@ -0,0 +1,308 @@
+/* crypto/cmac/cmac.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "cryptlib.h"
+#include <openssl/cmac.h>
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
+struct CMAC_CTX_st
+	{
+	/* Cipher context to use */
+	EVP_CIPHER_CTX cctx;
+	/* Keys k1 and k2 */
+	unsigned char k1[EVP_MAX_BLOCK_LENGTH];
+	unsigned char k2[EVP_MAX_BLOCK_LENGTH];
+	/* Temporary block */
+	unsigned char tbl[EVP_MAX_BLOCK_LENGTH];
+	/* Last (possibly partial) block */
+	unsigned char last_block[EVP_MAX_BLOCK_LENGTH];
+	/* Number of bytes in last block: -1 means context not initialised */
+	int nlast_block;
+	};
+
+
+/* Make temporary keys K1 and K2 */
+
+static void make_kn(unsigned char *k1, unsigned char *l, int bl)
+	{
+	int i;
+	/* Shift block to left, including carry */
+	for (i = 0; i < bl; i++)
+		{
+		k1[i] = l[i] << 1;
+		if (i < bl - 1 && l[i + 1] & 0x80)
+			k1[i] |= 1;
+		}
+	/* If MSB set fixup with R */
+	if (l[0] & 0x80)
+		k1[bl - 1] ^= bl == 16 ? 0x87 : 0x1b;
+	}
+
+CMAC_CTX *CMAC_CTX_new(void)
+	{
+	CMAC_CTX *ctx;
+	ctx = OPENSSL_malloc(sizeof(CMAC_CTX));
+	if (!ctx)
+		return NULL;
+	EVP_CIPHER_CTX_init(&ctx->cctx);
+	ctx->nlast_block = -1;
+	return ctx;
+	}
+
+void CMAC_CTX_cleanup(CMAC_CTX *ctx)
+	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->cctx.engine)
+		{
+		FIPS_cmac_ctx_cleanup(ctx);
+		return;
+		}
+#endif
+	EVP_CIPHER_CTX_cleanup(&ctx->cctx);
+	OPENSSL_cleanse(ctx->tbl, EVP_MAX_BLOCK_LENGTH);
+	OPENSSL_cleanse(ctx->k1, EVP_MAX_BLOCK_LENGTH);
+	OPENSSL_cleanse(ctx->k2, EVP_MAX_BLOCK_LENGTH);
+	OPENSSL_cleanse(ctx->last_block, EVP_MAX_BLOCK_LENGTH);
+	ctx->nlast_block = -1;
+	}
+
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx)
+	{
+	return &ctx->cctx;
+	}
+
+void CMAC_CTX_free(CMAC_CTX *ctx)
+	{
+	CMAC_CTX_cleanup(ctx);
+	OPENSSL_free(ctx);
+	}
+
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in)
+	{
+	int bl;
+	if (in->nlast_block == -1)
+		return 0;
+	if (!EVP_CIPHER_CTX_copy(&out->cctx, &in->cctx))
+		return 0;
+	bl = EVP_CIPHER_CTX_block_size(&in->cctx);
+	memcpy(out->k1, in->k1, bl);
+	memcpy(out->k2, in->k2, bl);
+	memcpy(out->tbl, in->tbl, bl);
+	memcpy(out->last_block, in->last_block, bl);
+	out->nlast_block = in->nlast_block;
+	return 1;
+	}
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, 
+			const EVP_CIPHER *cipher, ENGINE *impl)
+	{
+	static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH];
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		{
+		/* If we have an ENGINE need to allow non FIPS */
+		if ((impl || ctx->cctx.engine)
+			&& !(ctx->cctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))
+
+			{
+			EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS);
+			return 0;
+			}
+		/* Other algorithm blocking will be done in FIPS_cmac_init,
+		 * via FIPS_cipherinit().
+		 */
+		if (!impl && !ctx->cctx.engine)
+			return FIPS_cmac_init(ctx, key, keylen, cipher, NULL);
+		}
+#endif
+	/* All zeros means restart */
+	if (!key && !cipher && !impl && keylen == 0)
+		{
+		/* Not initialised */
+		if (ctx->nlast_block == -1)
+			return 0;
+		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+			return 0;
+		memset(ctx->tbl, 0, EVP_CIPHER_CTX_block_size(&ctx->cctx));
+		ctx->nlast_block = 0;
+		return 1;
+		}
+	/* Initialiase context */
+	if (cipher && !EVP_EncryptInit_ex(&ctx->cctx, cipher, impl, NULL, NULL))
+		return 0;
+	/* Non-NULL key means initialisation complete */
+	if (key)
+		{
+		int bl;
+		if (!EVP_CIPHER_CTX_cipher(&ctx->cctx))
+			return 0;
+		if (!EVP_CIPHER_CTX_set_key_length(&ctx->cctx, keylen))
+			return 0;
+		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, key, zero_iv))
+			return 0;
+		bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, zero_iv, bl))
+			return 0;
+		make_kn(ctx->k1, ctx->tbl, bl);
+		make_kn(ctx->k2, ctx->k1, bl);
+		OPENSSL_cleanse(ctx->tbl, bl);
+		/* Reset context again ready for first data block */
+		if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv))
+			return 0;
+		/* Zero tbl so resume works */
+		memset(ctx->tbl, 0, bl);
+		ctx->nlast_block = 0;
+		}
+	return 1;
+	}
+
+int CMAC_Update(CMAC_CTX *ctx, const void *in, size_t dlen)
+	{
+	const unsigned char *data = in;
+	size_t bl;
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->cctx.engine)
+		return FIPS_cmac_update(ctx, in, dlen);
+#endif
+	if (ctx->nlast_block == -1)
+		return 0;
+	if (dlen == 0)
+		return 1;
+	bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+	/* Copy into partial block if we need to */
+	if (ctx->nlast_block > 0)
+		{
+		size_t nleft;
+		nleft = bl - ctx->nlast_block;
+		if (dlen < nleft)
+			nleft = dlen;
+		memcpy(ctx->last_block + ctx->nlast_block, data, nleft);
+		dlen -= nleft;
+		ctx->nlast_block += nleft;
+		/* If no more to process return */
+		if (dlen == 0)
+			return 1;
+		data += nleft;
+		/* Else not final block so encrypt it */
+		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, ctx->last_block,bl))
+			return 0;
+		}
+	/* Encrypt all but one of the complete blocks left */
+	while(dlen > bl)
+		{
+		if (!EVP_Cipher(&ctx->cctx, ctx->tbl, data, bl))
+			return 0;
+		dlen -= bl;
+		data += bl;
+		}
+	/* Copy any data left to last block buffer */
+	memcpy(ctx->last_block, data, dlen);
+	ctx->nlast_block = dlen;
+	return 1;
+
+	}
+
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen)
+	{
+	int i, bl, lb;
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->cctx.engine)
+		return FIPS_cmac_final(ctx, out, poutlen);
+#endif
+	if (ctx->nlast_block == -1)
+		return 0;
+	bl = EVP_CIPHER_CTX_block_size(&ctx->cctx);
+	*poutlen = (size_t)bl;
+	if (!out)
+		return 1;
+	lb = ctx->nlast_block;
+	/* Is last block complete? */
+	if (lb == bl)
+		{
+		for (i = 0; i < bl; i++)
+			out[i] = ctx->last_block[i] ^ ctx->k1[i];
+		}
+	else
+		{
+		ctx->last_block[lb] = 0x80;
+		if (bl - lb > 1)
+			memset(ctx->last_block + lb + 1, 0, bl - lb - 1);
+		for (i = 0; i < bl; i++)
+			out[i] = ctx->last_block[i] ^ ctx->k2[i];
+		}
+	if (!EVP_Cipher(&ctx->cctx, out, out, bl))
+		{
+		OPENSSL_cleanse(out, bl);	
+		return 0;
+		}
+	return 1;
+	}
+
+int CMAC_resume(CMAC_CTX *ctx)
+	{
+	if (ctx->nlast_block == -1)
+		return 0;
+	/* The buffer "tbl" containes the last fully encrypted block
+	 * which is the last IV (or all zeroes if no last encrypted block).
+	 * The last block has not been modified since CMAC_final().
+	 * So reinitliasing using the last decrypted block will allow
+	 * CMAC to continue after calling CMAC_Final(). 
+	 */
+	return EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, ctx->tbl);
+	}
diff --git a/src/lib/libcrypto/cmac/cmac.h b/src/lib/libcrypto/cmac/cmac.h
new file mode 100644
index 0000000000..712e92dced
--- /dev/null
+++ b/src/lib/libcrypto/cmac/cmac.h
@@ -0,0 +1,82 @@
+/* crypto/cmac/cmac.h */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+
+#ifndef HEADER_CMAC_H
+#define HEADER_CMAC_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <openssl/evp.h>
+
+/* Opaque */
+typedef struct CMAC_CTX_st CMAC_CTX;
+
+CMAC_CTX *CMAC_CTX_new(void);
+void CMAC_CTX_cleanup(CMAC_CTX *ctx);
+void CMAC_CTX_free(CMAC_CTX *ctx);
+EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx);
+int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in);
+
+int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, 
+			const EVP_CIPHER *cipher, ENGINE *impl);
+int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen);
+int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen);
+int CMAC_resume(CMAC_CTX *ctx);
+
+#ifdef  __cplusplus
+}
+#endif
+#endif
diff --git a/src/lib/libcrypto/cms/cms.h b/src/lib/libcrypto/cms/cms.h
index 09c45d0412..36994fa6a2 100644
--- a/src/lib/libcrypto/cms/cms.h
+++ b/src/lib/libcrypto/cms/cms.h
@@ -111,6 +111,7 @@ DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo)
 #define CMS_PARTIAL			0x4000
 #define CMS_REUSE_DIGEST		0x8000
 #define CMS_USE_KEYID			0x10000
+#define CMS_DEBUG_DECRYPT		0x20000
 
 const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms);
 
@@ -184,6 +185,8 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert);
 int CMS_decrypt_set1_key(CMS_ContentInfo *cms, 
 				unsigned char *key, size_t keylen,
 				unsigned char *id, size_t idlen);
+int CMS_decrypt_set1_password(CMS_ContentInfo *cms, 
+				unsigned char *pass, ossl_ssize_t passlen);
 
 STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms);
 int CMS_RecipientInfo_type(CMS_RecipientInfo *ri);
@@ -219,6 +222,16 @@ int CMS_RecipientInfo_set0_key(CMS_RecipientInfo *ri,
 int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri, 
 					const unsigned char *id, size_t idlen);
 
+int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, 
+					unsigned char *pass,
+					ossl_ssize_t passlen);
+
+CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms,
+					int iter, int wrap_nid, int pbe_nid,
+					unsigned char *pass,
+					ossl_ssize_t passlen,
+					const EVP_CIPHER *kekciph);
+
 int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri);
 	
 int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out,
@@ -330,6 +343,7 @@ void ERR_load_CMS_strings(void);
 #define CMS_F_CHECK_CONTENT				 99
 #define CMS_F_CMS_ADD0_CERT				 164
 #define CMS_F_CMS_ADD0_RECIPIENT_KEY			 100
+#define CMS_F_CMS_ADD0_RECIPIENT_PASSWORD		 165
 #define CMS_F_CMS_ADD1_RECEIPTREQUEST			 158
 #define CMS_F_CMS_ADD1_RECIPIENT_CERT			 101
 #define CMS_F_CMS_ADD1_SIGNER				 102
@@ -344,6 +358,7 @@ void ERR_load_CMS_strings(void);
 #define CMS_F_CMS_DATAINIT				 111
 #define CMS_F_CMS_DECRYPT				 112
 #define CMS_F_CMS_DECRYPT_SET1_KEY			 113
+#define CMS_F_CMS_DECRYPT_SET1_PASSWORD			 166
 #define CMS_F_CMS_DECRYPT_SET1_PKEY			 114
 #define CMS_F_CMS_DIGESTALGORITHM_FIND_CTX		 115
 #define CMS_F_CMS_DIGESTALGORITHM_INIT_BIO		 116
@@ -378,7 +393,9 @@ void ERR_load_CMS_strings(void);
 #define CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT		 141
 #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS		 142
 #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID	 143
+#define CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT		 167
 #define CMS_F_CMS_RECIPIENTINFO_SET0_KEY		 144
+#define CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD		 168
 #define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY		 145
 #define CMS_F_CMS_SET1_SIGNERIDENTIFIER			 146
 #define CMS_F_CMS_SET_DETACHED				 147
@@ -419,6 +436,7 @@ void ERR_load_CMS_strings(void);
 #define CMS_R_ERROR_SETTING_KEY				 115
 #define CMS_R_ERROR_SETTING_RECIPIENTINFO		 116
 #define CMS_R_INVALID_ENCRYPTED_KEY_LENGTH		 117
+#define CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER		 176
 #define CMS_R_INVALID_KEY_LENGTH			 118
 #define CMS_R_MD_BIO_INIT_ERROR				 119
 #define CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH	 120
@@ -431,6 +449,7 @@ void ERR_load_CMS_strings(void);
 #define CMS_R_NOT_ENCRYPTED_DATA			 122
 #define CMS_R_NOT_KEK					 123
 #define CMS_R_NOT_KEY_TRANSPORT				 124
+#define CMS_R_NOT_PWRI					 177
 #define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE		 125
 #define CMS_R_NO_CIPHER					 126
 #define CMS_R_NO_CONTENT				 127
@@ -443,6 +462,7 @@ void ERR_load_CMS_strings(void);
 #define CMS_R_NO_MATCHING_RECIPIENT			 132
 #define CMS_R_NO_MATCHING_SIGNATURE			 166
 #define CMS_R_NO_MSGSIGDIGEST				 167
+#define CMS_R_NO_PASSWORD				 178
 #define CMS_R_NO_PRIVATE_KEY				 133
 #define CMS_R_NO_PUBLIC_KEY				 134
 #define CMS_R_NO_RECEIPT_REQUEST			 168
@@ -466,10 +486,12 @@ void ERR_load_CMS_strings(void);
 #define CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM		 151
 #define CMS_R_UNSUPPORTED_CONTENT_TYPE			 152
 #define CMS_R_UNSUPPORTED_KEK_ALGORITHM			 153
+#define CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM	 179
 #define CMS_R_UNSUPPORTED_RECIPIENT_TYPE		 154
 #define CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE		 155
 #define CMS_R_UNSUPPORTED_TYPE				 156
 #define CMS_R_UNWRAP_ERROR				 157
+#define CMS_R_UNWRAP_FAILURE				 180
 #define CMS_R_VERIFICATION_FAILURE			 158
 #define CMS_R_WRAP_ERROR				 159
 
diff --git a/src/lib/libcrypto/cms/cms_asn1.c b/src/lib/libcrypto/cms/cms_asn1.c
index fcba4dcbcc..cfe67fb6c1 100644
--- a/src/lib/libcrypto/cms/cms_asn1.c
+++ b/src/lib/libcrypto/cms/cms_asn1.c
@@ -237,6 +237,15 @@ static int cms_ri_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
 				OPENSSL_free(kekri->key);
 				}
 			}
+		else if (ri->type == CMS_RECIPINFO_PASS)
+			{
+			CMS_PasswordRecipientInfo *pwri = ri->d.pwri;
+			if (pwri->pass)
+				{
+				OPENSSL_cleanse(pwri->pass, pwri->passlen);
+				OPENSSL_free(pwri->pass);
+				}
+			}
 		}
 	return 1;
 	}
diff --git a/src/lib/libcrypto/cms/cms_enc.c b/src/lib/libcrypto/cms/cms_enc.c
index bab26235bd..f873ce3794 100644
--- a/src/lib/libcrypto/cms/cms_enc.c
+++ b/src/lib/libcrypto/cms/cms_enc.c
@@ -73,6 +73,8 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec)
 	const EVP_CIPHER *ciph;
 	X509_ALGOR *calg = ec->contentEncryptionAlgorithm;
 	unsigned char iv[EVP_MAX_IV_LENGTH], *piv = NULL;
+	unsigned char *tkey = NULL;
+	size_t tkeylen;
 
 	int ok = 0;
 
@@ -137,32 +139,57 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec)
 				CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR);
 		goto err;
 		}
-
-
-	if (enc && !ec->key)
+	tkeylen = EVP_CIPHER_CTX_key_length(ctx);
+	/* Generate random session key */
+	if (!enc || !ec->key)
 		{
-		/* Generate random key */
-		if (!ec->keylen)
-			ec->keylen = EVP_CIPHER_CTX_key_length(ctx);
-		ec->key = OPENSSL_malloc(ec->keylen);
-		if (!ec->key)
+		tkey = OPENSSL_malloc(tkeylen);
+		if (!tkey)
 			{
 			CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
 							ERR_R_MALLOC_FAILURE);
 			goto err;
 			}
-		if (EVP_CIPHER_CTX_rand_key(ctx, ec->key) <= 0)
+		if (EVP_CIPHER_CTX_rand_key(ctx, tkey) <= 0)
 			goto err;
-		keep_key = 1;
 		}
-	else if (ec->keylen != (unsigned int)EVP_CIPHER_CTX_key_length(ctx))
+
+	if (!ec->key)
+		{
+		ec->key = tkey;
+		ec->keylen = tkeylen;
+		tkey = NULL;
+		if (enc)
+			keep_key = 1;
+		else
+			ERR_clear_error();
+		
+		}
+
+	if (ec->keylen != tkeylen)
 		{
 		/* If necessary set key length */
 		if (EVP_CIPHER_CTX_set_key_length(ctx, ec->keylen) <= 0)
 			{
-			CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
-				CMS_R_INVALID_KEY_LENGTH);
-			goto err;
+			/* Only reveal failure if debugging so we don't
+			 * leak information which may be useful in MMA.
+			 */
+			if (enc || ec->debug)
+				{
+				CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO,
+						CMS_R_INVALID_KEY_LENGTH);
+				goto err;
+				}
+			else
+				{
+				/* Use random key */
+				OPENSSL_cleanse(ec->key, ec->keylen);
+				OPENSSL_free(ec->key);
+				ec->key = tkey;
+				ec->keylen = tkeylen;
+				tkey = NULL;
+				ERR_clear_error();
+				}
 			}
 		}
 
@@ -198,6 +225,11 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec)
 		OPENSSL_free(ec->key);
 		ec->key = NULL;
 		}
+	if (tkey)
+		{
+		OPENSSL_cleanse(tkey, tkeylen);
+		OPENSSL_free(tkey);
+		}
 	if (ok)
 		return b;
 	BIO_free(b);
diff --git a/src/lib/libcrypto/cms/cms_env.c b/src/lib/libcrypto/cms/cms_env.c
index b3237d4b94..be20b1c024 100644
--- a/src/lib/libcrypto/cms/cms_env.c
+++ b/src/lib/libcrypto/cms/cms_env.c
@@ -65,14 +65,13 @@
 /* CMS EnvelopedData Utilities */
 
 DECLARE_ASN1_ITEM(CMS_EnvelopedData)
-DECLARE_ASN1_ITEM(CMS_RecipientInfo)
 DECLARE_ASN1_ITEM(CMS_KeyTransRecipientInfo)
 DECLARE_ASN1_ITEM(CMS_KEKRecipientInfo)
 DECLARE_ASN1_ITEM(CMS_OtherKeyAttribute)
 
 DECLARE_STACK_OF(CMS_RecipientInfo)
 
-static CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms)
+CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms)
 	{
 	if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_enveloped)
 		{
@@ -371,6 +370,8 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms,
 	unsigned char *ek = NULL;
 	size_t eklen;
 	int ret = 0;
+	CMS_EncryptedContentInfo *ec;
+	ec = cms->d.envelopedData->encryptedContentInfo;
 
 	if (ktri->pkey == NULL)
 		{
@@ -417,8 +418,14 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms,
 
 	ret = 1;
 
-	cms->d.envelopedData->encryptedContentInfo->key = ek;
-	cms->d.envelopedData->encryptedContentInfo->keylen = eklen;
+	if (ec->key)
+		{
+		OPENSSL_cleanse(ec->key, ec->keylen);
+		OPENSSL_free(ec->key);
+		}
+
+	ec->key = ek;
+	ec->keylen = eklen;
 
 	err:
 	if (pctx)
@@ -786,6 +793,9 @@ int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri)
 		case CMS_RECIPINFO_KEK:
 		return cms_RecipientInfo_kekri_decrypt(cms, ri);
 
+		case CMS_RECIPINFO_PASS:
+		return cms_RecipientInfo_pwri_crypt(cms, ri, 0);
+
 		default:
 		CMSerr(CMS_F_CMS_RECIPIENTINFO_DECRYPT,
 			CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE);
@@ -829,6 +839,10 @@ BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms)
 			r = cms_RecipientInfo_kekri_encrypt(cms, ri);
 			break;
 
+			case CMS_RECIPINFO_PASS:
+			r = cms_RecipientInfo_pwri_crypt(cms, ri, 1);
+			break;
+
 			default:
 			CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO,
 				CMS_R_UNSUPPORTED_RECIPIENT_TYPE);
diff --git a/src/lib/libcrypto/cms/cms_err.c b/src/lib/libcrypto/cms/cms_err.c
index ff7b0309e5..8330ead7ed 100644
--- a/src/lib/libcrypto/cms/cms_err.c
+++ b/src/lib/libcrypto/cms/cms_err.c
@@ -1,6 +1,6 @@
 /* crypto/cms/cms_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2007 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2009 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -73,6 +73,7 @@ static ERR_STRING_DATA CMS_str_functs[]=
 {ERR_FUNC(CMS_F_CHECK_CONTENT),	"CHECK_CONTENT"},
 {ERR_FUNC(CMS_F_CMS_ADD0_CERT),	"CMS_add0_cert"},
 {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_KEY),	"CMS_add0_recipient_key"},
+{ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD),	"CMS_add0_recipient_password"},
 {ERR_FUNC(CMS_F_CMS_ADD1_RECEIPTREQUEST),	"CMS_add1_ReceiptRequest"},
 {ERR_FUNC(CMS_F_CMS_ADD1_RECIPIENT_CERT),	"CMS_add1_recipient_cert"},
 {ERR_FUNC(CMS_F_CMS_ADD1_SIGNER),	"CMS_add1_signer"},
@@ -87,6 +88,7 @@ static ERR_STRING_DATA CMS_str_functs[]=
 {ERR_FUNC(CMS_F_CMS_DATAINIT),	"CMS_dataInit"},
 {ERR_FUNC(CMS_F_CMS_DECRYPT),	"CMS_decrypt"},
 {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_KEY),	"CMS_decrypt_set1_key"},
+{ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PASSWORD),	"CMS_decrypt_set1_password"},
 {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PKEY),	"CMS_decrypt_set1_pkey"},
 {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX),	"cms_DigestAlgorithm_find_ctx"},
 {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO),	"cms_DigestAlgorithm_init_bio"},
@@ -105,7 +107,7 @@ static ERR_STRING_DATA CMS_str_functs[]=
 {ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES),	"CMS_GET0_CERTIFICATE_CHOICES"},
 {ERR_FUNC(CMS_F_CMS_GET0_CONTENT),	"CMS_get0_content"},
 {ERR_FUNC(CMS_F_CMS_GET0_ECONTENT_TYPE),	"CMS_GET0_ECONTENT_TYPE"},
-{ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED),	"CMS_GET0_ENVELOPED"},
+{ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED),	"cms_get0_enveloped"},
 {ERR_FUNC(CMS_F_CMS_GET0_REVOCATION_CHOICES),	"CMS_GET0_REVOCATION_CHOICES"},
 {ERR_FUNC(CMS_F_CMS_GET0_SIGNED),	"CMS_GET0_SIGNED"},
 {ERR_FUNC(CMS_F_CMS_MSGSIGDIGEST_ADD1),	"cms_msgSigDigest_add1"},
@@ -121,7 +123,9 @@ static ERR_STRING_DATA CMS_str_functs[]=
 {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT),	"CMS_RECIPIENTINFO_KTRI_ENCRYPT"},
 {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS),	"CMS_RecipientInfo_ktri_get0_algs"},
 {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID),	"CMS_RecipientInfo_ktri_get0_signer_id"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT),	"cms_RecipientInfo_pwri_crypt"},
 {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_KEY),	"CMS_RecipientInfo_set0_key"},
+{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD),	"CMS_RecipientInfo_set0_password"},
 {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY),	"CMS_RecipientInfo_set0_pkey"},
 {ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER),	"cms_set1_SignerIdentifier"},
 {ERR_FUNC(CMS_F_CMS_SET_DETACHED),	"CMS_set_detached"},
@@ -165,6 +169,7 @@ static ERR_STRING_DATA CMS_str_reasons[]=
 {ERR_REASON(CMS_R_ERROR_SETTING_KEY)     ,"error setting key"},
 {ERR_REASON(CMS_R_ERROR_SETTING_RECIPIENTINFO),"error setting recipientinfo"},
 {ERR_REASON(CMS_R_INVALID_ENCRYPTED_KEY_LENGTH),"invalid encrypted key length"},
+{ERR_REASON(CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER),"invalid key encryption parameter"},
 {ERR_REASON(CMS_R_INVALID_KEY_LENGTH)    ,"invalid key length"},
 {ERR_REASON(CMS_R_MD_BIO_INIT_ERROR)     ,"md bio init error"},
 {ERR_REASON(CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH),"messagedigest attribute wrong length"},
@@ -177,6 +182,7 @@ static ERR_STRING_DATA CMS_str_reasons[]=
 {ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA)    ,"not encrypted data"},
 {ERR_REASON(CMS_R_NOT_KEK)               ,"not kek"},
 {ERR_REASON(CMS_R_NOT_KEY_TRANSPORT)     ,"not key transport"},
+{ERR_REASON(CMS_R_NOT_PWRI)              ,"not pwri"},
 {ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),"not supported for this key type"},
 {ERR_REASON(CMS_R_NO_CIPHER)             ,"no cipher"},
 {ERR_REASON(CMS_R_NO_CONTENT)            ,"no content"},
@@ -189,6 +195,7 @@ static ERR_STRING_DATA CMS_str_reasons[]=
 {ERR_REASON(CMS_R_NO_MATCHING_RECIPIENT) ,"no matching recipient"},
 {ERR_REASON(CMS_R_NO_MATCHING_SIGNATURE) ,"no matching signature"},
 {ERR_REASON(CMS_R_NO_MSGSIGDIGEST)       ,"no msgsigdigest"},
+{ERR_REASON(CMS_R_NO_PASSWORD)           ,"no password"},
 {ERR_REASON(CMS_R_NO_PRIVATE_KEY)        ,"no private key"},
 {ERR_REASON(CMS_R_NO_PUBLIC_KEY)         ,"no public key"},
 {ERR_REASON(CMS_R_NO_RECEIPT_REQUEST)    ,"no receipt request"},
@@ -212,10 +219,12 @@ static ERR_STRING_DATA CMS_str_reasons[]=
 {ERR_REASON(CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM),"unsupported compression algorithm"},
 {ERR_REASON(CMS_R_UNSUPPORTED_CONTENT_TYPE),"unsupported content type"},
 {ERR_REASON(CMS_R_UNSUPPORTED_KEK_ALGORITHM),"unsupported kek algorithm"},
+{ERR_REASON(CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM),"unsupported key encryption algorithm"},
 {ERR_REASON(CMS_R_UNSUPPORTED_RECIPIENT_TYPE),"unsupported recipient type"},
 {ERR_REASON(CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE),"unsupported recpientinfo type"},
 {ERR_REASON(CMS_R_UNSUPPORTED_TYPE)      ,"unsupported type"},
 {ERR_REASON(CMS_R_UNWRAP_ERROR)          ,"unwrap error"},
+{ERR_REASON(CMS_R_UNWRAP_FAILURE)        ,"unwrap failure"},
 {ERR_REASON(CMS_R_VERIFICATION_FAILURE)  ,"verification failure"},
 {ERR_REASON(CMS_R_WRAP_ERROR)            ,"wrap error"},
 {0,NULL}
diff --git a/src/lib/libcrypto/cms/cms_lcl.h b/src/lib/libcrypto/cms/cms_lcl.h
index c8ecfa724a..a9f9730157 100644
--- a/src/lib/libcrypto/cms/cms_lcl.h
+++ b/src/lib/libcrypto/cms/cms_lcl.h
@@ -175,6 +175,8 @@ struct CMS_EncryptedContentInfo_st
 	const EVP_CIPHER *cipher;
 	unsigned char *key;
 	size_t keylen;
+	/* Set to 1 if we are debugging decrypt and don't fake keys for MMA */
+	int debug;
 	};
 
 struct CMS_RecipientInfo_st
@@ -273,6 +275,9 @@ struct CMS_PasswordRecipientInfo_st
  	X509_ALGOR *keyDerivationAlgorithm;
  	X509_ALGOR *keyEncryptionAlgorithm;
  	ASN1_OCTET_STRING *encryptedKey;
+	/* Extra info: password to use */
+	unsigned char *pass;
+	size_t passlen;
 	};
 
 struct CMS_OtherRecipientInfo_st
@@ -411,6 +416,8 @@ DECLARE_ASN1_ITEM(CMS_SignerInfo)
 DECLARE_ASN1_ITEM(CMS_IssuerAndSerialNumber)
 DECLARE_ASN1_ITEM(CMS_Attributes_Sign)
 DECLARE_ASN1_ITEM(CMS_Attributes_Verify)
+DECLARE_ASN1_ITEM(CMS_RecipientInfo)
+DECLARE_ASN1_ITEM(CMS_PasswordRecipientInfo)
 DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber)
 
 #define CMS_SIGNERINFO_ISSUER_SERIAL	0
@@ -454,6 +461,11 @@ int cms_msgSigDigest_add1(CMS_SignerInfo *dest, CMS_SignerInfo *src);
 ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si);
 
 BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms);
+CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms);
+
+/* PWRI routines */
+int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri,
+							int en_de);
 	
 #ifdef  __cplusplus
 }
diff --git a/src/lib/libcrypto/cms/cms_lib.c b/src/lib/libcrypto/cms/cms_lib.c
index d00fe0f87b..f88e8f3b52 100644
--- a/src/lib/libcrypto/cms/cms_lib.c
+++ b/src/lib/libcrypto/cms/cms_lib.c
@@ -412,8 +412,7 @@ int cms_DigestAlgorithm_find_ctx(EVP_MD_CTX *mctx, BIO *chain,
 		 */
 			|| EVP_MD_pkey_type(EVP_MD_CTX_md(mtmp)) == nid)
 			{
-			EVP_MD_CTX_copy_ex(mctx, mtmp);
-			return 1;
+			return EVP_MD_CTX_copy_ex(mctx, mtmp);
 			}
 		chain = BIO_next(chain);
 		}
diff --git a/src/lib/libcrypto/cms/cms_pwri.c b/src/lib/libcrypto/cms/cms_pwri.c
new file mode 100644
index 0000000000..b79612a12d
--- /dev/null
+++ b/src/lib/libcrypto/cms/cms_pwri.c
@@ -0,0 +1,454 @@
+/* crypto/cms/cms_pwri.c */
+/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL
+ * project.
+ */
+/* ====================================================================
+ * Copyright (c) 2009 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include "cryptlib.h"
+#include <openssl/asn1t.h>
+#include <openssl/pem.h>
+#include <openssl/x509v3.h>
+#include <openssl/err.h>
+#include <openssl/cms.h>
+#include <openssl/rand.h>
+#include <openssl/aes.h>
+#include "cms_lcl.h"
+#include "asn1_locl.h"
+
+int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, 
+				unsigned char *pass, ossl_ssize_t passlen)
+	{
+	CMS_PasswordRecipientInfo *pwri;
+	if (ri->type != CMS_RECIPINFO_PASS)
+		{
+		CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD, CMS_R_NOT_PWRI);
+		return 0;
+		}
+
+	pwri = ri->d.pwri;
+	pwri->pass = pass;
+	if (pass && passlen < 0)
+		passlen = strlen((char *)pass);
+	pwri->passlen = passlen;
+	return 1;
+	}
+
+CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms,
+					int iter, int wrap_nid, int pbe_nid,
+					unsigned char *pass,
+					ossl_ssize_t passlen,
+					const EVP_CIPHER *kekciph)
+	{
+	CMS_RecipientInfo *ri = NULL;
+	CMS_EnvelopedData *env;
+	CMS_PasswordRecipientInfo *pwri;
+	EVP_CIPHER_CTX ctx;
+	X509_ALGOR *encalg = NULL;
+	unsigned char iv[EVP_MAX_IV_LENGTH];
+	int ivlen;
+	env = cms_get0_enveloped(cms);
+	if (!env)
+		goto err;
+
+	if (wrap_nid <= 0)
+		wrap_nid = NID_id_alg_PWRI_KEK;
+
+	if (pbe_nid <= 0)
+		pbe_nid = NID_id_pbkdf2;
+
+	/* Get from enveloped data */
+	if (kekciph == NULL)
+		kekciph = env->encryptedContentInfo->cipher;
+
+	if (kekciph == NULL)
+		{
+		CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, CMS_R_NO_CIPHER);
+		return NULL;
+		}
+	if (wrap_nid != NID_id_alg_PWRI_KEK)
+		{
+		CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+				CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM);
+		return NULL;
+		}
+
+	/* Setup algorithm identifier for cipher */
+	encalg = X509_ALGOR_new();
+	EVP_CIPHER_CTX_init(&ctx);
+
+	if (EVP_EncryptInit_ex(&ctx, kekciph, NULL, NULL, NULL) <= 0)
+		{
+		CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_EVP_LIB);
+		goto err;
+		}
+
+	ivlen = EVP_CIPHER_CTX_iv_length(&ctx);
+
+	if (ivlen > 0)
+		{
+		if (RAND_pseudo_bytes(iv, ivlen) <= 0)
+			goto err;
+		if (EVP_EncryptInit_ex(&ctx, NULL, NULL, NULL, iv) <= 0)
+			{
+			CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+							ERR_R_EVP_LIB);
+			goto err;
+			}
+		encalg->parameter = ASN1_TYPE_new();
+		if (!encalg->parameter)
+			{
+			CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+							ERR_R_MALLOC_FAILURE);
+			goto err;
+			}
+		if (EVP_CIPHER_param_to_asn1(&ctx, encalg->parameter) <= 0)
+			{
+			CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD,
+				CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR);
+			goto err;
+			}
+		}
+
+
+	encalg->algorithm = OBJ_nid2obj(EVP_CIPHER_CTX_type(&ctx));
+
+	EVP_CIPHER_CTX_cleanup(&ctx);
+
+	/* Initialize recipient info */
+	ri = M_ASN1_new_of(CMS_RecipientInfo);
+	if (!ri)
+		goto merr;
+
+	ri->d.pwri = M_ASN1_new_of(CMS_PasswordRecipientInfo);
+	if (!ri->d.pwri)
+		goto merr;
+	ri->type = CMS_RECIPINFO_PASS;
+
+	pwri = ri->d.pwri;
+	/* Since this is overwritten, free up empty structure already there */
+	X509_ALGOR_free(pwri->keyEncryptionAlgorithm);
+	pwri->keyEncryptionAlgorithm = X509_ALGOR_new();
+	if (!pwri->keyEncryptionAlgorithm)
+		goto merr;
+	pwri->keyEncryptionAlgorithm->algorithm = OBJ_nid2obj(wrap_nid);
+	pwri->keyEncryptionAlgorithm->parameter = ASN1_TYPE_new();
+	if (!pwri->keyEncryptionAlgorithm->parameter)
+		goto merr;
+
+        if(!ASN1_item_pack(encalg, ASN1_ITEM_rptr(X509_ALGOR),
+	    &pwri->keyEncryptionAlgorithm->parameter->value.sequence))
+		goto merr;
+        pwri->keyEncryptionAlgorithm->parameter->type = V_ASN1_SEQUENCE;
+
+	X509_ALGOR_free(encalg);
+	encalg = NULL;
+
+	/* Setup PBE algorithm */
+
+	pwri->keyDerivationAlgorithm = PKCS5_pbkdf2_set(iter, NULL, 0, -1, -1);
+
+	if (!pwri->keyDerivationAlgorithm)
+		goto err;
+
+	CMS_RecipientInfo_set0_password(ri, pass, passlen);
+	pwri->version = 0;
+
+	if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri))
+		goto merr;
+
+	return ri;
+
+	merr:
+	CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_MALLOC_FAILURE);
+	err:
+	EVP_CIPHER_CTX_cleanup(&ctx);
+	if (ri)
+		M_ASN1_free_of(ri, CMS_RecipientInfo);
+	if (encalg)
+		X509_ALGOR_free(encalg);
+	return NULL;
+
+	}
+
+/* This is an implementation of the key wrapping mechanism in RFC3211,
+ * at some point this should go into EVP.
+ */
+
+static int kek_unwrap_key(unsigned char *out, size_t *outlen,
+		const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx)
+	{
+	size_t blocklen = EVP_CIPHER_CTX_block_size(ctx);
+	unsigned char *tmp;
+	int outl, rv = 0;
+	if (inlen < 2 * blocklen)
+		{
+		/* too small */
+		return 0;
+		}
+	if (inlen % blocklen)
+		{
+		/* Invalid size */
+		return 0;
+		}
+	tmp = OPENSSL_malloc(inlen);
+	/* setup IV by decrypting last two blocks */
+	EVP_DecryptUpdate(ctx, tmp + inlen - 2 * blocklen, &outl,
+				in  + inlen - 2 * blocklen, blocklen * 2);
+	/* Do a decrypt of last decrypted block to set IV to correct value
+	 * output it to start of buffer so we don't corrupt decrypted block
+	 * this works because buffer is at least two block lengths long.
+	 */
+	EVP_DecryptUpdate(ctx, tmp, &outl,
+				tmp  + inlen - blocklen, blocklen);
+	/* Can now decrypt first n - 1 blocks */
+	EVP_DecryptUpdate(ctx, tmp, &outl, in, inlen - blocklen);
+
+	/* Reset IV to original value */
+	EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, NULL);
+	/* Decrypt again */
+	EVP_DecryptUpdate(ctx, tmp, &outl, tmp, inlen);
+	/* Check check bytes */
+	if (((tmp[1] ^ tmp[4]) & (tmp[2] ^ tmp[5]) & (tmp[3] ^ tmp[6])) != 0xff)
+		{
+		/* Check byte failure */
+		goto err;
+		}
+	if (inlen < (size_t)(tmp[0] - 4 ))
+		{
+		/* Invalid length value */
+		goto err;
+		}
+	*outlen = (size_t)tmp[0];
+	memcpy(out, tmp + 4, *outlen);
+	rv = 1;
+	err:
+	OPENSSL_cleanse(tmp, inlen);
+	OPENSSL_free(tmp);
+	return rv;
+
+	}
+
+static int kek_wrap_key(unsigned char *out, size_t *outlen,
+		const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx)
+	{
+	size_t blocklen = EVP_CIPHER_CTX_block_size(ctx);
+	size_t olen;
+	int dummy;
+	/* First decide length of output buffer: need header and round up to
+	 * multiple of block length.
+	 */
+	olen = (inlen + 4 + blocklen - 1)/blocklen;
+	olen *= blocklen;
+	if (olen < 2 * blocklen)
+		{
+		/* Key too small */
+		return 0;
+		}
+	if (inlen > 0xFF)
+		{
+		/* Key too large */
+		return 0;
+		}
+	if (out)
+		{
+		/* Set header */
+		out[0] = (unsigned char)inlen;
+		out[1] = in[0] ^ 0xFF;
+		out[2] = in[1] ^ 0xFF;
+		out[3] = in[2] ^ 0xFF;
+		memcpy(out + 4, in, inlen);
+		/* Add random padding to end */
+		if (olen > inlen + 4)
+			RAND_pseudo_bytes(out + 4 + inlen, olen - 4 - inlen);
+		/* Encrypt twice */
+		EVP_EncryptUpdate(ctx, out, &dummy, out, olen);
+		EVP_EncryptUpdate(ctx, out, &dummy, out, olen);
+		}
+
+	*outlen = olen;
+
+	return 1;
+	}
+
+/* Encrypt/Decrypt content key in PWRI recipient info */
+
+int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri,
+							int en_de)
+	{
+	CMS_EncryptedContentInfo *ec;
+	CMS_PasswordRecipientInfo *pwri;
+	const unsigned char *p = NULL;
+	int plen;
+	int r = 0;
+	X509_ALGOR *algtmp, *kekalg = NULL;
+	EVP_CIPHER_CTX kekctx;
+	const EVP_CIPHER *kekcipher;
+	unsigned char *key = NULL;
+	size_t keylen;
+
+	ec = cms->d.envelopedData->encryptedContentInfo;
+
+	pwri = ri->d.pwri;
+	EVP_CIPHER_CTX_init(&kekctx);
+
+	if (!pwri->pass)
+		{
+		CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, CMS_R_NO_PASSWORD);
+		return 0;
+		}
+	algtmp = pwri->keyEncryptionAlgorithm;
+
+	if (!algtmp || OBJ_obj2nid(algtmp->algorithm) != NID_id_alg_PWRI_KEK)
+		{
+		CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+				CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM);
+		return 0;
+		}
+
+	if (algtmp->parameter->type == V_ASN1_SEQUENCE)
+		{
+		p = algtmp->parameter->value.sequence->data;
+		plen = algtmp->parameter->value.sequence->length;
+		kekalg = d2i_X509_ALGOR(NULL, &p, plen);
+		}
+	if (kekalg == NULL)
+		{
+		CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+				CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER);
+		return 0;
+		}
+
+	kekcipher = EVP_get_cipherbyobj(kekalg->algorithm);
+		
+	if(!kekcipher)
+		{
+		CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+				CMS_R_UNKNOWN_CIPHER);
+		goto err;
+		}
+
+	/* Fixup cipher based on AlgorithmIdentifier to set IV etc */
+	if (!EVP_CipherInit_ex(&kekctx, kekcipher, NULL, NULL, NULL, en_de))
+		goto err;
+	EVP_CIPHER_CTX_set_padding(&kekctx, 0);
+	if(EVP_CIPHER_asn1_to_param(&kekctx, kekalg->parameter) < 0)
+		{
+		CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+				CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR);
+		goto err;
+		}
+
+	algtmp = pwri->keyDerivationAlgorithm;
+
+	/* Finish password based key derivation to setup key in "ctx" */
+
+	if (EVP_PBE_CipherInit(algtmp->algorithm,
+				(char *)pwri->pass, pwri->passlen,
+				algtmp->parameter, &kekctx, en_de) < 0)
+		{
+		CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, ERR_R_EVP_LIB);
+		goto err;
+		}
+
+	/* Finally wrap/unwrap the key */
+
+	if (en_de)
+		{
+
+		if (!kek_wrap_key(NULL, &keylen, ec->key, ec->keylen, &kekctx))
+			goto err;
+
+		key = OPENSSL_malloc(keylen);
+
+		if (!key)
+			goto err;
+
+		if (!kek_wrap_key(key, &keylen, ec->key, ec->keylen, &kekctx))
+			goto err;
+		pwri->encryptedKey->data = key;
+		pwri->encryptedKey->length = keylen;
+		}
+	else
+		{
+		key = OPENSSL_malloc(pwri->encryptedKey->length);
+
+		if (!key)
+			{
+			CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+							ERR_R_MALLOC_FAILURE);
+			goto err;
+			}
+		if (!kek_unwrap_key(key, &keylen,
+					pwri->encryptedKey->data,
+					pwri->encryptedKey->length, &kekctx))
+			{
+			CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT,
+							CMS_R_UNWRAP_FAILURE);
+			goto err;
+			}
+
+		ec->key = key;
+		ec->keylen = keylen;
+
+		}
+
+	r = 1;
+
+	err:
+
+	EVP_CIPHER_CTX_cleanup(&kekctx);
+
+	if (!r && key)
+		OPENSSL_free(key);
+	X509_ALGOR_free(kekalg);
+
+	return r;
+
+	}
diff --git a/src/lib/libcrypto/cms/cms_sd.c b/src/lib/libcrypto/cms/cms_sd.c
index e3192b9c57..77fbd13596 100644
--- a/src/lib/libcrypto/cms/cms_sd.c
+++ b/src/lib/libcrypto/cms/cms_sd.c
@@ -641,7 +641,8 @@ static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms,
 			cms->d.signedData->encapContentInfo->eContentType; 
 		unsigned char md[EVP_MAX_MD_SIZE];
 		unsigned int mdlen;
-		EVP_DigestFinal_ex(&mctx, md, &mdlen);
+		if (!EVP_DigestFinal_ex(&mctx, md, &mdlen))
+			goto err;
 		if (!CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest,
 						V_ASN1_OCTET_STRING,
 						md, mdlen))
diff --git a/src/lib/libcrypto/cms/cms_smime.c b/src/lib/libcrypto/cms/cms_smime.c
index 4a799eb897..8c56e3a852 100644
--- a/src/lib/libcrypto/cms/cms_smime.c
+++ b/src/lib/libcrypto/cms/cms_smime.c
@@ -611,7 +611,10 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert)
 	STACK_OF(CMS_RecipientInfo) *ris;
 	CMS_RecipientInfo *ri;
 	int i, r;
+	int debug = 0;
 	ris = CMS_get0_RecipientInfos(cms);
+	if (ris)
+		debug = cms->d.envelopedData->encryptedContentInfo->debug;
 	for (i = 0; i < sk_CMS_RecipientInfo_num(ris); i++)
 		{
 		ri = sk_CMS_RecipientInfo_value(ris, i);
@@ -625,17 +628,38 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert)
 			CMS_RecipientInfo_set0_pkey(ri, pk);
 			r = CMS_RecipientInfo_decrypt(cms, ri);
 			CMS_RecipientInfo_set0_pkey(ri, NULL);
-			if (r > 0)
-				return 1;
 			if (cert)
 				{
+				/* If not debugging clear any error and
+				 * return success to avoid leaking of
+				 * information useful to MMA
+				 */
+				if (!debug)
+					{
+					ERR_clear_error();
+					return 1;
+					}
+				if (r > 0)
+					return 1;
 				CMSerr(CMS_F_CMS_DECRYPT_SET1_PKEY,
 						CMS_R_DECRYPT_ERROR);
 				return 0;
 				}
-			ERR_clear_error();
+			/* If no cert and not debugging don't leave loop
+			 * after first successful decrypt. Always attempt
+			 * to decrypt all recipients to avoid leaking timing
+			 * of a successful decrypt.
+			 */
+			else if (r > 0 && debug)
+				return 1;
 			}
 		}
+	/* If no cert and not debugging always return success */
+	if (!cert && !debug)
+		{
+		ERR_clear_error();
+		return 1;
+		}
 
 	CMSerr(CMS_F_CMS_DECRYPT_SET1_PKEY, CMS_R_NO_MATCHING_RECIPIENT);
 	return 0;
@@ -680,6 +704,30 @@ int CMS_decrypt_set1_key(CMS_ContentInfo *cms,
 	return 0;
 
 	}
+
+int CMS_decrypt_set1_password(CMS_ContentInfo *cms, 
+				unsigned char *pass, ossl_ssize_t passlen)
+	{
+	STACK_OF(CMS_RecipientInfo) *ris;
+	CMS_RecipientInfo *ri;
+	int i, r;
+	ris = CMS_get0_RecipientInfos(cms);
+	for (i = 0; i < sk_CMS_RecipientInfo_num(ris); i++)
+		{
+		ri = sk_CMS_RecipientInfo_value(ris, i);
+		if (CMS_RecipientInfo_type(ri) != CMS_RECIPINFO_PASS)
+				continue;
+		CMS_RecipientInfo_set0_password(ri, pass, passlen);
+		r = CMS_RecipientInfo_decrypt(cms, ri);
+		CMS_RecipientInfo_set0_password(ri, NULL, 0);
+		if (r > 0)
+			return 1;
+		}
+
+	CMSerr(CMS_F_CMS_DECRYPT_SET1_PASSWORD, CMS_R_NO_MATCHING_RECIPIENT);
+	return 0;
+
+	}
 	
 int CMS_decrypt(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert,
 				BIO *dcont, BIO *out,
@@ -694,9 +742,14 @@ int CMS_decrypt(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert,
 		}
 	if (!dcont && !check_content(cms))
 		return 0;
+	if (flags & CMS_DEBUG_DECRYPT)
+		cms->d.envelopedData->encryptedContentInfo->debug = 1;
+	else
+		cms->d.envelopedData->encryptedContentInfo->debug = 0;
+	if (!pk && !cert && !dcont && !out)
+		return 1;
 	if (pk && !CMS_decrypt_set1_pkey(cms, pk, cert))
 		return 0;
-
 	cont = CMS_dataInit(cms, dcont);
 	if (!cont)
 		return 0;
diff --git a/src/lib/libcrypto/comp/c_rle.c b/src/lib/libcrypto/comp/c_rle.c
index 18bceae51e..47dfb67fbd 100644
--- a/src/lib/libcrypto/comp/c_rle.c
+++ b/src/lib/libcrypto/comp/c_rle.c
@@ -30,7 +30,7 @@ static int rle_compress_block(COMP_CTX *ctx, unsigned char *out,
 	{
 	/* int i; */
 
-	if (olen < (ilen+1))
+	if (ilen == 0 || olen < (ilen-1))
 		{
 		/* ZZZZZZZZZZZZZZZZZZZZZZ */
 		return(-1);
@@ -46,7 +46,7 @@ static int rle_expand_block(COMP_CTX *ctx, unsigned char *out,
 	{
 	int i;
 
-	if (ilen == 0 || olen < (ilen-1))
+	if (olen < (ilen-1))
 		{
 		/* ZZZZZZZZZZZZZZZZZZZZZZ */
 		return(-1);
diff --git a/src/lib/libcrypto/cpt_err.c b/src/lib/libcrypto/cpt_err.c
index 139b9284e4..289005f662 100644
--- a/src/lib/libcrypto/cpt_err.c
+++ b/src/lib/libcrypto/cpt_err.c
@@ -1,6 +1,6 @@
 /* crypto/cpt_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -76,6 +76,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]=
 {ERR_FUNC(CRYPTO_F_CRYPTO_SET_EX_DATA),	"CRYPTO_set_ex_data"},
 {ERR_FUNC(CRYPTO_F_DEF_ADD_INDEX),	"DEF_ADD_INDEX"},
 {ERR_FUNC(CRYPTO_F_DEF_GET_CLASS),	"DEF_GET_CLASS"},
+{ERR_FUNC(CRYPTO_F_FIPS_MODE_SET),	"FIPS_mode_set"},
 {ERR_FUNC(CRYPTO_F_INT_DUP_EX_DATA),	"INT_DUP_EX_DATA"},
 {ERR_FUNC(CRYPTO_F_INT_FREE_EX_DATA),	"INT_FREE_EX_DATA"},
 {ERR_FUNC(CRYPTO_F_INT_NEW_EX_DATA),	"INT_NEW_EX_DATA"},
@@ -84,6 +85,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]=
 
 static ERR_STRING_DATA CRYPTO_str_reasons[]=
 	{
+{ERR_REASON(CRYPTO_R_FIPS_MODE_NOT_SUPPORTED),"fips mode not supported"},
 {ERR_REASON(CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK),"no dynlock create callback"},
 {0,NULL}
 	};
diff --git a/src/lib/libcrypto/cryptlib.c b/src/lib/libcrypto/cryptlib.c
index 24fe123e14..766ea8cac7 100644
--- a/src/lib/libcrypto/cryptlib.c
+++ b/src/lib/libcrypto/cryptlib.c
@@ -409,6 +409,10 @@ int (*CRYPTO_get_add_lock_callback(void))(int *num,int mount,int type,
 void CRYPTO_set_locking_callback(void (*func)(int mode,int type,
 					      const char *file,int line))
 	{
+	/* Calling this here ensures initialisation before any threads
+	 * are started.
+	 */
+	OPENSSL_init();
 	locking_callback=func;
 	}
 
@@ -661,28 +665,52 @@ const char *CRYPTO_get_lock_name(int type)
 	defined(__INTEL__) || \
 	defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64)
 
-unsigned long  OPENSSL_ia32cap_P=0;
-unsigned long *OPENSSL_ia32cap_loc(void) { return &OPENSSL_ia32cap_P; }
+unsigned int  OPENSSL_ia32cap_P[2];
+unsigned long *OPENSSL_ia32cap_loc(void)
+{   if (sizeof(long)==4)
+	/*
+	 * If 32-bit application pulls address of OPENSSL_ia32cap_P[0]
+	 * clear second element to maintain the illusion that vector
+	 * is 32-bit.
+	 */
+	OPENSSL_ia32cap_P[1]=0;
+    return (unsigned long *)OPENSSL_ia32cap_P;
+}
 
 #if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) && !defined(I386_ONLY)
 #define OPENSSL_CPUID_SETUP
+#if defined(_WIN32)
+typedef unsigned __int64 IA32CAP;
+#else
+typedef unsigned long long IA32CAP;
+#endif
 void OPENSSL_cpuid_setup(void)
 { static int trigger=0;
-  unsigned long OPENSSL_ia32_cpuid(void);
+  IA32CAP OPENSSL_ia32_cpuid(void);
+  IA32CAP vec;
   char *env;
 
     if (trigger)	return;
 
     trigger=1;
-    if ((env=getenv("OPENSSL_ia32cap")))
-	OPENSSL_ia32cap_P = strtoul(env,NULL,0)|(1<<10);
+    if ((env=getenv("OPENSSL_ia32cap"))) {
+	int off = (env[0]=='~')?1:0;
+#if defined(_WIN32)
+	if (!sscanf(env+off,"%I64i",&vec)) vec = strtoul(env+off,NULL,0);
+#else
+	if (!sscanf(env+off,"%lli",(long long *)&vec)) vec = strtoul(env+off,NULL,0);
+#endif
+	if (off) vec = OPENSSL_ia32_cpuid()&~vec;
+    }
     else
-	OPENSSL_ia32cap_P = OPENSSL_ia32_cpuid()|(1<<10);
+	vec = OPENSSL_ia32_cpuid();
     /*
      * |(1<<10) sets a reserved bit to signal that variable
      * was initialized already... This is to avoid interference
      * with cpuid snippets in ELF .init segment.
      */
+    OPENSSL_ia32cap_P[0] = (unsigned int)vec|(1<<10);
+    OPENSSL_ia32cap_P[1] = (unsigned int)(vec>>32);
 }
 #endif
 
diff --git a/src/lib/libcrypto/cryptlib.h b/src/lib/libcrypto/cryptlib.h
index fc249c57f3..1761f6b668 100644
--- a/src/lib/libcrypto/cryptlib.h
+++ b/src/lib/libcrypto/cryptlib.h
@@ -99,7 +99,7 @@ extern "C" {
 #define HEX_SIZE(type)		(sizeof(type)*2)
 
 void OPENSSL_cpuid_setup(void);
-extern unsigned long OPENSSL_ia32cap_P;
+extern unsigned int OPENSSL_ia32cap_P[];
 void OPENSSL_showfatal(const char *,...);
 void *OPENSSL_stderr(void);
 extern int OPENSSL_NONPIC_relocated;
diff --git a/src/lib/libcrypto/crypto.h b/src/lib/libcrypto/crypto.h
index b0360cec51..6aeda0a9ac 100644
--- a/src/lib/libcrypto/crypto.h
+++ b/src/lib/libcrypto/crypto.h
@@ -547,6 +547,33 @@ unsigned long *OPENSSL_ia32cap_loc(void);
 #define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc()))
 int OPENSSL_isservice(void);
 
+int FIPS_mode(void);
+int FIPS_mode_set(int r);
+
+void OPENSSL_init(void);
+
+#define fips_md_init(alg) fips_md_init_ctx(alg, alg)
+
+#ifdef OPENSSL_FIPS
+#define fips_md_init_ctx(alg, cx) \
+	int alg##_Init(cx##_CTX *c) \
+	{ \
+	if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+		"Low level API call to digest " #alg " forbidden in FIPS mode!"); \
+	return private_##alg##_Init(c); \
+	} \
+	int private_##alg##_Init(cx##_CTX *c)
+
+#define fips_cipher_abort(alg) \
+	if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \
+		"Low level API call to cipher " #alg " forbidden in FIPS mode!")
+
+#else
+#define fips_md_init_ctx(alg, cx) \
+	int alg##_Init(cx##_CTX *c)
+#define fips_cipher_abort(alg) while(0)
+#endif
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -562,11 +589,13 @@ void ERR_load_CRYPTO_strings(void);
 #define CRYPTO_F_CRYPTO_SET_EX_DATA			 102
 #define CRYPTO_F_DEF_ADD_INDEX				 104
 #define CRYPTO_F_DEF_GET_CLASS				 105
+#define CRYPTO_F_FIPS_MODE_SET				 109
 #define CRYPTO_F_INT_DUP_EX_DATA			 106
 #define CRYPTO_F_INT_FREE_EX_DATA			 107
 #define CRYPTO_F_INT_NEW_EX_DATA			 108
 
 /* Reason codes. */
+#define CRYPTO_R_FIPS_MODE_NOT_SUPPORTED		 101
 #define CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK		 100
 
 #ifdef  __cplusplus
diff --git a/src/lib/libcrypto/des/des.h b/src/lib/libcrypto/des/des.h
index 92b6663599..1eaedcbd24 100644
--- a/src/lib/libcrypto/des/des.h
+++ b/src/lib/libcrypto/des/des.h
@@ -224,6 +224,9 @@ int DES_set_key(const_DES_cblock *key,DES_key_schedule *schedule);
 int DES_key_sched(const_DES_cblock *key,DES_key_schedule *schedule);
 int DES_set_key_checked(const_DES_cblock *key,DES_key_schedule *schedule);
 void DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#ifdef OPENSSL_FIPS
+void private_DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule);
+#endif
 void DES_string_to_key(const char *str,DES_cblock *key);
 void DES_string_to_2keys(const char *str,DES_cblock *key1,DES_cblock *key2);
 void DES_cfb64_encrypt(const unsigned char *in,unsigned char *out,long length,
diff --git a/src/lib/libcrypto/des/set_key.c b/src/lib/libcrypto/des/set_key.c
index 3004cc3ab3..d3e69ca8b5 100644
--- a/src/lib/libcrypto/des/set_key.c
+++ b/src/lib/libcrypto/des/set_key.c
@@ -65,6 +65,8 @@
  */
 #include "des_locl.h"
 
+#include <openssl/crypto.h>
+
 OPENSSL_IMPLEMENT_GLOBAL(int,DES_check_key,0)	/* defaults to false */
 
 static const unsigned char odd_parity[256]={
@@ -335,6 +337,13 @@ int DES_set_key_checked(const_DES_cblock *key, DES_key_schedule *schedule)
 	}
 
 void DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#ifdef OPENSSL_FIPS
+	{
+	fips_cipher_abort(DES);
+	private_DES_set_key_unchecked(key, schedule);
+	}
+void private_DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule)
+#endif
 	{
 	static const int shifts2[16]={0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0};
 	register DES_LONG c,d,t,s,t2;
diff --git a/src/lib/libcrypto/dh/dh.h b/src/lib/libcrypto/dh/dh.h
index 849309a489..ea59e610ef 100644
--- a/src/lib/libcrypto/dh/dh.h
+++ b/src/lib/libcrypto/dh/dh.h
@@ -86,6 +86,21 @@
                                        * be used for all exponents.
                                        */
 
+/* If this flag is set the DH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its reposibility
+ * to ensure the result is compliant.
+ */
+
+#define DH_FLAG_FIPS_METHOD			0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted it is then the applications responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DH_FLAG_NON_FIPS_ALLOW			0x0400
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -230,6 +245,9 @@ void ERR_load_DH_strings(void);
 #define DH_F_COMPUTE_KEY				 102
 #define DH_F_DHPARAMS_PRINT_FP				 101
 #define DH_F_DH_BUILTIN_GENPARAMS			 106
+#define DH_F_DH_COMPUTE_KEY				 114
+#define DH_F_DH_GENERATE_KEY				 115
+#define DH_F_DH_GENERATE_PARAMETERS_EX			 116
 #define DH_F_DH_NEW_METHOD				 105
 #define DH_F_DH_PARAM_DECODE				 107
 #define DH_F_DH_PRIV_DECODE				 110
@@ -249,7 +267,9 @@ void ERR_load_DH_strings(void);
 #define DH_R_DECODE_ERROR				 104
 #define DH_R_INVALID_PUBKEY				 102
 #define DH_R_KEYS_NOT_SET				 108
+#define DH_R_KEY_SIZE_TOO_SMALL				 110
 #define DH_R_MODULUS_TOO_LARGE				 103
+#define DH_R_NON_FIPS_METHOD				 111
 #define DH_R_NO_PARAMETERS_SET				 107
 #define DH_R_NO_PRIVATE_VALUE				 100
 #define DH_R_PARAMETER_ENCODING_ERROR			 105
diff --git a/src/lib/libcrypto/dh/dh_ameth.c b/src/lib/libcrypto/dh/dh_ameth.c
index 377caf96c9..02ec2d47b4 100644
--- a/src/lib/libcrypto/dh/dh_ameth.c
+++ b/src/lib/libcrypto/dh/dh_ameth.c
@@ -493,6 +493,7 @@ const EVP_PKEY_ASN1_METHOD dh_asn1_meth =
 	dh_copy_parameters,
 	dh_cmp_parameters,
 	dh_param_print,
+	0,
 
 	int_dh_free,
 	0
diff --git a/src/lib/libcrypto/dh/dh_err.c b/src/lib/libcrypto/dh/dh_err.c
index d5cf0c22a3..56d3df7356 100644
--- a/src/lib/libcrypto/dh/dh_err.c
+++ b/src/lib/libcrypto/dh/dh_err.c
@@ -1,6 +1,6 @@
 /* crypto/dh/dh_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -73,6 +73,9 @@ static ERR_STRING_DATA DH_str_functs[]=
 {ERR_FUNC(DH_F_COMPUTE_KEY),	"COMPUTE_KEY"},
 {ERR_FUNC(DH_F_DHPARAMS_PRINT_FP),	"DHparams_print_fp"},
 {ERR_FUNC(DH_F_DH_BUILTIN_GENPARAMS),	"DH_BUILTIN_GENPARAMS"},
+{ERR_FUNC(DH_F_DH_COMPUTE_KEY),	"DH_compute_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_KEY),	"DH_generate_key"},
+{ERR_FUNC(DH_F_DH_GENERATE_PARAMETERS_EX),	"DH_generate_parameters_ex"},
 {ERR_FUNC(DH_F_DH_NEW_METHOD),	"DH_new_method"},
 {ERR_FUNC(DH_F_DH_PARAM_DECODE),	"DH_PARAM_DECODE"},
 {ERR_FUNC(DH_F_DH_PRIV_DECODE),	"DH_PRIV_DECODE"},
@@ -95,7 +98,9 @@ static ERR_STRING_DATA DH_str_reasons[]=
 {ERR_REASON(DH_R_DECODE_ERROR)           ,"decode error"},
 {ERR_REASON(DH_R_INVALID_PUBKEY)         ,"invalid public key"},
 {ERR_REASON(DH_R_KEYS_NOT_SET)           ,"keys not set"},
+{ERR_REASON(DH_R_KEY_SIZE_TOO_SMALL)     ,"key size too small"},
 {ERR_REASON(DH_R_MODULUS_TOO_LARGE)      ,"modulus too large"},
+{ERR_REASON(DH_R_NON_FIPS_METHOD)        ,"non fips method"},
 {ERR_REASON(DH_R_NO_PARAMETERS_SET)      ,"no parameters set"},
 {ERR_REASON(DH_R_NO_PRIVATE_VALUE)       ,"no private value"},
 {ERR_REASON(DH_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},
diff --git a/src/lib/libcrypto/dh/dh_gen.c b/src/lib/libcrypto/dh/dh_gen.c
index cfd5b11868..7b1fe9c9cb 100644
--- a/src/lib/libcrypto/dh/dh_gen.c
+++ b/src/lib/libcrypto/dh/dh_gen.c
@@ -66,12 +66,29 @@
 #include <openssl/bn.h>
 #include <openssl/dh.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 static int dh_builtin_genparams(DH *ret, int prime_len, int generator, BN_GENCB *cb);
 
 int DH_generate_parameters_ex(DH *ret, int prime_len, int generator, BN_GENCB *cb)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(ret->meth->flags & DH_FLAG_FIPS_METHOD)
+			&& !(ret->flags & DH_FLAG_NON_FIPS_ALLOW))
+		{
+		DHerr(DH_F_DH_GENERATE_PARAMETERS_EX, DH_R_NON_FIPS_METHOD);
+		return 0;
+		}
+#endif
 	if(ret->meth->generate_params)
 		return ret->meth->generate_params(ret, prime_len, generator, cb);
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		return FIPS_dh_generate_parameters_ex(ret, prime_len,
+							generator, cb);
+#endif
 	return dh_builtin_genparams(ret, prime_len, generator, cb);
 	}
 
diff --git a/src/lib/libcrypto/dh/dh_key.c b/src/lib/libcrypto/dh/dh_key.c
index e7db440342..89a74db4e6 100644
--- a/src/lib/libcrypto/dh/dh_key.c
+++ b/src/lib/libcrypto/dh/dh_key.c
@@ -73,11 +73,27 @@ static int dh_finish(DH *dh);
 
 int DH_generate_key(DH *dh)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+			&& !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+		{
+		DHerr(DH_F_DH_GENERATE_KEY, DH_R_NON_FIPS_METHOD);
+		return 0;
+		}
+#endif
 	return dh->meth->generate_key(dh);
 	}
 
 int DH_compute_key(unsigned char *key, const BIGNUM *pub_key, DH *dh)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD)
+			&& !(dh->flags & DH_FLAG_NON_FIPS_ALLOW))
+		{
+		DHerr(DH_F_DH_COMPUTE_KEY, DH_R_NON_FIPS_METHOD);
+		return 0;
+		}
+#endif
 	return dh->meth->compute_key(key, pub_key, dh);
 	}
 
@@ -138,8 +154,21 @@ static int generate_key(DH *dh)
 
 	if (generate_new_key)
 		{
-		l = dh->length ? dh->length : BN_num_bits(dh->p)-1; /* secret exponent length */
-		if (!BN_rand(priv_key, l, 0, 0)) goto err;
+		if (dh->q)
+			{
+			do
+				{
+				if (!BN_rand_range(priv_key, dh->q))
+					goto err;
+				}
+			while (BN_is_zero(priv_key) || BN_is_one(priv_key));
+			}
+		else
+			{
+			/* secret exponent length */
+			l = dh->length ? dh->length : BN_num_bits(dh->p)-1;
+			if (!BN_rand(priv_key, l, 0, 0)) goto err;
+			}
 		}
 
 	{
diff --git a/src/lib/libcrypto/dh/dh_lib.c b/src/lib/libcrypto/dh/dh_lib.c
index 7aef080e7a..00218f2b92 100644
--- a/src/lib/libcrypto/dh/dh_lib.c
+++ b/src/lib/libcrypto/dh/dh_lib.c
@@ -64,6 +64,10 @@
 #include <openssl/engine.h>
 #endif
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 const char DH_version[]="Diffie-Hellman" OPENSSL_VERSION_PTEXT;
 
 static const DH_METHOD *default_DH_method = NULL;
@@ -76,7 +80,16 @@ void DH_set_default_method(const DH_METHOD *meth)
 const DH_METHOD *DH_get_default_method(void)
 	{
 	if(!default_DH_method)
+		{
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return FIPS_dh_openssl();
+		else
+			return DH_OpenSSL();
+#else
 		default_DH_method = DH_OpenSSL();
+#endif
+		}
 	return default_DH_method;
 	}
 
@@ -156,7 +169,7 @@ DH *DH_new_method(ENGINE *engine)
 	ret->counter = NULL;
 	ret->method_mont_p=NULL;
 	ret->references = 1;
-	ret->flags=ret->meth->flags;
+	ret->flags=ret->meth->flags & ~DH_FLAG_NON_FIPS_ALLOW;
 	CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DH, ret, &ret->ex_data);
 	if ((ret->meth->init != NULL) && !ret->meth->init(ret))
 		{
diff --git a/src/lib/libcrypto/doc/EVP_DigestInit.pod b/src/lib/libcrypto/doc/EVP_DigestInit.pod
index 5b477ac6ec..367691cc7a 100644
--- a/src/lib/libcrypto/doc/EVP_DigestInit.pod
+++ b/src/lib/libcrypto/doc/EVP_DigestInit.pod
@@ -6,7 +6,8 @@ EVP_MD_CTX_init, EVP_MD_CTX_create, EVP_DigestInit_ex, EVP_DigestUpdate,
 EVP_DigestFinal_ex, EVP_MD_CTX_cleanup, EVP_MD_CTX_destroy, EVP_MAX_MD_SIZE,
 EVP_MD_CTX_copy_ex, EVP_MD_CTX_copy, EVP_MD_type, EVP_MD_pkey_type, EVP_MD_size,
 EVP_MD_block_size, EVP_MD_CTX_md, EVP_MD_CTX_size, EVP_MD_CTX_block_size, EVP_MD_CTX_type,
-EVP_md_null, EVP_md2, EVP_md5, EVP_sha, EVP_sha1, EVP_dss, EVP_dss1, EVP_mdc2,
+EVP_md_null, EVP_md2, EVP_md5, EVP_sha, EVP_sha1, EVP_sha224, EVP_sha256,
+EVP_sha384, EVP_sha512, EVP_dss, EVP_dss1, EVP_mdc2,
 EVP_ripemd160, EVP_get_digestbyname, EVP_get_digestbynid, EVP_get_digestbyobj -
 EVP digest routines
 
@@ -33,16 +34,15 @@ EVP digest routines
 
  int EVP_MD_CTX_copy(EVP_MD_CTX *out,EVP_MD_CTX *in);  
 
- #define EVP_MAX_MD_SIZE (16+20) /* The SSLv3 md5+sha1 type */
+ #define EVP_MAX_MD_SIZE 64	/* SHA512 */
 
+ int EVP_MD_type(const EVP_MD *md);
+ int EVP_MD_pkey_type(const EVP_MD *md);	
+ int EVP_MD_size(const EVP_MD *md);
+ int EVP_MD_block_size(const EVP_MD *md);
 
- #define EVP_MD_type(e)			((e)->type)
- #define EVP_MD_pkey_type(e)		((e)->pkey_type)
- #define EVP_MD_size(e)			((e)->md_size)
- #define EVP_MD_block_size(e)		((e)->block_size)
-
- #define EVP_MD_CTX_md(e)		(e)->digest)
- #define EVP_MD_CTX_size(e)		EVP_MD_size((e)->digest)
+ const EVP_MD *EVP_MD_CTX_md(const EVP_MD_CTX *ctx);
+ #define EVP_MD_CTX_size(e)		EVP_MD_size(EVP_MD_CTX_md(e))
  #define EVP_MD_CTX_block_size(e)	EVP_MD_block_size((e)->digest)
  #define EVP_MD_CTX_type(e)		EVP_MD_type((e)->digest)
 
@@ -56,6 +56,11 @@ EVP digest routines
  const EVP_MD *EVP_mdc2(void);
  const EVP_MD *EVP_ripemd160(void);
 
+ const EVP_MD *EVP_sha224(void);
+ const EVP_MD *EVP_sha256(void);
+ const EVP_MD *EVP_sha384(void);
+ const EVP_MD *EVP_sha512(void);
+
  const EVP_MD *EVP_get_digestbyname(const char *name);
  #define EVP_get_digestbynid(a) EVP_get_digestbyname(OBJ_nid2sn(a))
  #define EVP_get_digestbyobj(a) EVP_get_digestbynid(OBJ_obj2nid(a))
@@ -124,12 +129,14 @@ B<EVP_MD_CTX>.
 
 EVP_MD_pkey_type() returns the NID of the public key signing algorithm associated
 with this digest. For example EVP_sha1() is associated with RSA so this will
-return B<NID_sha1WithRSAEncryption>. This "link" between digests and signature
-algorithms may not be retained in future versions of OpenSSL.
+return B<NID_sha1WithRSAEncryption>. Since digests and signature algorithms
+are no longer linked this function is only retained for compatibility
+reasons.
 
-EVP_md2(), EVP_md5(), EVP_sha(), EVP_sha1(), EVP_mdc2() and EVP_ripemd160()
-return B<EVP_MD> structures for the MD2, MD5, SHA, SHA1, MDC2 and RIPEMD160 digest
-algorithms respectively. The associated signature algorithm is RSA in each case.
+EVP_md2(), EVP_md5(), EVP_sha(), EVP_sha1(), EVP_sha224(), EVP_sha256(),
+EVP_sha384(), EVP_sha512(), EVP_mdc2() and EVP_ripemd160() return B<EVP_MD>
+structures for the MD2, MD5, SHA, SHA1, SHA224, SHA256, SHA384, SHA512, MDC2
+and RIPEMD160 digest algorithms respectively. 
 
 EVP_dss() and EVP_dss1() return B<EVP_MD> structures for SHA and SHA1 digest
 algorithms but using DSS (DSA) for the signature algorithm. Note: there is 
@@ -171,8 +178,8 @@ The B<EVP> interface to message digests should almost always be used in
 preference to the low level interfaces. This is because the code then becomes
 transparent to the digest used and much more flexible.
 
-SHA1 is the digest of choice for new applications. The other digest algorithms
-are still in common use.
+New applications should use the SHA2 digest algorithms such as SHA256. 
+The other digest algorithms are still in common use.
 
 For most applications the B<impl> parameter to EVP_DigestInit_ex() will be
 set to NULL to use the default digest implementation.
@@ -187,6 +194,19 @@ implementations of digests to be specified.
 In OpenSSL 0.9.7 and later if digest contexts are not cleaned up after use
 memory leaks will occur. 
 
+Stack allocation of EVP_MD_CTX structures is common, for example:
+
+ EVP_MD_CTX mctx;
+ EVP_MD_CTX_init(&mctx);
+
+This will cause binary compatibility issues if the size of EVP_MD_CTX
+structure changes (this will only happen with a major release of OpenSSL).
+Applications wishing to avoid this should use EVP_MD_CTX_create() instead:
+
+ EVP_MD_CTX *mctx;
+ mctx = EVP_MD_CTX_create();
+
+
 =head1 EXAMPLE
 
 This example digests the data "Test Message\n" and "Hello World\n", using the
@@ -197,7 +217,7 @@ digest name passed on the command line.
 
  main(int argc, char *argv[])
  {
- EVP_MD_CTX mdctx;
+ EVP_MD_CTX *mdctx;
  const EVP_MD *md;
  char mess1[] = "Test Message\n";
  char mess2[] = "Hello World\n";
@@ -218,12 +238,12 @@ digest name passed on the command line.
 	exit(1);
  }
 
- EVP_MD_CTX_init(&mdctx);
- EVP_DigestInit_ex(&mdctx, md, NULL);
- EVP_DigestUpdate(&mdctx, mess1, strlen(mess1));
- EVP_DigestUpdate(&mdctx, mess2, strlen(mess2));
- EVP_DigestFinal_ex(&mdctx, md_value, &md_len);
- EVP_MD_CTX_cleanup(&mdctx);
+ mdctx = EVP_MD_CTX_create();
+ EVP_DigestInit_ex(mdctx, md, NULL);
+ EVP_DigestUpdate(mdctx, mess1, strlen(mess1));
+ EVP_DigestUpdate(mdctx, mess2, strlen(mess2));
+ EVP_DigestFinal_ex(mdctx, md_value, &md_len);
+ EVP_MD_CTX_destroy(mdctx);
 
  printf("Digest is: ");
  for(i = 0; i < md_len; i++) printf("%02x", md_value[i]);
diff --git a/src/lib/libcrypto/dsa/dsa.h b/src/lib/libcrypto/dsa/dsa.h
index ac50a5c846..a6f6d0b0b2 100644
--- a/src/lib/libcrypto/dsa/dsa.h
+++ b/src/lib/libcrypto/dsa/dsa.h
@@ -97,6 +97,21 @@
                                               * be used for all exponents.
                                               */
 
+/* If this flag is set the DSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its reposibility
+ * to ensure the result is compliant.
+ */
+
+#define DSA_FLAG_FIPS_METHOD			0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted it is then the applications responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define DSA_FLAG_NON_FIPS_ALLOW			0x0400
+
 #ifdef  __cplusplus
 extern "C" {
 #endif
@@ -272,6 +287,8 @@ void ERR_load_DSA_strings(void);
 #define DSA_F_DSAPARAMS_PRINT_FP			 101
 #define DSA_F_DSA_DO_SIGN				 112
 #define DSA_F_DSA_DO_VERIFY				 113
+#define DSA_F_DSA_GENERATE_KEY				 124
+#define DSA_F_DSA_GENERATE_PARAMETERS_EX		 123
 #define DSA_F_DSA_NEW_METHOD				 103
 #define DSA_F_DSA_PARAM_DECODE				 119
 #define DSA_F_DSA_PRINT_FP				 105
@@ -282,6 +299,7 @@ void ERR_load_DSA_strings(void);
 #define DSA_F_DSA_SIGN					 106
 #define DSA_F_DSA_SIGN_SETUP				 107
 #define DSA_F_DSA_SIG_NEW				 109
+#define DSA_F_DSA_SIG_PRINT				 125
 #define DSA_F_DSA_VERIFY				 108
 #define DSA_F_I2D_DSA_SIG				 111
 #define DSA_F_OLD_DSA_PRIV_DECODE			 122
@@ -298,6 +316,8 @@ void ERR_load_DSA_strings(void);
 #define DSA_R_INVALID_DIGEST_TYPE			 106
 #define DSA_R_MISSING_PARAMETERS			 101
 #define DSA_R_MODULUS_TOO_LARGE				 103
+#define DSA_R_NEED_NEW_SETUP_VALUES			 110
+#define DSA_R_NON_FIPS_DSA_METHOD			 111
 #define DSA_R_NO_PARAMETERS_SET				 107
 #define DSA_R_PARAMETER_ENCODING_ERROR			 105
 
diff --git a/src/lib/libcrypto/dsa/dsa_ameth.c b/src/lib/libcrypto/dsa/dsa_ameth.c
index 6413aae46e..376156ec5e 100644
--- a/src/lib/libcrypto/dsa/dsa_ameth.c
+++ b/src/lib/libcrypto/dsa/dsa_ameth.c
@@ -542,6 +542,52 @@ static int old_dsa_priv_encode(const EVP_PKEY *pkey, unsigned char **pder)
 	return i2d_DSAPrivateKey(pkey->pkey.dsa, pder);
 	}
 
+static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg,
+					const ASN1_STRING *sig,
+					int indent, ASN1_PCTX *pctx)
+	{
+	DSA_SIG *dsa_sig;
+	const unsigned char *p;
+	if (!sig)
+		{
+		if (BIO_puts(bp, "\n") <= 0)
+			return 0;
+		else
+			return 1;
+		}
+	p = sig->data;
+	dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length);
+	if (dsa_sig)
+		{
+		int rv = 0;
+		size_t buf_len = 0;
+		unsigned char *m=NULL;
+		update_buflen(dsa_sig->r, &buf_len);
+		update_buflen(dsa_sig->s, &buf_len);
+		m = OPENSSL_malloc(buf_len+10);
+		if (m == NULL)
+			{
+			DSAerr(DSA_F_DSA_SIG_PRINT,ERR_R_MALLOC_FAILURE);
+			goto err;
+			}
+
+		if (BIO_write(bp, "\n", 1) != 1)
+			goto err;
+
+		if (!ASN1_bn_print(bp,"r:   ",dsa_sig->r,m,indent))
+			goto err;
+		if (!ASN1_bn_print(bp,"s:   ",dsa_sig->s,m,indent))
+			goto err;
+		rv = 1;
+		err:
+		if (m)
+			OPENSSL_free(m);
+		DSA_SIG_free(dsa_sig);
+		return rv;
+		}
+	return X509_signature_dump(bp, sig, indent);
+	}
+
 static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
 	{
 	switch (op)
@@ -647,6 +693,7 @@ const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[] =
 		dsa_copy_parameters,
 		dsa_cmp_parameters,
 		dsa_param_print,
+		dsa_sig_print,
 
 		int_dsa_free,
 		dsa_pkey_ctrl,
diff --git a/src/lib/libcrypto/dsa/dsa_asn1.c b/src/lib/libcrypto/dsa/dsa_asn1.c
index c37460b2d6..6058534374 100644
--- a/src/lib/libcrypto/dsa/dsa_asn1.c
+++ b/src/lib/libcrypto/dsa/dsa_asn1.c
@@ -61,6 +61,7 @@
 #include <openssl/dsa.h>
 #include <openssl/asn1.h>
 #include <openssl/asn1t.h>
+#include <openssl/rand.h>
 
 /* Override the default new methods */
 static int sig_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -87,7 +88,7 @@ ASN1_SEQUENCE_cb(DSA_SIG, sig_cb) = {
 	ASN1_SIMPLE(DSA_SIG, s, CBIGNUM)
 } ASN1_SEQUENCE_END_cb(DSA_SIG, DSA_SIG)
 
-IMPLEMENT_ASN1_FUNCTIONS_const(DSA_SIG)
+IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(DSA_SIG, DSA_SIG, DSA_SIG)
 
 /* Override the default free and new methods */
 static int dsa_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it,
@@ -148,3 +149,40 @@ DSA *DSAparams_dup(DSA *dsa)
 	{
 	return ASN1_item_dup(ASN1_ITEM_rptr(DSAparams), dsa);
 	}
+
+int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
+	     unsigned int *siglen, DSA *dsa)
+	{
+	DSA_SIG *s;
+	RAND_seed(dgst, dlen);
+	s=DSA_do_sign(dgst,dlen,dsa);
+	if (s == NULL)
+		{
+		*siglen=0;
+		return(0);
+		}
+	*siglen=i2d_DSA_SIG(s,&sig);
+	DSA_SIG_free(s);
+	return(1);
+	}
+
+/* data has already been hashed (probably with SHA or SHA-1). */
+/* returns
+ *      1: correct signature
+ *      0: incorrect signature
+ *     -1: error
+ */
+int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
+	     const unsigned char *sigbuf, int siglen, DSA *dsa)
+	{
+	DSA_SIG *s;
+	int ret=-1;
+
+	s = DSA_SIG_new();
+	if (s == NULL) return(ret);
+	if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
+	ret=DSA_do_verify(dgst,dgst_len,s,dsa);
+err:
+	DSA_SIG_free(s);
+	return(ret);
+	}
diff --git a/src/lib/libcrypto/dsa/dsa_err.c b/src/lib/libcrypto/dsa/dsa_err.c
index bba984e92e..00545b7b9f 100644
--- a/src/lib/libcrypto/dsa/dsa_err.c
+++ b/src/lib/libcrypto/dsa/dsa_err.c
@@ -1,6 +1,6 @@
 /* crypto/dsa/dsa_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -76,6 +76,8 @@ static ERR_STRING_DATA DSA_str_functs[]=
 {ERR_FUNC(DSA_F_DSAPARAMS_PRINT_FP),	"DSAparams_print_fp"},
 {ERR_FUNC(DSA_F_DSA_DO_SIGN),	"DSA_do_sign"},
 {ERR_FUNC(DSA_F_DSA_DO_VERIFY),	"DSA_do_verify"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_KEY),	"DSA_generate_key"},
+{ERR_FUNC(DSA_F_DSA_GENERATE_PARAMETERS_EX),	"DSA_generate_parameters_ex"},
 {ERR_FUNC(DSA_F_DSA_NEW_METHOD),	"DSA_new_method"},
 {ERR_FUNC(DSA_F_DSA_PARAM_DECODE),	"DSA_PARAM_DECODE"},
 {ERR_FUNC(DSA_F_DSA_PRINT_FP),	"DSA_print_fp"},
@@ -86,6 +88,7 @@ static ERR_STRING_DATA DSA_str_functs[]=
 {ERR_FUNC(DSA_F_DSA_SIGN),	"DSA_sign"},
 {ERR_FUNC(DSA_F_DSA_SIGN_SETUP),	"DSA_sign_setup"},
 {ERR_FUNC(DSA_F_DSA_SIG_NEW),	"DSA_SIG_new"},
+{ERR_FUNC(DSA_F_DSA_SIG_PRINT),	"DSA_SIG_PRINT"},
 {ERR_FUNC(DSA_F_DSA_VERIFY),	"DSA_verify"},
 {ERR_FUNC(DSA_F_I2D_DSA_SIG),	"i2d_DSA_SIG"},
 {ERR_FUNC(DSA_F_OLD_DSA_PRIV_DECODE),	"OLD_DSA_PRIV_DECODE"},
@@ -105,6 +108,8 @@ static ERR_STRING_DATA DSA_str_reasons[]=
 {ERR_REASON(DSA_R_INVALID_DIGEST_TYPE)   ,"invalid digest type"},
 {ERR_REASON(DSA_R_MISSING_PARAMETERS)    ,"missing parameters"},
 {ERR_REASON(DSA_R_MODULUS_TOO_LARGE)     ,"modulus too large"},
+{ERR_REASON(DSA_R_NEED_NEW_SETUP_VALUES) ,"need new setup values"},
+{ERR_REASON(DSA_R_NON_FIPS_DSA_METHOD)   ,"non fips dsa method"},
 {ERR_REASON(DSA_R_NO_PARAMETERS_SET)     ,"no parameters set"},
 {ERR_REASON(DSA_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"},
 {0,NULL}
diff --git a/src/lib/libcrypto/dsa/dsa_gen.c b/src/lib/libcrypto/dsa/dsa_gen.c
index cb0b4538a4..c398761d0d 100644
--- a/src/lib/libcrypto/dsa/dsa_gen.c
+++ b/src/lib/libcrypto/dsa/dsa_gen.c
@@ -81,13 +81,33 @@
 #include <openssl/sha.h>
 #include "dsa_locl.h"
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 int DSA_generate_parameters_ex(DSA *ret, int bits,
 		const unsigned char *seed_in, int seed_len,
 		int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(ret->meth->flags & DSA_FLAG_FIPS_METHOD)
+			&& !(ret->flags & DSA_FLAG_NON_FIPS_ALLOW))
+		{
+		DSAerr(DSA_F_DSA_GENERATE_PARAMETERS_EX, DSA_R_NON_FIPS_DSA_METHOD);
+		return 0;
+		}
+#endif
 	if(ret->meth->dsa_paramgen)
 		return ret->meth->dsa_paramgen(ret, bits, seed_in, seed_len,
 				counter_ret, h_ret, cb);
+#ifdef OPENSSL_FIPS
+	else if (FIPS_mode())
+		{
+		return FIPS_dsa_generate_parameters_ex(ret, bits, 
+							seed_in, seed_len,
+							counter_ret, h_ret, cb);
+		}
+#endif
 	else
 		{
 		const EVP_MD *evpmd;
@@ -105,12 +125,13 @@ int DSA_generate_parameters_ex(DSA *ret, int bits,
 			}
 
 		return dsa_builtin_paramgen(ret, bits, qbits, evpmd,
-				seed_in, seed_len, counter_ret, h_ret, cb);
+			seed_in, seed_len, NULL, counter_ret, h_ret, cb);
 		}
 	}
 
 int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
 	const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+	unsigned char *seed_out,
 	int *counter_ret, unsigned long *h_ret, BN_GENCB *cb)
 	{
 	int ok=0;
@@ -201,8 +222,10 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
 				}
 
 			/* step 2 */
-			EVP_Digest(seed, qsize, md,   NULL, evpmd, NULL);
-			EVP_Digest(buf,  qsize, buf2, NULL, evpmd, NULL);
+			if (!EVP_Digest(seed, qsize, md,   NULL, evpmd, NULL))
+				goto err;
+			if (!EVP_Digest(buf,  qsize, buf2, NULL, evpmd, NULL))
+				goto err;
 			for (i = 0; i < qsize; i++)
 				md[i]^=buf2[i];
 
@@ -251,7 +274,9 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
 						break;
 					}
 
-				EVP_Digest(buf, qsize, md ,NULL, evpmd, NULL);
+				if (!EVP_Digest(buf, qsize, md ,NULL, evpmd,
+									NULL))
+					goto err;
 
 				/* step 8 */
 				if (!BN_bin2bn(md, qsize, r0))
@@ -332,6 +357,8 @@ err:
 			}
 		if (counter_ret != NULL) *counter_ret=counter;
 		if (h_ret != NULL) *h_ret=h;
+		if (seed_out)
+			memcpy(seed_out, seed, qsize);
 		}
 	if(ctx)
 		{
diff --git a/src/lib/libcrypto/dsa/dsa_key.c b/src/lib/libcrypto/dsa/dsa_key.c
index c4aa86bc6d..9cf669b921 100644
--- a/src/lib/libcrypto/dsa/dsa_key.c
+++ b/src/lib/libcrypto/dsa/dsa_key.c
@@ -64,12 +64,28 @@
 #include <openssl/dsa.h>
 #include <openssl/rand.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 static int dsa_builtin_keygen(DSA *dsa);
 
 int DSA_generate_key(DSA *dsa)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+			&& !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+		{
+		DSAerr(DSA_F_DSA_GENERATE_KEY, DSA_R_NON_FIPS_DSA_METHOD);
+		return 0;
+		}
+#endif
 	if(dsa->meth->dsa_keygen)
 		return dsa->meth->dsa_keygen(dsa);
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		return FIPS_dsa_generate_key(dsa);
+#endif
 	return dsa_builtin_keygen(dsa);
 	}
 
diff --git a/src/lib/libcrypto/dsa/dsa_lib.c b/src/lib/libcrypto/dsa/dsa_lib.c
index e9b75902db..96d8d0c4b4 100644
--- a/src/lib/libcrypto/dsa/dsa_lib.c
+++ b/src/lib/libcrypto/dsa/dsa_lib.c
@@ -70,6 +70,10 @@
 #include <openssl/dh.h>
 #endif
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 const char DSA_version[]="DSA" OPENSSL_VERSION_PTEXT;
 
 static const DSA_METHOD *default_DSA_method = NULL;
@@ -82,7 +86,16 @@ void DSA_set_default_method(const DSA_METHOD *meth)
 const DSA_METHOD *DSA_get_default_method(void)
 	{
 	if(!default_DSA_method)
+		{
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return FIPS_dsa_openssl();
+		else
+			return DSA_OpenSSL();
+#else
 		default_DSA_method = DSA_OpenSSL();
+#endif
+		}
 	return default_DSA_method;
 	}
 
@@ -163,7 +176,7 @@ DSA *DSA_new_method(ENGINE *engine)
 	ret->method_mont_p=NULL;
 
 	ret->references=1;
-	ret->flags=ret->meth->flags;
+	ret->flags=ret->meth->flags & ~DSA_FLAG_NON_FIPS_ALLOW;
 	CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DSA, ret, &ret->ex_data);
 	if ((ret->meth->init != NULL) && !ret->meth->init(ret))
 		{
@@ -276,7 +289,8 @@ void *DSA_get_ex_data(DSA *d, int idx)
 DH *DSA_dup_DH(const DSA *r)
 	{
 	/* DSA has p, q, g, optional pub_key, optional priv_key.
-	 * DH has p, optional length, g, optional pub_key, optional priv_key.
+	 * DH has p, optional length, g, optional pub_key, optional priv_key,
+	 * optional q.
 	 */ 
 
 	DH *ret = NULL;
@@ -290,7 +304,11 @@ DH *DSA_dup_DH(const DSA *r)
 		if ((ret->p = BN_dup(r->p)) == NULL)
 			goto err;
 	if (r->q != NULL)
+		{
 		ret->length = BN_num_bits(r->q);
+		if ((ret->q = BN_dup(r->q)) == NULL)
+			goto err;
+		}
 	if (r->g != NULL)
 		if ((ret->g = BN_dup(r->g)) == NULL)
 			goto err;
diff --git a/src/lib/libcrypto/dsa/dsa_locl.h b/src/lib/libcrypto/dsa/dsa_locl.h
index 2b8cfee3db..21e2e45242 100644
--- a/src/lib/libcrypto/dsa/dsa_locl.h
+++ b/src/lib/libcrypto/dsa/dsa_locl.h
@@ -56,4 +56,5 @@
 
 int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits,
 	const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len,
+	unsigned char *seed_out,
 	int *counter_ret, unsigned long *h_ret, BN_GENCB *cb);
diff --git a/src/lib/libcrypto/dsa/dsa_ossl.c b/src/lib/libcrypto/dsa/dsa_ossl.c
index a3ddd7d281..b3d78e524c 100644
--- a/src/lib/libcrypto/dsa/dsa_ossl.c
+++ b/src/lib/libcrypto/dsa/dsa_ossl.c
@@ -136,6 +136,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
 	BN_CTX *ctx=NULL;
 	int reason=ERR_R_BN_LIB;
 	DSA_SIG *ret=NULL;
+	int noredo = 0;
 
 	BN_init(&m);
 	BN_init(&xr);
@@ -150,7 +151,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
 	if (s == NULL) goto err;
 	ctx=BN_CTX_new();
 	if (ctx == NULL) goto err;
-
+redo:
 	if ((dsa->kinv == NULL) || (dsa->r == NULL))
 		{
 		if (!DSA_sign_setup(dsa,ctx,&kinv,&r)) goto err;
@@ -161,6 +162,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
 		dsa->kinv=NULL;
 		r=dsa->r;
 		dsa->r=NULL;
+		noredo = 1;
 		}
 
 	
@@ -181,6 +183,18 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
 
 	ret=DSA_SIG_new();
 	if (ret == NULL) goto err;
+	/* Redo if r or s is zero as required by FIPS 186-3: this is
+	 * very unlikely.
+	 */
+	if (BN_is_zero(r) || BN_is_zero(s))
+		{
+		if (noredo)
+			{
+			reason = DSA_R_NEED_NEW_SETUP_VALUES;
+			goto err;
+			}
+		goto redo;
+		}
 	ret->r = r;
 	ret->s = s;
 	
diff --git a/src/lib/libcrypto/dsa/dsa_pmeth.c b/src/lib/libcrypto/dsa/dsa_pmeth.c
index e2df54fec6..715d8d675b 100644
--- a/src/lib/libcrypto/dsa/dsa_pmeth.c
+++ b/src/lib/libcrypto/dsa/dsa_pmeth.c
@@ -189,7 +189,9 @@ static int pkey_dsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
 		    EVP_MD_type((const EVP_MD *)p2) != NID_dsa    &&
 		    EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA    &&
 		    EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
-		    EVP_MD_type((const EVP_MD *)p2) != NID_sha256)
+		    EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
+		    EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&
+		    EVP_MD_type((const EVP_MD *)p2) != NID_sha512)
 			{
 			DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE);
 			return 0;
@@ -253,7 +255,7 @@ static int pkey_dsa_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
 	if (!dsa)
 		return 0;
 	ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd,
-	                           NULL, 0, NULL, NULL, pcb);
+	                           NULL, 0, NULL, NULL, NULL, pcb);
 	if (ret)
 		EVP_PKEY_assign_DSA(pkey, dsa);
 	else
diff --git a/src/lib/libcrypto/dsa/dsa_sign.c b/src/lib/libcrypto/dsa/dsa_sign.c
index 17555e5892..c3cc3642ce 100644
--- a/src/lib/libcrypto/dsa/dsa_sign.c
+++ b/src/lib/libcrypto/dsa/dsa_sign.c
@@ -61,30 +61,54 @@
 #include "cryptlib.h"
 #include <openssl/dsa.h>
 #include <openssl/rand.h>
+#include <openssl/bn.h>
 
 DSA_SIG * DSA_do_sign(const unsigned char *dgst, int dlen, DSA *dsa)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+			&& !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+		{
+		DSAerr(DSA_F_DSA_DO_SIGN, DSA_R_NON_FIPS_DSA_METHOD);
+		return NULL;
+		}
+#endif
 	return dsa->meth->dsa_do_sign(dgst, dlen, dsa);
 	}
 
-int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig,
-	     unsigned int *siglen, DSA *dsa)
+int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
 	{
-	DSA_SIG *s;
-	RAND_seed(dgst, dlen);
-	s=DSA_do_sign(dgst,dlen,dsa);
-	if (s == NULL)
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+			&& !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
 		{
-		*siglen=0;
-		return(0);
+		DSAerr(DSA_F_DSA_SIGN_SETUP, DSA_R_NON_FIPS_DSA_METHOD);
+		return 0;
 		}
-	*siglen=i2d_DSA_SIG(s,&sig);
-	DSA_SIG_free(s);
-	return(1);
+#endif
+	return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp);
 	}
 
-int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp)
+DSA_SIG *DSA_SIG_new(void)
 	{
-	return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp);
+	DSA_SIG *sig;
+	sig = OPENSSL_malloc(sizeof(DSA_SIG));
+	if (!sig)
+		return NULL;
+	sig->r = NULL;
+	sig->s = NULL;
+	return sig;
+	}
+
+void DSA_SIG_free(DSA_SIG *sig)
+	{
+	if (sig)
+		{
+		if (sig->r)
+			BN_free(sig->r);
+		if (sig->s)
+			BN_free(sig->s);
+		OPENSSL_free(sig);
+		}
 	}
 
diff --git a/src/lib/libcrypto/dsa/dsa_vrf.c b/src/lib/libcrypto/dsa/dsa_vrf.c
index 226a75ff3f..674cb5fa5f 100644
--- a/src/lib/libcrypto/dsa/dsa_vrf.c
+++ b/src/lib/libcrypto/dsa/dsa_vrf.c
@@ -64,26 +64,13 @@
 int DSA_do_verify(const unsigned char *dgst, int dgst_len, DSA_SIG *sig,
 		  DSA *dsa)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD)
+			&& !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW))
+		{
+		DSAerr(DSA_F_DSA_DO_VERIFY, DSA_R_NON_FIPS_DSA_METHOD);
+		return -1;
+		}
+#endif
 	return dsa->meth->dsa_do_verify(dgst, dgst_len, sig, dsa);
 	}
-
-/* data has already been hashed (probably with SHA or SHA-1). */
-/* returns
- *      1: correct signature
- *      0: incorrect signature
- *     -1: error
- */
-int DSA_verify(int type, const unsigned char *dgst, int dgst_len,
-	     const unsigned char *sigbuf, int siglen, DSA *dsa)
-	{
-	DSA_SIG *s;
-	int ret=-1;
-
-	s = DSA_SIG_new();
-	if (s == NULL) return(ret);
-	if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err;
-	ret=DSA_do_verify(dgst,dgst_len,s,dsa);
-err:
-	DSA_SIG_free(s);
-	return(ret);
-	}
diff --git a/src/lib/libcrypto/dso/dso_dlfcn.c b/src/lib/libcrypto/dso/dso_dlfcn.c
index c2bc61760b..5f2254806c 100644
--- a/src/lib/libcrypto/dso/dso_dlfcn.c
+++ b/src/lib/libcrypto/dso/dso_dlfcn.c
@@ -86,7 +86,8 @@ DSO_METHOD *DSO_METHOD_dlfcn(void)
 # if defined(_AIX) || defined(__CYGWIN__) || \
      defined(__SCO_VERSION__) || defined(_SCO_ELF) || \
      (defined(__osf__) && !defined(RTLD_NEXT))     || \
-     (defined(__OpenBSD__) && !defined(RTLD_SELF))
+     (defined(__OpenBSD__) && !defined(RTLD_SELF)) || \
+	defined(__ANDROID__)
 #  undef HAVE_DLINFO
 # endif
 #endif
diff --git a/src/lib/libcrypto/ec/ec.h b/src/lib/libcrypto/ec/ec.h
index ee7078130c..9d01325af3 100644
--- a/src/lib/libcrypto/ec/ec.h
+++ b/src/lib/libcrypto/ec/ec.h
@@ -151,7 +151,24 @@ const EC_METHOD *EC_GFp_mont_method(void);
  */
 const EC_METHOD *EC_GFp_nist_method(void);
 
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+/** Returns 64-bit optimized methods for nistp224
+ *  \return  EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp224_method(void);
+
+/** Returns 64-bit optimized methods for nistp256
+ *  \return  EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp256_method(void);
+
+/** Returns 64-bit optimized methods for nistp521
+ *  \return  EC_METHOD object
+ */
+const EC_METHOD *EC_GFp_nistp521_method(void);
+#endif
 
+#ifndef OPENSSL_NO_EC2M
 /********************************************************************/ 
 /*           EC_METHOD for curves over GF(2^m)                      */
 /********************************************************************/
@@ -161,6 +178,8 @@ const EC_METHOD *EC_GFp_nist_method(void);
  */
 const EC_METHOD *EC_GF2m_simple_method(void);
 
+#endif
+
 
 /********************************************************************/
 /*                   EC_GROUP functions                             */
@@ -282,6 +301,7 @@ int EC_GROUP_set_curve_GFp(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, co
  */
 int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
 
+#ifndef OPENSSL_NO_EC2M
 /** Sets the parameter of a ec over GF2m defined by y^2 + x*y = x^3 + a*x^2 + b
  *  \param  group  EC_GROUP object
  *  \param  p      BIGNUM with the polynomial defining the underlying field
@@ -301,7 +321,7 @@ int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, c
  *  \return 1 on success and 0 if an error occured
  */
 int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx);
-
+#endif
 /** Returns the number of bits needed to represent a field element 
  *  \param  group  EC_GROUP object
  *  \return number of bits needed to represent a field element
@@ -342,7 +362,7 @@ int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ctx);
  *  \return newly created EC_GROUP object with the specified parameters
  */
 EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
 /** Creates a new EC_GROUP object with the specified parameters defined
  *  over GF2m (defined by the equation y^2 + x*y = x^3 + a*x^2 + b)
  *  \param  p    BIGNUM with the polynomial defining the underlying field
@@ -352,7 +372,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
  *  \return newly created EC_GROUP object with the specified parameters
  */
 EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
-
+#endif
 /** Creates a EC_GROUP object with a curve specified by a NID
  *  \param  nid  NID of the OID of the curve name
  *  \return newly created EC_GROUP object with specified curve or NULL
@@ -481,7 +501,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group,
  */
 int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *p,
 	const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#ifndef OPENSSL_NO_EC2M
 /** Sets the affine coordinates of a EC_POINT over GF2m
  *  \param  group  underlying EC_GROUP object
  *  \param  p      EC_POINT object
@@ -514,7 +534,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group,
  */
 int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p,
 	const BIGNUM *x, int y_bit, BN_CTX *ctx);
-
+#endif
 /** Encodes a EC_POINT object to a octet string
  *  \param  group  underlying EC_GROUP object
  *  \param  p      EC_POINT object
@@ -653,9 +673,11 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group);
 /* EC_GROUP_get_basis_type() returns the NID of the basis type
  * used to represent the field elements */
 int EC_GROUP_get_basis_type(const EC_GROUP *);
+#ifndef OPENSSL_NO_EC2M
 int EC_GROUP_get_trinomial_basis(const EC_GROUP *, unsigned int *k);
 int EC_GROUP_get_pentanomial_basis(const EC_GROUP *, unsigned int *k1, 
 	unsigned int *k2, unsigned int *k3);
+#endif
 
 #define OPENSSL_EC_NAMED_CURVE	0x001
 
@@ -689,11 +711,21 @@ typedef struct ec_key_st EC_KEY;
 #define EC_PKEY_NO_PARAMETERS	0x001
 #define EC_PKEY_NO_PUBKEY	0x002
 
+/* some values for the flags field */
+#define EC_FLAG_NON_FIPS_ALLOW	0x1
+#define EC_FLAG_FIPS_CHECKED	0x2
+
 /** Creates a new EC_KEY object.
  *  \return EC_KEY object or NULL if an error occurred.
  */
 EC_KEY *EC_KEY_new(void);
 
+int EC_KEY_get_flags(const EC_KEY *key);
+
+void EC_KEY_set_flags(EC_KEY *key, int flags);
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags);
+
 /** Creates a new EC_KEY object using a named curve as underlying
  *  EC_GROUP object.
  *  \param  nid  NID of the named curve.
@@ -799,6 +831,15 @@ int EC_KEY_generate_key(EC_KEY *key);
  */
 int EC_KEY_check_key(const EC_KEY *key);
 
+/** Sets a public key from affine coordindates performing
+ *  neccessary NIST PKV tests.
+ *  \param  key  the EC_KEY object
+ *  \param  x    public key x coordinate
+ *  \param  y    public key y coordinate
+ *  \return 1 on success and 0 otherwise.
+ */
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y);
+
 
 /********************************************************************/
 /*        de- and encoding functions for SEC1 ECPrivateKey          */
@@ -926,6 +967,7 @@ void ERR_load_EC_strings(void);
 /* Error codes for the EC functions. */
 
 /* Function codes. */
+#define EC_F_BN_TO_FELEM				 224
 #define EC_F_COMPUTE_WNAF				 143
 #define EC_F_D2I_ECPARAMETERS				 144
 #define EC_F_D2I_ECPKPARAMETERS				 145
@@ -968,6 +1010,15 @@ void ERR_load_EC_strings(void);
 #define EC_F_EC_GFP_MONT_FIELD_SQR			 132
 #define EC_F_EC_GFP_MONT_GROUP_SET_CURVE		 189
 #define EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP		 135
+#define EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE		 225
+#define EC_F_EC_GFP_NISTP224_POINTS_MUL			 228
+#define EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES 226
+#define EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE		 230
+#define EC_F_EC_GFP_NISTP256_POINTS_MUL			 231
+#define EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES 232
+#define EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE		 233
+#define EC_F_EC_GFP_NISTP521_POINTS_MUL			 234
+#define EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES 235
 #define EC_F_EC_GFP_NIST_FIELD_MUL			 200
 #define EC_F_EC_GFP_NIST_FIELD_SQR			 201
 #define EC_F_EC_GFP_NIST_GROUP_SET_CURVE		 202
@@ -1010,6 +1061,7 @@ void ERR_load_EC_strings(void);
 #define EC_F_EC_KEY_NEW					 182
 #define EC_F_EC_KEY_PRINT				 180
 #define EC_F_EC_KEY_PRINT_FP				 181
+#define EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES	 229
 #define EC_F_EC_POINTS_MAKE_AFFINE			 136
 #define EC_F_EC_POINT_ADD				 112
 #define EC_F_EC_POINT_CMP				 113
@@ -1040,6 +1092,9 @@ void ERR_load_EC_strings(void);
 #define EC_F_I2D_ECPKPARAMETERS				 191
 #define EC_F_I2D_ECPRIVATEKEY				 192
 #define EC_F_I2O_ECPUBLICKEY				 151
+#define EC_F_NISTP224_PRE_COMP_NEW			 227
+#define EC_F_NISTP256_PRE_COMP_NEW			 236
+#define EC_F_NISTP521_PRE_COMP_NEW			 237
 #define EC_F_O2I_ECPUBLICKEY				 152
 #define EC_F_OLD_EC_PRIV_DECODE				 222
 #define EC_F_PKEY_EC_CTRL				 197
@@ -1052,12 +1107,15 @@ void ERR_load_EC_strings(void);
 /* Reason codes. */
 #define EC_R_ASN1_ERROR					 115
 #define EC_R_ASN1_UNKNOWN_FIELD				 116
+#define EC_R_BIGNUM_OUT_OF_RANGE			 144
 #define EC_R_BUFFER_TOO_SMALL				 100
+#define EC_R_COORDINATES_OUT_OF_RANGE			 146
 #define EC_R_D2I_ECPKPARAMETERS_FAILURE			 117
 #define EC_R_DECODE_ERROR				 142
 #define EC_R_DISCRIMINANT_IS_ZERO			 118
 #define EC_R_EC_GROUP_NEW_BY_NAME_FAILURE		 119
 #define EC_R_FIELD_TOO_LARGE				 143
+#define EC_R_GF2M_NOT_SUPPORTED				 147
 #define EC_R_GROUP2PKPARAMETERS_FAILURE			 120
 #define EC_R_I2D_ECPKPARAMETERS_FAILURE			 121
 #define EC_R_INCOMPATIBLE_OBJECTS			 101
@@ -1092,6 +1150,7 @@ void ERR_load_EC_strings(void);
 #define EC_R_UNKNOWN_GROUP				 129
 #define EC_R_UNKNOWN_ORDER				 114
 #define EC_R_UNSUPPORTED_FIELD				 131
+#define EC_R_WRONG_CURVE_PARAMETERS			 145
 #define EC_R_WRONG_ORDER				 130
 
 #ifdef  __cplusplus
diff --git a/src/lib/libcrypto/ec/ec2_mult.c b/src/lib/libcrypto/ec/ec2_mult.c
index e12b9b284a..26f4a783fc 100644
--- a/src/lib/libcrypto/ec/ec2_mult.c
+++ b/src/lib/libcrypto/ec/ec2_mult.c
@@ -71,6 +71,8 @@
 
 #include "ec_lcl.h"
 
+#ifndef OPENSSL_NO_EC2M
+
 
 /* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective 
  * coordinates.
@@ -384,3 +386,5 @@ int ec_GF2m_have_precompute_mult(const EC_GROUP *group)
 	{
 	return ec_wNAF_have_precompute_mult(group);
  	}
+
+#endif
diff --git a/src/lib/libcrypto/ec/ec2_oct.c b/src/lib/libcrypto/ec/ec2_oct.c
new file mode 100644
index 0000000000..f1d75e5ddf
--- /dev/null
+++ b/src/lib/libcrypto/ec/ec2_oct.c
@@ -0,0 +1,407 @@
+/* crypto/ec/ec2_oct.c */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * The Elliptic Curve Public-Key Crypto Library (ECC Code) included
+ * herein is developed by SUN MICROSYSTEMS, INC., and is contributed
+ * to the OpenSSL project.
+ *
+ * The ECC Code is licensed pursuant to the OpenSSL open source
+ * license provided below.
+ *
+ * The software is originally written by Sheueling Chang Shantz and
+ * Douglas Stebila of Sun Microsystems Laboratories.
+ *
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+#include <openssl/err.h>
+
+#include "ec_lcl.h"
+
+#ifndef OPENSSL_NO_EC2M
+
+/* Calculates and sets the affine coordinates of an EC_POINT from the given
+ * compressed coordinates.  Uses algorithm 2.3.4 of SEC 1. 
+ * Note that the simple implementation only uses affine coordinates.
+ *
+ * The method is from the following publication:
+ * 
+ *     Harper, Menezes, Vanstone:
+ *     "Public-Key Cryptosystems with Very Small Key Lengths",
+ *     EUROCRYPT '92, Springer-Verlag LNCS 658,
+ *     published February 1993
+ *
+ * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
+ * the same method, but claim no priority date earlier than July 29, 1994
+ * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
+ */
+int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+	const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+	{
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *tmp, *x, *y, *z;
+	int ret = 0, z0;
+
+	/* clear error queue */
+	ERR_clear_error();
+
+	if (ctx == NULL)
+		{
+		ctx = new_ctx = BN_CTX_new();
+		if (ctx == NULL)
+			return 0;
+		}
+
+	y_bit = (y_bit != 0) ? 1 : 0;
+
+	BN_CTX_start(ctx);
+	tmp = BN_CTX_get(ctx);
+	x = BN_CTX_get(ctx);
+	y = BN_CTX_get(ctx);
+	z = BN_CTX_get(ctx);
+	if (z == NULL) goto err;
+
+	if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
+	if (BN_is_zero(x))
+		{
+		if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
+		}
+	else
+		{
+		if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
+		if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
+		if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
+		if (!BN_GF2m_add(tmp, x, tmp)) goto err;
+		if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
+			{
+			unsigned long err = ERR_peek_last_error();
+			
+			if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
+				{
+				ERR_clear_error();
+				ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+				}
+			else
+				ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+			goto err;
+			}
+		z0 = (BN_is_odd(z)) ? 1 : 0;
+		if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
+		if (z0 != y_bit)
+			{
+			if (!BN_GF2m_add(y, y, x)) goto err;
+			}
+		}
+
+	if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+	ret = 1;
+
+ err:
+	BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+	}
+
+
+/* Converts an EC_POINT to an octet string.  
+ * If buf is NULL, the encoded length will be returned.
+ * If the length len of buf is smaller than required an error will be returned.
+ */
+size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+	unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	size_t ret;
+	BN_CTX *new_ctx = NULL;
+	int used_ctx = 0;
+	BIGNUM *x, *y, *yxi;
+	size_t field_len, i, skip;
+
+	if ((form != POINT_CONVERSION_COMPRESSED)
+		&& (form != POINT_CONVERSION_UNCOMPRESSED)
+		&& (form != POINT_CONVERSION_HYBRID))
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+		goto err;
+		}
+
+	if (EC_POINT_is_at_infinity(group, point))
+		{
+		/* encodes to a single 0 octet */
+		if (buf != NULL)
+			{
+			if (len < 1)
+				{
+				ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+				return 0;
+				}
+			buf[0] = 0;
+			}
+		return 1;
+		}
+
+
+	/* ret := required output buffer length */
+	field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+	ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+	/* if 'buf' is NULL, just return required length */
+	if (buf != NULL)
+		{
+		if (len < ret)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+			goto err;
+			}
+
+		if (ctx == NULL)
+			{
+			ctx = new_ctx = BN_CTX_new();
+			if (ctx == NULL)
+				return 0;
+			}
+
+		BN_CTX_start(ctx);
+		used_ctx = 1;
+		x = BN_CTX_get(ctx);
+		y = BN_CTX_get(ctx);
+		yxi = BN_CTX_get(ctx);
+		if (yxi == NULL) goto err;
+
+		if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+
+		buf[0] = form;
+		if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
+			{
+			if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+			if (BN_is_odd(yxi)) buf[0]++;
+			}
+
+		i = 1;
+		
+		skip = field_len - BN_num_bytes(x);
+		if (skip > field_len)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+		while (skip > 0)
+			{
+			buf[i++] = 0;
+			skip--;
+			}
+		skip = BN_bn2bin(x, buf + i);
+		i += skip;
+		if (i != 1 + field_len)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+
+		if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+			{
+			skip = field_len - BN_num_bytes(y);
+			if (skip > field_len)
+				{
+				ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			while (skip > 0)
+				{
+				buf[i++] = 0;
+				skip--;
+				}
+			skip = BN_bn2bin(y, buf + i);
+			i += skip;
+			}
+
+		if (i != ret)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+		}
+	
+	if (used_ctx)
+		BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+
+ err:
+	if (used_ctx)
+		BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return 0;
+	}
+
+
+/* Converts an octet string representation to an EC_POINT. 
+ * Note that the simple implementation only uses affine coordinates.
+ */
+int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+	const unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	point_conversion_form_t form;
+	int y_bit;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *x, *y, *yxi;
+	size_t field_len, enc_len;
+	int ret = 0;
+
+	if (len == 0)
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+		return 0;
+		}
+	form = buf[0];
+	y_bit = form & 1;
+	form = form & ~1U;
+	if ((form != 0)	&& (form != POINT_CONVERSION_COMPRESSED)
+		&& (form != POINT_CONVERSION_UNCOMPRESSED)
+		&& (form != POINT_CONVERSION_HYBRID))
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+	if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+
+	if (form == 0)
+		{
+		if (len != 1)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+			return 0;
+			}
+
+		return EC_POINT_set_to_infinity(group, point);
+		}
+	
+	field_len = (EC_GROUP_get_degree(group) + 7) / 8;
+	enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+	if (len != enc_len)
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+
+	if (ctx == NULL)
+		{
+		ctx = new_ctx = BN_CTX_new();
+		if (ctx == NULL)
+			return 0;
+		}
+
+	BN_CTX_start(ctx);
+	x = BN_CTX_get(ctx);
+	y = BN_CTX_get(ctx);
+	yxi = BN_CTX_get(ctx);
+	if (yxi == NULL) goto err;
+
+	if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+	if (BN_ucmp(x, &group->field) >= 0)
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		goto err;
+		}
+
+	if (form == POINT_CONVERSION_COMPRESSED)
+		{
+		if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
+		}
+	else
+		{
+		if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+		if (BN_ucmp(y, &group->field) >= 0)
+			{
+			ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+			goto err;
+			}
+		if (form == POINT_CONVERSION_HYBRID)
+			{
+			if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
+			if (y_bit != BN_is_odd(yxi))
+				{
+				ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+				goto err;
+				}
+			}
+
+		if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
+		}
+	
+	if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+		{
+		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+		goto err;
+		}
+
+	ret = 1;
+	
+ err:
+	BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+	}
+#endif
diff --git a/src/lib/libcrypto/ec/ec2_smpl.c b/src/lib/libcrypto/ec/ec2_smpl.c
index 03deae6674..e0e59c7d82 100644
--- a/src/lib/libcrypto/ec/ec2_smpl.c
+++ b/src/lib/libcrypto/ec/ec2_smpl.c
@@ -71,10 +71,20 @@
 
 #include "ec_lcl.h"
 
+#ifndef OPENSSL_NO_EC2M
+
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 
 const EC_METHOD *EC_GF2m_simple_method(void)
 	{
+#ifdef OPENSSL_FIPS
+	return fips_ec_gf2m_simple_method();
+#else
 	static const EC_METHOD ret = {
+		EC_FLAGS_DEFAULT_OCT,
 		NID_X9_62_characteristic_two_field,
 		ec_GF2m_simple_group_init,
 		ec_GF2m_simple_group_finish,
@@ -93,9 +103,7 @@ const EC_METHOD *EC_GF2m_simple_method(void)
 		0 /* get_Jprojective_coordinates_GFp */,
 		ec_GF2m_simple_point_set_affine_coordinates,
 		ec_GF2m_simple_point_get_affine_coordinates,
-		ec_GF2m_simple_set_compressed_coordinates,
-		ec_GF2m_simple_point2oct,
-		ec_GF2m_simple_oct2point,
+		0,0,0,
 		ec_GF2m_simple_add,
 		ec_GF2m_simple_dbl,
 		ec_GF2m_simple_invert,
@@ -118,6 +126,7 @@ const EC_METHOD *EC_GF2m_simple_method(void)
 		0 /* field_set_to_one */ };
 
 	return &ret;
+#endif
 	}
 
 
@@ -405,340 +414,6 @@ int ec_GF2m_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_
 	return ret;
 	}
 
-
-/* Calculates and sets the affine coordinates of an EC_POINT from the given
- * compressed coordinates.  Uses algorithm 2.3.4 of SEC 1. 
- * Note that the simple implementation only uses affine coordinates.
- *
- * The method is from the following publication:
- * 
- *     Harper, Menezes, Vanstone:
- *     "Public-Key Cryptosystems with Very Small Key Lengths",
- *     EUROCRYPT '92, Springer-Verlag LNCS 658,
- *     published February 1993
- *
- * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe
- * the same method, but claim no priority date earlier than July 29, 1994
- * (and additionally fail to cite the EUROCRYPT '92 publication as prior art).
- */
-int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
-	const BIGNUM *x_, int y_bit, BN_CTX *ctx)
-	{
-	BN_CTX *new_ctx = NULL;
-	BIGNUM *tmp, *x, *y, *z;
-	int ret = 0, z0;
-
-	/* clear error queue */
-	ERR_clear_error();
-
-	if (ctx == NULL)
-		{
-		ctx = new_ctx = BN_CTX_new();
-		if (ctx == NULL)
-			return 0;
-		}
-
-	y_bit = (y_bit != 0) ? 1 : 0;
-
-	BN_CTX_start(ctx);
-	tmp = BN_CTX_get(ctx);
-	x = BN_CTX_get(ctx);
-	y = BN_CTX_get(ctx);
-	z = BN_CTX_get(ctx);
-	if (z == NULL) goto err;
-
-	if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err;
-	if (BN_is_zero(x))
-		{
-		if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err;
-		}
-	else
-		{
-		if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err;
-		if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err;
-		if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err;
-		if (!BN_GF2m_add(tmp, x, tmp)) goto err;
-		if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx))
-			{
-			unsigned long err = ERR_peek_last_error();
-			
-			if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION)
-				{
-				ERR_clear_error();
-				ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
-				}
-			else
-				ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
-			goto err;
-			}
-		z0 = (BN_is_odd(z)) ? 1 : 0;
-		if (!group->meth->field_mul(group, y, x, z, ctx)) goto err;
-		if (z0 != y_bit)
-			{
-			if (!BN_GF2m_add(y, y, x)) goto err;
-			}
-		}
-
-	if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
-	ret = 1;
-
- err:
-	BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-	}
-
-
-/* Converts an EC_POINT to an octet string.  
- * If buf is NULL, the encoded length will be returned.
- * If the length len of buf is smaller than required an error will be returned.
- */
-size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
-	unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	size_t ret;
-	BN_CTX *new_ctx = NULL;
-	int used_ctx = 0;
-	BIGNUM *x, *y, *yxi;
-	size_t field_len, i, skip;
-
-	if ((form != POINT_CONVERSION_COMPRESSED)
-		&& (form != POINT_CONVERSION_UNCOMPRESSED)
-		&& (form != POINT_CONVERSION_HYBRID))
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
-		goto err;
-		}
-
-	if (EC_POINT_is_at_infinity(group, point))
-		{
-		/* encodes to a single 0 octet */
-		if (buf != NULL)
-			{
-			if (len < 1)
-				{
-				ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
-				return 0;
-				}
-			buf[0] = 0;
-			}
-		return 1;
-		}
-
-
-	/* ret := required output buffer length */
-	field_len = (EC_GROUP_get_degree(group) + 7) / 8;
-	ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
-	/* if 'buf' is NULL, just return required length */
-	if (buf != NULL)
-		{
-		if (len < ret)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
-			goto err;
-			}
-
-		if (ctx == NULL)
-			{
-			ctx = new_ctx = BN_CTX_new();
-			if (ctx == NULL)
-				return 0;
-			}
-
-		BN_CTX_start(ctx);
-		used_ctx = 1;
-		x = BN_CTX_get(ctx);
-		y = BN_CTX_get(ctx);
-		yxi = BN_CTX_get(ctx);
-		if (yxi == NULL) goto err;
-
-		if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-
-		buf[0] = form;
-		if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x))
-			{
-			if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
-			if (BN_is_odd(yxi)) buf[0]++;
-			}
-
-		i = 1;
-		
-		skip = field_len - BN_num_bytes(x);
-		if (skip > field_len)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-		while (skip > 0)
-			{
-			buf[i++] = 0;
-			skip--;
-			}
-		skip = BN_bn2bin(x, buf + i);
-		i += skip;
-		if (i != 1 + field_len)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-
-		if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
-			{
-			skip = field_len - BN_num_bytes(y);
-			if (skip > field_len)
-				{
-				ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-				goto err;
-				}
-			while (skip > 0)
-				{
-				buf[i++] = 0;
-				skip--;
-				}
-			skip = BN_bn2bin(y, buf + i);
-			i += skip;
-			}
-
-		if (i != ret)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-		}
-	
-	if (used_ctx)
-		BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-
- err:
-	if (used_ctx)
-		BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return 0;
-	}
-
-
-/* Converts an octet string representation to an EC_POINT. 
- * Note that the simple implementation only uses affine coordinates.
- */
-int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
-	const unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	point_conversion_form_t form;
-	int y_bit;
-	BN_CTX *new_ctx = NULL;
-	BIGNUM *x, *y, *yxi;
-	size_t field_len, enc_len;
-	int ret = 0;
-
-	if (len == 0)
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
-		return 0;
-		}
-	form = buf[0];
-	y_bit = form & 1;
-	form = form & ~1U;
-	if ((form != 0)	&& (form != POINT_CONVERSION_COMPRESSED)
-		&& (form != POINT_CONVERSION_UNCOMPRESSED)
-		&& (form != POINT_CONVERSION_HYBRID))
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-	if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-
-	if (form == 0)
-		{
-		if (len != 1)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-			return 0;
-			}
-
-		return EC_POINT_set_to_infinity(group, point);
-		}
-	
-	field_len = (EC_GROUP_get_degree(group) + 7) / 8;
-	enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
-	if (len != enc_len)
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-
-	if (ctx == NULL)
-		{
-		ctx = new_ctx = BN_CTX_new();
-		if (ctx == NULL)
-			return 0;
-		}
-
-	BN_CTX_start(ctx);
-	x = BN_CTX_get(ctx);
-	y = BN_CTX_get(ctx);
-	yxi = BN_CTX_get(ctx);
-	if (yxi == NULL) goto err;
-
-	if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
-	if (BN_ucmp(x, &group->field) >= 0)
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		goto err;
-		}
-
-	if (form == POINT_CONVERSION_COMPRESSED)
-		{
-		if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err;
-		}
-	else
-		{
-		if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
-		if (BN_ucmp(y, &group->field) >= 0)
-			{
-			ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-			goto err;
-			}
-		if (form == POINT_CONVERSION_HYBRID)
-			{
-			if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err;
-			if (y_bit != BN_is_odd(yxi))
-				{
-				ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-				goto err;
-				}
-			}
-
-		if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err;
-		}
-	
-	if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
-		{
-		ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
-		goto err;
-		}
-
-	ret = 1;
-	
- err:
-	BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-	}
-
-
 /* Computes a + b and stores the result in r.  r could be a or b, a could be b.
  * Uses algorithm A.10.2 of IEEE P1363.
  */
@@ -1040,3 +715,5 @@ int ec_GF2m_simple_field_div(const EC_GROUP *group, BIGNUM *r, const BIGNUM *a,
 	{
 	return BN_GF2m_mod_div(r, a, b, &group->field, ctx);
 	}
+
+#endif
diff --git a/src/lib/libcrypto/ec/ec_ameth.c b/src/lib/libcrypto/ec/ec_ameth.c
index c00f7d746c..83909c1853 100644
--- a/src/lib/libcrypto/ec/ec_ameth.c
+++ b/src/lib/libcrypto/ec/ec_ameth.c
@@ -651,6 +651,7 @@ const EVP_PKEY_ASN1_METHOD eckey_asn1_meth =
 	ec_copy_parameters,
 	ec_cmp_parameters,
 	eckey_param_print,
+	0,
 
 	int_ec_free,
 	ec_pkey_ctrl,
diff --git a/src/lib/libcrypto/ec/ec_asn1.c b/src/lib/libcrypto/ec/ec_asn1.c
index ae55539859..175eec5342 100644
--- a/src/lib/libcrypto/ec/ec_asn1.c
+++ b/src/lib/libcrypto/ec/ec_asn1.c
@@ -83,7 +83,7 @@ int EC_GROUP_get_basis_type(const EC_GROUP *group)
 		/* everything else is currently not supported */
 		return 0;
 	}
-
+#ifndef OPENSSL_NO_EC2M
 int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k)
 	{
 	if (group == NULL)
@@ -101,7 +101,6 @@ int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k)
 
 	return 1;
 	}
-
 int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1,
 	unsigned int *k2, unsigned int *k3)
 	{
@@ -124,7 +123,7 @@ int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1,
 
 	return 1;
 	}
-
+#endif
 
 
 /* some structures needed for the asn1 encoding */
@@ -340,6 +339,12 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field)
 			}
 		}
 	else	/* nid == NID_X9_62_characteristic_two_field */
+#ifdef OPENSSL_NO_EC2M
+		{
+		ECerr(EC_F_EC_ASN1_GROUP2FIELDID, EC_R_GF2M_NOT_SUPPORTED);
+		goto err;
+		}
+#else
 		{
 		int		field_type;
 		X9_62_CHARACTERISTIC_TWO *char_two;
@@ -419,6 +424,7 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field)
 				}
 			}
 		}
+#endif
 
 	ok = 1;
 
@@ -456,6 +462,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve)
 			goto err;
 			}
 		}
+#ifndef OPENSSL_NO_EC2M
 	else	/* nid == NID_X9_62_characteristic_two_field */
 		{
 		if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL))
@@ -464,7 +471,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve)
 			goto err;
 			}
 		}
-
+#endif
 	len_1 = (size_t)BN_num_bytes(tmp_1);
 	len_2 = (size_t)BN_num_bytes(tmp_2);
 
@@ -775,8 +782,13 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params)
 
 	/* get the field parameters */
 	tmp = OBJ_obj2nid(params->fieldID->fieldType);
-
 	if (tmp == NID_X9_62_characteristic_two_field)
+#ifdef OPENSSL_NO_EC2M
+		{
+		ECerr(EC_F_EC_ASN1_PARAMETERS2GROUP, EC_R_GF2M_NOT_SUPPORTED);
+		goto err;
+		}
+#else
 		{
 		X9_62_CHARACTERISTIC_TWO *char_two;
 
@@ -862,6 +874,7 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params)
 		/* create the EC_GROUP structure */
 		ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL);
 		}
+#endif
 	else if (tmp == NID_X9_62_prime_field)
 		{
 		/* we have a curve over a prime field */
@@ -1065,6 +1078,7 @@ EC_GROUP *d2i_ECPKParameters(EC_GROUP **a, const unsigned char **in, long len)
 	if ((group = ec_asn1_pkparameters2group(params)) == NULL)
 		{
 		ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE);
+		ECPKPARAMETERS_free(params);
 		return NULL; 
 		}
 
diff --git a/src/lib/libcrypto/ec/ec_curve.c b/src/lib/libcrypto/ec/ec_curve.c
index 23274e4031..c72fb2697c 100644
--- a/src/lib/libcrypto/ec/ec_curve.c
+++ b/src/lib/libcrypto/ec/ec_curve.c
@@ -3,7 +3,7 @@
  * Written by Nils Larsch for the OpenSSL project.
  */
 /* ====================================================================
- * Copyright (c) 1998-2004 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -72,6 +72,7 @@
 #include "ec_lcl.h"
 #include <openssl/err.h>
 #include <openssl/obj_mac.h>
+#include <openssl/opensslconf.h>
 
 typedef struct {
 	int	field_type,	/* either NID_X9_62_prime_field or
@@ -703,6 +704,8 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+28*6]; }
 	  0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D }
 	};
 
+#ifndef OPENSSL_NO_EC2M
+
 /* characteristic two curves */
 static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; }
 	_EC_SECG_CHAR2_113R1 = {
@@ -1300,7 +1303,7 @@ static const struct { EC_CURVE_DATA h; unsigned char data[20+21*6]; }
 	{ 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76,	/* seed */
 	  0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD,
 
- 	  0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,	/* p */
+	  0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,	/* p */
 	  0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
 	  0x07,
 	  0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9,	/* a */
@@ -1817,103 +1820,128 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+24*6]; }
 	  0xBA,0xFC,0xA7,0x5E }
 	};
 
+#endif
+
 typedef struct _ec_list_element_st {
 	int	nid;
 	const EC_CURVE_DATA *data;
+	const EC_METHOD *(*meth)(void);
 	const char *comment;
 	} ec_list_element;
 
 static const ec_list_element curve_list[] = {
-	/* prime field curves */	
+	/* prime field curves */
 	/* secg curves */
-	{ NID_secp112r1, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"},
-	{ NID_secp112r2, &_EC_SECG_PRIME_112R2.h, "SECG curve over a 112 bit prime field"},
-	{ NID_secp128r1, &_EC_SECG_PRIME_128R1.h, "SECG curve over a 128 bit prime field"},
-	{ NID_secp128r2, &_EC_SECG_PRIME_128R2.h, "SECG curve over a 128 bit prime field"},
-	{ NID_secp160k1, &_EC_SECG_PRIME_160K1.h, "SECG curve over a 160 bit prime field"},
-	{ NID_secp160r1, &_EC_SECG_PRIME_160R1.h, "SECG curve over a 160 bit prime field"},
-	{ NID_secp160r2, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"},
+	{ NID_secp112r1, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+	{ NID_secp112r2, &_EC_SECG_PRIME_112R2.h, 0, "SECG curve over a 112 bit prime field" },
+	{ NID_secp128r1, &_EC_SECG_PRIME_128R1.h, 0, "SECG curve over a 128 bit prime field" },
+	{ NID_secp128r2, &_EC_SECG_PRIME_128R2.h, 0, "SECG curve over a 128 bit prime field" },
+	{ NID_secp160k1, &_EC_SECG_PRIME_160K1.h, 0, "SECG curve over a 160 bit prime field" },
+	{ NID_secp160r1, &_EC_SECG_PRIME_160R1.h, 0, "SECG curve over a 160 bit prime field" },
+	{ NID_secp160r2, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
 	/* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */
-	{ NID_secp192k1, &_EC_SECG_PRIME_192K1.h, "SECG curve over a 192 bit prime field"},
-	{ NID_secp224k1, &_EC_SECG_PRIME_224K1.h, "SECG curve over a 224 bit prime field"},
-	{ NID_secp224r1, &_EC_NIST_PRIME_224.h,   "NIST/SECG curve over a 224 bit prime field"},
-	{ NID_secp256k1, &_EC_SECG_PRIME_256K1.h, "SECG curve over a 256 bit prime field"},
+	{ NID_secp192k1, &_EC_SECG_PRIME_192K1.h, 0, "SECG curve over a 192 bit prime field" },
+	{ NID_secp224k1, &_EC_SECG_PRIME_224K1.h, 0, "SECG curve over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+	{ NID_secp224r1, &_EC_NIST_PRIME_224.h, EC_GFp_nistp224_method, "NIST/SECG curve over a 224 bit prime field" },
+#else
+	{ NID_secp224r1, &_EC_NIST_PRIME_224.h, 0, "NIST/SECG curve over a 224 bit prime field" },
+#endif
+	{ NID_secp256k1, &_EC_SECG_PRIME_256K1.h, 0, "SECG curve over a 256 bit prime field" },
 	/* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */
-	{ NID_secp384r1, &_EC_NIST_PRIME_384.h, "NIST/SECG curve over a 384 bit prime field"},
-	{ NID_secp521r1, &_EC_NIST_PRIME_521.h, "NIST/SECG curve over a 521 bit prime field"},
+	{ NID_secp384r1, &_EC_NIST_PRIME_384.h, 0, "NIST/SECG curve over a 384 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+	{ NID_secp521r1, &_EC_NIST_PRIME_521.h, EC_GFp_nistp521_method, "NIST/SECG curve over a 521 bit prime field" },
+#else
+	{ NID_secp521r1, &_EC_NIST_PRIME_521.h, 0, "NIST/SECG curve over a 521 bit prime field" },
+#endif
 	/* X9.62 curves */
-	{ NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, "NIST/X9.62/SECG curve over a 192 bit prime field"},
-	{ NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, "X9.62 curve over a 192 bit prime field"},
-	{ NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, "X9.62 curve over a 192 bit prime field"},
-	{ NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, "X9.62 curve over a 239 bit prime field"},
-	{ NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, "X9.62 curve over a 239 bit prime field"},
-	{ NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, "X9.62 curve over a 239 bit prime field"},
-	{ NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, "X9.62/SECG curve over a 256 bit prime field"},
+	{ NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, 0, "NIST/X9.62/SECG curve over a 192 bit prime field" },
+	{ NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, 0, "X9.62 curve over a 192 bit prime field" },
+	{ NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, 0, "X9.62 curve over a 192 bit prime field" },
+	{ NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" },
+	{ NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" },
+	{ NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" },
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+	{ NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 256 bit prime field" },
+#else
+	{ NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" },
+#endif
+#ifndef OPENSSL_NO_EC2M
 	/* characteristic two field curves */
 	/* NIST/SECG curves */
-	{ NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"},
-	{ NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, "SECG curve over a 113 bit binary field"},
-	{ NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, "SECG/WTLS curve over a 131 bit binary field"},
-	{ NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, "SECG curve over a 131 bit binary field"},
-	{ NID_sect163k1, &_EC_NIST_CHAR2_163K.h,  "NIST/SECG/WTLS curve over a 163 bit binary field" },
-	{ NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, "SECG curve over a 163 bit binary field"},
-	{ NID_sect163r2, &_EC_NIST_CHAR2_163B.h,  "NIST/SECG curve over a 163 bit binary field" },
-	{ NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, "SECG curve over a 193 bit binary field"},
-	{ NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, "SECG curve over a 193 bit binary field"},
-	{ NID_sect233k1, &_EC_NIST_CHAR2_233K.h,  "NIST/SECG/WTLS curve over a 233 bit binary field" },
-	{ NID_sect233r1, &_EC_NIST_CHAR2_233B.h,  "NIST/SECG/WTLS curve over a 233 bit binary field" },
-	{ NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, "SECG curve over a 239 bit binary field"},
-	{ NID_sect283k1, &_EC_NIST_CHAR2_283K.h,  "NIST/SECG curve over a 283 bit binary field" },
-	{ NID_sect283r1, &_EC_NIST_CHAR2_283B.h,  "NIST/SECG curve over a 283 bit binary field" },
-	{ NID_sect409k1, &_EC_NIST_CHAR2_409K.h,  "NIST/SECG curve over a 409 bit binary field" },
-	{ NID_sect409r1, &_EC_NIST_CHAR2_409B.h,  "NIST/SECG curve over a 409 bit binary field" },
-	{ NID_sect571k1, &_EC_NIST_CHAR2_571K.h,  "NIST/SECG curve over a 571 bit binary field" },
-	{ NID_sect571r1, &_EC_NIST_CHAR2_571B.h,  "NIST/SECG curve over a 571 bit binary field" },
+	{ NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+	{ NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, 0, "SECG curve over a 113 bit binary field" },
+	{ NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, 0, "SECG/WTLS curve over a 131 bit binary field" },
+	{ NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, 0, "SECG curve over a 131 bit binary field" },
+	{ NID_sect163k1, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+	{ NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, 0, "SECG curve over a 163 bit binary field" },
+	{ NID_sect163r2, &_EC_NIST_CHAR2_163B.h, 0, "NIST/SECG curve over a 163 bit binary field" },
+	{ NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, 0, "SECG curve over a 193 bit binary field" },
+	{ NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, 0, "SECG curve over a 193 bit binary field" },
+	{ NID_sect233k1, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+	{ NID_sect233r1, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+	{ NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, 0, "SECG curve over a 239 bit binary field" },
+	{ NID_sect283k1, &_EC_NIST_CHAR2_283K.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+	{ NID_sect283r1, &_EC_NIST_CHAR2_283B.h, 0, "NIST/SECG curve over a 283 bit binary field" },
+	{ NID_sect409k1, &_EC_NIST_CHAR2_409K.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+	{ NID_sect409r1, &_EC_NIST_CHAR2_409B.h, 0, "NIST/SECG curve over a 409 bit binary field" },
+	{ NID_sect571k1, &_EC_NIST_CHAR2_571K.h, 0, "NIST/SECG curve over a 571 bit binary field" },
+	{ NID_sect571r1, &_EC_NIST_CHAR2_571B.h, 0, "NIST/SECG curve over a 571 bit binary field" },
 	/* X9.62 curves */
-	{ NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
-	{ NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, "X9.62 curve over a 163 bit binary field"},
-	{ NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, "X9.62 curve over a 163 bit binary field"},
-	{ NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, "X9.62 curve over a 176 bit binary field"},
-	{ NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, "X9.62 curve over a 191 bit binary field"},
-	{ NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, "X9.62 curve over a 191 bit binary field"},
-	{ NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, "X9.62 curve over a 191 bit binary field"},
-	{ NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, "X9.62 curve over a 208 bit binary field"},
-	{ NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, "X9.62 curve over a 239 bit binary field"},
-	{ NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, "X9.62 curve over a 239 bit binary field"},
-	{ NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, "X9.62 curve over a 239 bit binary field"},
-	{ NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, "X9.62 curve over a 272 bit binary field"},
-	{ NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, "X9.62 curve over a 304 bit binary field"},
-	{ NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, "X9.62 curve over a 359 bit binary field"},
-	{ NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, "X9.62 curve over a 368 bit binary field"},
-	{ NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, "X9.62 curve over a 431 bit binary field"},
+	{ NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+	{ NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, 0, "X9.62 curve over a 163 bit binary field" },
+	{ NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, 0, "X9.62 curve over a 163 bit binary field" },
+	{ NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, 0, "X9.62 curve over a 176 bit binary field" },
+	{ NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, 0, "X9.62 curve over a 191 bit binary field" },
+	{ NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, 0, "X9.62 curve over a 191 bit binary field" },
+	{ NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, 0, "X9.62 curve over a 191 bit binary field" },
+	{ NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, 0, "X9.62 curve over a 208 bit binary field" },
+	{ NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, 0, "X9.62 curve over a 239 bit binary field" },
+	{ NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, 0, "X9.62 curve over a 239 bit binary field" },
+	{ NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, 0, "X9.62 curve over a 239 bit binary field" },
+	{ NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, 0, "X9.62 curve over a 272 bit binary field" },
+	{ NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, 0, "X9.62 curve over a 304 bit binary field" },
+	{ NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, 0, "X9.62 curve over a 359 bit binary field" },
+	{ NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, 0, "X9.62 curve over a 368 bit binary field" },
+	{ NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, 0, "X9.62 curve over a 431 bit binary field" },
 	/* the WAP/WTLS curves
 	 * [unlike SECG, spec has its own OIDs for curves from X9.62] */
-	{ NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, "WTLS curve over a 113 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h,   "NIST/SECG/WTLS curve over a 163 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h,  "SECG curve over a 113 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h,  "SECG/WTLS curve over a 112 bit prime field"},
-	{ NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h,  "SECG/WTLS curve over a 160 bit prime field"},
-	{ NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, "WTLS curve over a 112 bit prime field"},
-	{ NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, "WTLS curve over a 160 bit prime field" },
-	{ NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field"},
-	{ NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, "WTLS curvs over a 224 bit prime field"},
+	{ NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, 0, "WTLS curve over a 113 bit binary field" },
+	{ NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" },
+	{ NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" },
+	{ NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" },
+#endif
+	{ NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" },
+	{ NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" },
+	{ NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, 0, "WTLS curve over a 112 bit prime field" },
+	{ NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, 0, "WTLS curve over a 160 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
+	{ NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+	{ NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" },
+#endif
+	{ NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, 0, "WTLS curvs over a 224 bit prime field" },
+#ifndef OPENSSL_NO_EC2M
 	/* IPSec curves */
-	{ NID_ipsec3, &_EC_IPSEC_155_ID3.h, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
-	{ NID_ipsec4, &_EC_IPSEC_185_ID4.h, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"},
+	{ NID_ipsec3, &_EC_IPSEC_155_ID3.h, 0, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n"
+	  "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+	{ NID_ipsec4, &_EC_IPSEC_185_ID4.h, 0, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n"
+	  "\tNot suitable for ECDSA.\n\tQuestionable extension field!" },
+#endif
 };
 
 #define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element))
 
-static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
+static EC_GROUP *ec_group_new_from_data(const ec_list_element curve)
 	{
 	EC_GROUP *group=NULL;
 	EC_POINT *P=NULL;
 	BN_CTX	 *ctx=NULL;
-	BIGNUM 	 *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
+	BIGNUM	 *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL;
 	int	 ok=0;
 	int	 seed_len,param_len;
+	const EC_METHOD *meth;
+	const EC_CURVE_DATA *data;
 	const unsigned char *params;
 
 	if ((ctx = BN_CTX_new()) == NULL)
@@ -1922,10 +1950,11 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
 		goto err;
 		}
 
+	data = curve.data;
 	seed_len  = data->seed_len;
 	param_len = data->param_len;
-	params    = (const unsigned char *)(data+1);	/* skip header */
-	params   += seed_len;				/* skip seed   */
+	params	  = (const unsigned char *)(data+1);	/* skip header */
+	params	 += seed_len;				/* skip seed   */
 
 	if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL))
 		|| !(a = BN_bin2bn(params+1*param_len, param_len, NULL))
@@ -1935,7 +1964,17 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
 		goto err;
 		}
 
-	if (data->field_type == NID_X9_62_prime_field)
+	if (curve.meth != 0)
+		{
+		meth = curve.meth();
+		if (((group = EC_GROUP_new(meth)) == NULL) ||
+			(!(group->meth->group_set_curve(group, p, a, b, ctx))))
+			{
+			ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
+			goto err;
+			}
+		}
+	else if (data->field_type == NID_X9_62_prime_field)
 		{
 		if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL)
 			{
@@ -1943,6 +1982,7 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
 			goto err;
 			}
 		}
+#ifndef OPENSSL_NO_EC2M
 	else	/* field_type == NID_X9_62_characteristic_two_field */
 		{
 		if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL)
@@ -1951,20 +1991,21 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data)
 			goto err;
 			}
 		}
+#endif
 
 	if ((P = EC_POINT_new(group)) == NULL)
 		{
 		ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
 		goto err;
 		}
-	
+
 	if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL))
 		|| !(y = BN_bin2bn(params+4*param_len, param_len, NULL)))
 		{
 		ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB);
 		goto err;
 		}
-	if (!EC_POINT_set_affine_coordinates_GF2m(group, P, x, y, ctx))
+	if (!EC_POINT_set_affine_coordinates_GFp(group, P, x, y, ctx))
 		{
 		ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB);
 		goto err;
@@ -2025,7 +2066,7 @@ EC_GROUP *EC_GROUP_new_by_curve_name(int nid)
 	for (i=0; i<curve_list_length; i++)
 		if (curve_list[i].nid == nid)
 			{
-			ret = ec_group_new_from_data(curve_list[i].data);
+			ret = ec_group_new_from_data(curve_list[i]);
 			break;
 			}
 
diff --git a/src/lib/libcrypto/ec/ec_cvt.c b/src/lib/libcrypto/ec/ec_cvt.c
index d45640bab9..bfcbab35fe 100644
--- a/src/lib/libcrypto/ec/ec_cvt.c
+++ b/src/lib/libcrypto/ec/ec_cvt.c
@@ -78,7 +78,32 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
 	const EC_METHOD *meth;
 	EC_GROUP *ret;
 
+#if defined(OPENSSL_BN_ASM_MONT)
+	/*
+	 * This might appear controversial, but the fact is that generic
+	 * prime method was observed to deliver better performance even
+	 * for NIST primes on a range of platforms, e.g.: 60%-15%
+	 * improvement on IA-64, ~25% on ARM, 30%-90% on P4, 20%-25%
+	 * in 32-bit build and 35%--12% in 64-bit build on Core2...
+	 * Coefficients are relative to optimized bn_nist.c for most
+	 * intensive ECDSA verify and ECDH operations for 192- and 521-
+	 * bit keys respectively. Choice of these boundary values is
+	 * arguable, because the dependency of improvement coefficient
+	 * from key length is not a "monotone" curve. For example while
+	 * 571-bit result is 23% on ARM, 384-bit one is -1%. But it's
+	 * generally faster, sometimes "respectfully" faster, sometimes
+	 * "tolerably" slower... What effectively happens is that loop
+	 * with bn_mul_add_words is put against bn_mul_mont, and the
+	 * latter "wins" on short vectors. Correct solution should be
+	 * implementing dedicated NxN multiplication subroutines for
+	 * small N. But till it materializes, let's stick to generic
+	 * prime method...
+	 *						<appro>
+	 */
+	meth = EC_GFp_mont_method();
+#else
 	meth = EC_GFp_nist_method();
+#endif
 	
 	ret = EC_GROUP_new(meth);
 	if (ret == NULL)
@@ -122,7 +147,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM
 	return ret;
 	}
 
-
+#ifndef OPENSSL_NO_EC2M
 EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
 	{
 	const EC_METHOD *meth;
@@ -142,3 +167,4 @@ EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM
 
 	return ret;
 	}
+#endif
diff --git a/src/lib/libcrypto/ec/ec_err.c b/src/lib/libcrypto/ec/ec_err.c
index 84b4833371..0d19398731 100644
--- a/src/lib/libcrypto/ec/ec_err.c
+++ b/src/lib/libcrypto/ec/ec_err.c
@@ -1,6 +1,6 @@
 /* crypto/ec/ec_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2007 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
 
 static ERR_STRING_DATA EC_str_functs[]=
 	{
+{ERR_FUNC(EC_F_BN_TO_FELEM),	"BN_TO_FELEM"},
 {ERR_FUNC(EC_F_COMPUTE_WNAF),	"COMPUTE_WNAF"},
 {ERR_FUNC(EC_F_D2I_ECPARAMETERS),	"d2i_ECParameters"},
 {ERR_FUNC(EC_F_D2I_ECPKPARAMETERS),	"d2i_ECPKParameters"},
@@ -112,6 +113,15 @@ static ERR_STRING_DATA EC_str_functs[]=
 {ERR_FUNC(EC_F_EC_GFP_MONT_FIELD_SQR),	"ec_GFp_mont_field_sqr"},
 {ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE),	"ec_GFp_mont_group_set_curve"},
 {ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP),	"EC_GFP_MONT_GROUP_SET_CURVE_GFP"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE),	"ec_GFp_nistp224_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINTS_MUL),	"ec_GFp_nistp224_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES),	"ec_GFp_nistp224_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE),	"ec_GFp_nistp256_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINTS_MUL),	"ec_GFp_nistp256_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES),	"ec_GFp_nistp256_point_get_affine_coordinates"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE),	"ec_GFp_nistp521_group_set_curve"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINTS_MUL),	"ec_GFp_nistp521_points_mul"},
+{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES),	"ec_GFp_nistp521_point_get_affine_coordinates"},
 {ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_MUL),	"ec_GFp_nist_field_mul"},
 {ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_SQR),	"ec_GFp_nist_field_sqr"},
 {ERR_FUNC(EC_F_EC_GFP_NIST_GROUP_SET_CURVE),	"ec_GFp_nist_group_set_curve"},
@@ -154,6 +164,7 @@ static ERR_STRING_DATA EC_str_functs[]=
 {ERR_FUNC(EC_F_EC_KEY_NEW),	"EC_KEY_new"},
 {ERR_FUNC(EC_F_EC_KEY_PRINT),	"EC_KEY_print"},
 {ERR_FUNC(EC_F_EC_KEY_PRINT_FP),	"EC_KEY_print_fp"},
+{ERR_FUNC(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES),	"EC_KEY_set_public_key_affine_coordinates"},
 {ERR_FUNC(EC_F_EC_POINTS_MAKE_AFFINE),	"EC_POINTs_make_affine"},
 {ERR_FUNC(EC_F_EC_POINT_ADD),	"EC_POINT_add"},
 {ERR_FUNC(EC_F_EC_POINT_CMP),	"EC_POINT_cmp"},
@@ -184,6 +195,9 @@ static ERR_STRING_DATA EC_str_functs[]=
 {ERR_FUNC(EC_F_I2D_ECPKPARAMETERS),	"i2d_ECPKParameters"},
 {ERR_FUNC(EC_F_I2D_ECPRIVATEKEY),	"i2d_ECPrivateKey"},
 {ERR_FUNC(EC_F_I2O_ECPUBLICKEY),	"i2o_ECPublicKey"},
+{ERR_FUNC(EC_F_NISTP224_PRE_COMP_NEW),	"NISTP224_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP256_PRE_COMP_NEW),	"NISTP256_PRE_COMP_NEW"},
+{ERR_FUNC(EC_F_NISTP521_PRE_COMP_NEW),	"NISTP521_PRE_COMP_NEW"},
 {ERR_FUNC(EC_F_O2I_ECPUBLICKEY),	"o2i_ECPublicKey"},
 {ERR_FUNC(EC_F_OLD_EC_PRIV_DECODE),	"OLD_EC_PRIV_DECODE"},
 {ERR_FUNC(EC_F_PKEY_EC_CTRL),	"PKEY_EC_CTRL"},
@@ -199,12 +213,15 @@ static ERR_STRING_DATA EC_str_reasons[]=
 	{
 {ERR_REASON(EC_R_ASN1_ERROR)             ,"asn1 error"},
 {ERR_REASON(EC_R_ASN1_UNKNOWN_FIELD)     ,"asn1 unknown field"},
+{ERR_REASON(EC_R_BIGNUM_OUT_OF_RANGE)    ,"bignum out of range"},
 {ERR_REASON(EC_R_BUFFER_TOO_SMALL)       ,"buffer too small"},
+{ERR_REASON(EC_R_COORDINATES_OUT_OF_RANGE),"coordinates out of range"},
 {ERR_REASON(EC_R_D2I_ECPKPARAMETERS_FAILURE),"d2i ecpkparameters failure"},
 {ERR_REASON(EC_R_DECODE_ERROR)           ,"decode error"},
 {ERR_REASON(EC_R_DISCRIMINANT_IS_ZERO)   ,"discriminant is zero"},
 {ERR_REASON(EC_R_EC_GROUP_NEW_BY_NAME_FAILURE),"ec group new by name failure"},
 {ERR_REASON(EC_R_FIELD_TOO_LARGE)        ,"field too large"},
+{ERR_REASON(EC_R_GF2M_NOT_SUPPORTED)     ,"gf2m not supported"},
 {ERR_REASON(EC_R_GROUP2PKPARAMETERS_FAILURE),"group2pkparameters failure"},
 {ERR_REASON(EC_R_I2D_ECPKPARAMETERS_FAILURE),"i2d ecpkparameters failure"},
 {ERR_REASON(EC_R_INCOMPATIBLE_OBJECTS)   ,"incompatible objects"},
@@ -239,6 +256,7 @@ static ERR_STRING_DATA EC_str_reasons[]=
 {ERR_REASON(EC_R_UNKNOWN_GROUP)          ,"unknown group"},
 {ERR_REASON(EC_R_UNKNOWN_ORDER)          ,"unknown order"},
 {ERR_REASON(EC_R_UNSUPPORTED_FIELD)      ,"unsupported field"},
+{ERR_REASON(EC_R_WRONG_CURVE_PARAMETERS) ,"wrong curve parameters"},
 {ERR_REASON(EC_R_WRONG_ORDER)            ,"wrong order"},
 {0,NULL}
 	};
diff --git a/src/lib/libcrypto/ec/ec_key.c b/src/lib/libcrypto/ec/ec_key.c
index 522802c07a..bf9fd2dc2c 100644
--- a/src/lib/libcrypto/ec/ec_key.c
+++ b/src/lib/libcrypto/ec/ec_key.c
@@ -64,7 +64,9 @@
 #include <string.h>
 #include "ec_lcl.h"
 #include <openssl/err.h>
-#include <string.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 
 EC_KEY *EC_KEY_new(void)
 	{
@@ -78,6 +80,7 @@ EC_KEY *EC_KEY_new(void)
 		}
 
 	ret->version = 1;	
+	ret->flags = 0;
 	ret->group   = NULL;
 	ret->pub_key = NULL;
 	ret->priv_key= NULL;
@@ -197,6 +200,7 @@ EC_KEY *EC_KEY_copy(EC_KEY *dest, const EC_KEY *src)
 	dest->enc_flag  = src->enc_flag;
 	dest->conv_form = src->conv_form;
 	dest->version   = src->version;
+	dest->flags = src->flags;
 
 	return dest;
 	}
@@ -237,6 +241,11 @@ int EC_KEY_generate_key(EC_KEY *eckey)
 	BIGNUM	*priv_key = NULL, *order = NULL;
 	EC_POINT *pub_key = NULL;
 
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		return FIPS_ec_key_generate_key(eckey);
+#endif
+
 	if (!eckey || !eckey->group)
 		{
 		ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER);
@@ -371,6 +380,82 @@ err:
 	return(ok);
 	}
 
+int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y)
+	{
+	BN_CTX *ctx = NULL;
+	BIGNUM *tx, *ty;
+	EC_POINT *point = NULL;
+	int ok = 0, tmp_nid, is_char_two = 0;
+
+	if (!key || !key->group || !x || !y)
+		{
+		ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+						ERR_R_PASSED_NULL_PARAMETER);
+		return 0;
+		}
+	ctx = BN_CTX_new();
+	if (!ctx)
+		goto err;
+
+	point = EC_POINT_new(key->group);
+
+	if (!point)
+		goto err;
+
+	tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group));
+
+        if (tmp_nid == NID_X9_62_characteristic_two_field)
+		is_char_two = 1;
+
+	tx = BN_CTX_get(ctx);
+	ty = BN_CTX_get(ctx);
+#ifndef OPENSSL_NO_EC2M
+	if (is_char_two)
+		{
+		if (!EC_POINT_set_affine_coordinates_GF2m(key->group, point,
+								x, y, ctx))
+			goto err;
+		if (!EC_POINT_get_affine_coordinates_GF2m(key->group, point,
+								tx, ty, ctx))
+			goto err;
+		}
+	else
+#endif
+		{
+		if (!EC_POINT_set_affine_coordinates_GFp(key->group, point,
+								x, y, ctx))
+			goto err;
+		if (!EC_POINT_get_affine_coordinates_GFp(key->group, point,
+								tx, ty, ctx))
+			goto err;
+		}
+	/* Check if retrieved coordinates match originals: if not values
+	 * are out of range.
+	 */
+	if (BN_cmp(x, tx) || BN_cmp(y, ty))
+		{
+		ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES,
+			EC_R_COORDINATES_OUT_OF_RANGE);
+		goto err;
+		}
+
+	if (!EC_KEY_set_public_key(key, point))
+		goto err;
+
+	if (EC_KEY_check_key(key) == 0)
+		goto err;
+
+	ok = 1;
+
+	err:
+	if (ctx)
+		BN_CTX_free(ctx);
+	if (point)
+		EC_POINT_free(point);
+	return ok;
+
+	}
+
 const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key)
 	{
 	return key->group;
@@ -461,3 +546,18 @@ int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx)
 		return 0;
 	return EC_GROUP_precompute_mult(key->group, ctx);
 	}
+
+int EC_KEY_get_flags(const EC_KEY *key)
+	{
+	return key->flags;
+	}
+
+void EC_KEY_set_flags(EC_KEY *key, int flags)
+	{
+	key->flags |= flags;
+	}
+
+void EC_KEY_clear_flags(EC_KEY *key, int flags)
+	{
+	key->flags &= ~flags;
+	}
diff --git a/src/lib/libcrypto/ec/ec_lcl.h b/src/lib/libcrypto/ec/ec_lcl.h
index 3e2c34b0bc..da7967df38 100644
--- a/src/lib/libcrypto/ec/ec_lcl.h
+++ b/src/lib/libcrypto/ec/ec_lcl.h
@@ -3,7 +3,7 @@
  * Originally written by Bodo Moeller for the OpenSSL project.
  */
 /* ====================================================================
- * Copyright (c) 1998-2003 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1998-2010 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -82,10 +82,15 @@
 # endif
 #endif
 
+/* Use default functions for poin2oct, oct2point and compressed coordinates */
+#define EC_FLAGS_DEFAULT_OCT	0x1
+
 /* Structure details are not part of the exported interface,
  * so all this may change in future versions. */
 
 struct ec_method_st {
+	/* Various method flags */
+	int flags;
 	/* used by EC_METHOD_get_field_type: */
 	int field_type; /* a NID */
 
@@ -244,6 +249,7 @@ struct ec_key_st {
 	point_conversion_form_t conv_form;
 
 	int 	references;
+	int	flags;
 
 	EC_EXTRA_DATA *method_data;
 } /* EC_KEY */;
@@ -391,3 +397,50 @@ int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar,
 	size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
 int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
 int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ec2_mult.c */
+int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar,
+	size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GF2m_have_precompute_mult(const EC_GROUP *group);
+
+#ifndef OPENSSL_EC_NISTP_64_GCC_128
+/* method functions in ecp_nistp224.c */
+int ec_GFp_nistp224_group_init(EC_GROUP *group);
+int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp224_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp256.c */
+int ec_GFp_nistp256_group_init(EC_GROUP *group);
+int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp256_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group);
+
+/* method functions in ecp_nistp521.c */
+int ec_GFp_nistp521_group_init(EC_GROUP *group);
+int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *);
+int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx);
+int ec_GFp_nistp521_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *);
+int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx);
+int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx);
+int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group);
+
+/* utility functions in ecp_nistputil.c */
+void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
+	size_t felem_size, void *tmp_felems,
+	void (*felem_one)(void *out),
+	int (*felem_is_zero)(const void *in),
+	void (*felem_assign)(void *out, const void *in),
+	void (*felem_square)(void *out, const void *in),
+	void (*felem_mul)(void *out, const void *in1, const void *in2),
+	void (*felem_inv)(void *out, const void *in),
+	void (*felem_contract)(void *out, const void *in));
+void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in);
+#endif
diff --git a/src/lib/libcrypto/ec/ec_lib.c b/src/lib/libcrypto/ec/ec_lib.c
index dd7da0fcf9..25247b5803 100644
--- a/src/lib/libcrypto/ec/ec_lib.c
+++ b/src/lib/libcrypto/ec/ec_lib.c
@@ -425,7 +425,7 @@ int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *
 	return group->meth->group_get_curve(group, p, a, b, ctx);
 	}
 
-
+#ifndef OPENSSL_NO_EC2M
 int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
 	{
 	if (group->meth->group_set_curve == 0)
@@ -446,7 +446,7 @@ int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM
 		}
 	return group->meth->group_get_curve(group, p, a, b, ctx);
 	}
-
+#endif
 
 int EC_GROUP_get_degree(const EC_GROUP *group)
 	{
@@ -856,7 +856,7 @@ int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
 	return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
 	}
 
-
+#ifndef OPENSSL_NO_EC2M
 int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
 	const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx)
 	{
@@ -872,7 +872,7 @@ int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
 		}
 	return group->meth->point_set_affine_coordinates(group, point, x, y, ctx);
 	}
-
+#endif
 
 int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *point,
 	BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
@@ -890,7 +890,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *p
 	return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
 	}
 
-
+#ifndef OPENSSL_NO_EC2M
 int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *point,
 	BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
 	{
@@ -906,75 +906,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *
 		}
 	return group->meth->point_get_affine_coordinates(group, point, x, y, ctx);
 	}
-
-
-int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
-	const BIGNUM *x, int y_bit, BN_CTX *ctx)
-	{
-	if (group->meth->point_set_compressed_coordinates == 0)
-		{
-		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-		return 0;
-		}
-	if (group->meth != point->meth)
-		{
-		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
-		return 0;
-		}
-	return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
-	}
-
-
-int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
-	const BIGNUM *x, int y_bit, BN_CTX *ctx)
-	{
-	if (group->meth->point_set_compressed_coordinates == 0)
-		{
-		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-		return 0;
-		}
-	if (group->meth != point->meth)
-		{
-		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
-		return 0;
-		}
-	return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
-	}
-
-
-size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
-        unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	if (group->meth->point2oct == 0)
-		{
-		ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-		return 0;
-		}
-	if (group->meth != point->meth)
-		{
-		ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
-		return 0;
-		}
-	return group->meth->point2oct(group, point, form, buf, len, ctx);
-	}
-
-
-int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
-        const unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	if (group->meth->oct2point == 0)
-		{
-		ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
-		return 0;
-		}
-	if (group->meth != point->meth)
-		{
-		ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
-		return 0;
-		}
-	return group->meth->oct2point(group, point, buf, len, ctx);
-	}
-
+#endif
 
 int EC_POINT_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
 	{
diff --git a/src/lib/libcrypto/ec/ec_oct.c b/src/lib/libcrypto/ec/ec_oct.c
new file mode 100644
index 0000000000..fd9db0798d
--- /dev/null
+++ b/src/lib/libcrypto/ec/ec_oct.c
@@ -0,0 +1,199 @@
+/* crypto/ec/ec_lib.c */
+/*
+ * Originally written by Bodo Moeller for the OpenSSL project.
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2003 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Binary polynomial ECC support in OpenSSL originally developed by 
+ * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project.
+ */
+
+#include <string.h>
+
+#include <openssl/err.h>
+#include <openssl/opensslv.h>
+
+#include "ec_lcl.h"
+
+int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point,
+	const BIGNUM *x, int y_bit, BN_CTX *ctx)
+	{
+	if (group->meth->point_set_compressed_coordinates == 0
+		&& !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+		{
+		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return 0;
+		}
+	if (group->meth != point->meth)
+		{
+		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS);
+		return 0;
+		}
+	if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+		{
+		if (group->meth->field_type == NID_X9_62_prime_field)
+			return ec_GFp_simple_set_compressed_coordinates(
+					group, point, x, y_bit, ctx);
+		else
+#ifdef OPENSSL_NO_EC2M
+			{
+			ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_GF2M_NOT_SUPPORTED);
+			return 0;
+			}
+#else
+			return ec_GF2m_simple_set_compressed_coordinates(
+					group, point, x, y_bit, ctx);
+#endif
+		}
+	return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+	}
+
+#ifndef OPENSSL_NO_EC2M
+int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point,
+	const BIGNUM *x, int y_bit, BN_CTX *ctx)
+	{
+	if (group->meth->point_set_compressed_coordinates == 0
+		&& !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+		{
+		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return 0;
+		}
+	if (group->meth != point->meth)
+		{
+		ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS);
+		return 0;
+		}
+	if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+		{
+		if (group->meth->field_type == NID_X9_62_prime_field)
+			return ec_GFp_simple_set_compressed_coordinates(
+					group, point, x, y_bit, ctx);
+		else
+			return ec_GF2m_simple_set_compressed_coordinates(
+					group, point, x, y_bit, ctx);
+		}
+	return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx);
+	}
+#endif
+
+size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+        unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	if (group->meth->point2oct == 0
+		&& !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+		{
+		ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return 0;
+		}
+	if (group->meth != point->meth)
+		{
+		ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS);
+		return 0;
+		}
+	if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+		{
+		if (group->meth->field_type == NID_X9_62_prime_field)
+			return ec_GFp_simple_point2oct(group, point,
+							form, buf, len, ctx);
+		else
+#ifdef OPENSSL_NO_EC2M
+			{
+			ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_GF2M_NOT_SUPPORTED);
+			return 0;
+			}
+#else
+			return ec_GF2m_simple_point2oct(group, point,
+							form, buf, len, ctx);
+#endif
+		}
+			
+	return group->meth->point2oct(group, point, form, buf, len, ctx);
+	}
+
+
+int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point,
+        const unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	if (group->meth->oct2point == 0
+		&& !(group->meth->flags & EC_FLAGS_DEFAULT_OCT))
+		{
+		ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+		return 0;
+		}
+	if (group->meth != point->meth)
+		{
+		ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS);
+		return 0;
+		}
+	if(group->meth->flags & EC_FLAGS_DEFAULT_OCT)
+		{
+		if (group->meth->field_type == NID_X9_62_prime_field)
+			return ec_GFp_simple_oct2point(group, point,
+							buf, len, ctx);
+		else
+#ifdef OPENSSL_NO_EC2M
+			{
+			ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_GF2M_NOT_SUPPORTED);
+			return 0;
+			}
+#else
+			return ec_GF2m_simple_oct2point(group, point,
+							buf, len, ctx);
+#endif
+		}
+	return group->meth->oct2point(group, point, buf, len, ctx);
+	}
+
diff --git a/src/lib/libcrypto/ec/ec_pmeth.c b/src/lib/libcrypto/ec/ec_pmeth.c
index f433076ca1..d1ed66c37e 100644
--- a/src/lib/libcrypto/ec/ec_pmeth.c
+++ b/src/lib/libcrypto/ec/ec_pmeth.c
@@ -221,6 +221,7 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
 
 		case EVP_PKEY_CTRL_MD:
 		if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 &&
+		    EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 &&
 		    EVP_MD_type((const EVP_MD *)p2) != NID_sha224 &&
 		    EVP_MD_type((const EVP_MD *)p2) != NID_sha256 &&
 		    EVP_MD_type((const EVP_MD *)p2) != NID_sha384 &&
diff --git a/src/lib/libcrypto/ec/eck_prn.c b/src/lib/libcrypto/ec/eck_prn.c
index 7d3e175ae7..06de8f3959 100644
--- a/src/lib/libcrypto/ec/eck_prn.c
+++ b/src/lib/libcrypto/ec/eck_prn.c
@@ -207,7 +207,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
 			reason = ERR_R_MALLOC_FAILURE;
 			goto err;
 			}
-
+#ifndef OPENSSL_NO_EC2M
 		if (is_char_two)
 			{
 			if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx))
@@ -217,6 +217,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off)
 				}
 			}
 		else /* prime field */
+#endif
 			{
 			if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx))
 				{
diff --git a/src/lib/libcrypto/ec/ecp_mont.c b/src/lib/libcrypto/ec/ecp_mont.c
index 9fc4a466a5..079e47431b 100644
--- a/src/lib/libcrypto/ec/ecp_mont.c
+++ b/src/lib/libcrypto/ec/ecp_mont.c
@@ -63,12 +63,20 @@
 
 #include <openssl/err.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 #include "ec_lcl.h"
 
 
 const EC_METHOD *EC_GFp_mont_method(void)
 	{
+#ifdef OPENSSL_FIPS
+	return fips_ec_gfp_mont_method();
+#else
 	static const EC_METHOD ret = {
+		EC_FLAGS_DEFAULT_OCT,
 		NID_X9_62_prime_field,
 		ec_GFp_mont_group_init,
 		ec_GFp_mont_group_finish,
@@ -87,9 +95,7 @@ const EC_METHOD *EC_GFp_mont_method(void)
 		ec_GFp_simple_get_Jprojective_coordinates_GFp,
 		ec_GFp_simple_point_set_affine_coordinates,
 		ec_GFp_simple_point_get_affine_coordinates,
-		ec_GFp_simple_set_compressed_coordinates,
-		ec_GFp_simple_point2oct,
-		ec_GFp_simple_oct2point,
+		0,0,0,
 		ec_GFp_simple_add,
 		ec_GFp_simple_dbl,
 		ec_GFp_simple_invert,
@@ -108,7 +114,9 @@ const EC_METHOD *EC_GFp_mont_method(void)
 		ec_GFp_mont_field_decode,
 		ec_GFp_mont_field_set_to_one };
 
+
 	return &ret;
+#endif
 	}
 
 
diff --git a/src/lib/libcrypto/ec/ecp_nist.c b/src/lib/libcrypto/ec/ecp_nist.c
index 2a5682ea41..aad2d5f443 100644
--- a/src/lib/libcrypto/ec/ecp_nist.c
+++ b/src/lib/libcrypto/ec/ecp_nist.c
@@ -67,9 +67,17 @@
 #include <openssl/obj_mac.h>
 #include "ec_lcl.h"
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 const EC_METHOD *EC_GFp_nist_method(void)
 	{
+#ifdef OPENSSL_FIPS
+	return fips_ec_gfp_nist_method();
+#else
 	static const EC_METHOD ret = {
+		EC_FLAGS_DEFAULT_OCT,
 		NID_X9_62_prime_field,
 		ec_GFp_simple_group_init,
 		ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_nist_method(void)
 		ec_GFp_simple_get_Jprojective_coordinates_GFp,
 		ec_GFp_simple_point_set_affine_coordinates,
 		ec_GFp_simple_point_get_affine_coordinates,
-		ec_GFp_simple_set_compressed_coordinates,
-		ec_GFp_simple_point2oct,
-		ec_GFp_simple_oct2point,
+		0,0,0,
 		ec_GFp_simple_add,
 		ec_GFp_simple_dbl,
 		ec_GFp_simple_invert,
@@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_nist_method(void)
 		0 /* field_set_to_one */ };
 
 	return &ret;
+#endif
 	}
 
 int ec_GFp_nist_group_copy(EC_GROUP *dest, const EC_GROUP *src)
diff --git a/src/lib/libcrypto/ec/ecp_nistp224.c b/src/lib/libcrypto/ec/ecp_nistp224.c
new file mode 100644
index 0000000000..b5ff56c252
--- /dev/null
+++ b/src/lib/libcrypto/ec/ecp_nistp224.c
@@ -0,0 +1,1658 @@
+/* crypto/ec/ecp_nistp224.c */
+/*
+ * Written by Emilia Kasper (Google) for the OpenSSL project.
+ */
+/* Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ *
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication
+ *
+ * Inspired by Daniel J. Bernstein's public domain nistp224 implementation
+ * and Adam Langley's public domain 64-bit C implementation of curve25519
+ */
+
+#include <openssl/opensslconf.h>
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+
+#ifndef OPENSSL_SYS_VMS
+#include <stdint.h>
+#else
+#include <inttypes.h>
+#endif
+
+#include <string.h>
+#include <openssl/err.h>
+#include "ec_lcl.h"
+
+#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+  /* even with gcc, the typedef won't work for 32-bit platforms */
+  typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
+#else
+  #error "Need GCC 3.1 or later to define type uint128_t"
+#endif
+
+typedef uint8_t u8;
+typedef uint64_t u64;
+typedef int64_t s64;
+
+
+/******************************************************************************/
+/*		    INTERNAL REPRESENTATION OF FIELD ELEMENTS
+ *
+ * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3
+ * using 64-bit coefficients called 'limbs',
+ * and sometimes (for multiplication results) as
+ * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6
+ * using 128-bit coefficients called 'widelimbs'.
+ * A 4-limb representation is an 'felem';
+ * a 7-widelimb representation is a 'widefelem'.
+ * Even within felems, bits of adjacent limbs overlap, and we don't always
+ * reduce the representations: we ensure that inputs to each felem
+ * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60,
+ * and fit into a 128-bit word without overflow. The coefficients are then
+ * again partially reduced to obtain an felem satisfying a_i < 2^57.
+ * We only reduce to the unique minimal representation at the end of the
+ * computation.
+ */
+
+typedef uint64_t limb;
+typedef uint128_t widelimb;
+
+typedef limb felem[4];
+typedef widelimb widefelem[7];
+
+/* Field element represented as a byte arrary.
+ * 28*8 = 224 bits is also the group order size for the elliptic curve,
+ * and we also use this type for scalars for point multiplication.
+  */
+typedef u8 felem_bytearray[28];
+
+static const felem_bytearray nistp224_curve_params[5] = {
+	{0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,    /* p */
+	 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,
+	 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01},
+	{0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,    /* a */
+	 0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFF,0xFF,
+	 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE},
+	{0xB4,0x05,0x0A,0x85,0x0C,0x04,0xB3,0xAB,0xF5,0x41,    /* b */
+	 0x32,0x56,0x50,0x44,0xB0,0xB7,0xD7,0xBF,0xD8,0xBA,
+	 0x27,0x0B,0x39,0x43,0x23,0x55,0xFF,0xB4},
+	{0xB7,0x0E,0x0C,0xBD,0x6B,0xB4,0xBF,0x7F,0x32,0x13,    /* x */
+	 0x90,0xB9,0x4A,0x03,0xC1,0xD3,0x56,0xC2,0x11,0x22,
+	 0x34,0x32,0x80,0xD6,0x11,0x5C,0x1D,0x21},
+	{0xbd,0x37,0x63,0x88,0xb5,0xf7,0x23,0xfb,0x4c,0x22,    /* y */
+	 0xdf,0xe6,0xcd,0x43,0x75,0xa0,0x5a,0x07,0x47,0x64,
+	 0x44,0xd5,0x81,0x99,0x85,0x00,0x7e,0x34}
+};
+
+/* Precomputed multiples of the standard generator
+ * Points are given in coordinates (X, Y, Z) where Z normally is 1
+ * (0 for the point at infinity).
+ * For each field element, slice a_0 is word 0, etc.
+ *
+ * The table has 2 * 16 elements, starting with the following:
+ * index | bits    | point
+ * ------+---------+------------------------------
+ *     0 | 0 0 0 0 | 0G
+ *     1 | 0 0 0 1 | 1G
+ *     2 | 0 0 1 0 | 2^56G
+ *     3 | 0 0 1 1 | (2^56 + 1)G
+ *     4 | 0 1 0 0 | 2^112G
+ *     5 | 0 1 0 1 | (2^112 + 1)G
+ *     6 | 0 1 1 0 | (2^112 + 2^56)G
+ *     7 | 0 1 1 1 | (2^112 + 2^56 + 1)G
+ *     8 | 1 0 0 0 | 2^168G
+ *     9 | 1 0 0 1 | (2^168 + 1)G
+ *    10 | 1 0 1 0 | (2^168 + 2^56)G
+ *    11 | 1 0 1 1 | (2^168 + 2^56 + 1)G
+ *    12 | 1 1 0 0 | (2^168 + 2^112)G
+ *    13 | 1 1 0 1 | (2^168 + 2^112 + 1)G
+ *    14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G
+ *    15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G
+ * followed by a copy of this with each element multiplied by 2^28.
+ *
+ * The reason for this is so that we can clock bits into four different
+ * locations when doing simple scalar multiplies against the base point,
+ * and then another four locations using the second 16 elements.
+ */
+static const felem gmul[2][16][3] =
+{{{{0, 0, 0, 0},
+   {0, 0, 0, 0},
+   {0, 0, 0, 0}},
+  {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf},
+   {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723},
+   {1, 0, 0, 0}},
+  {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5},
+   {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321},
+   {1, 0, 0, 0}},
+  {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748},
+   {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17},
+   {1, 0, 0, 0}},
+  {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe},
+   {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b},
+   {1, 0, 0, 0}},
+  {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3},
+   {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a},
+   {1, 0, 0, 0}},
+  {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c},
+   {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244},
+   {1, 0, 0, 0}},
+  {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849},
+   {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112},
+   {1, 0, 0, 0}},
+  {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47},
+   {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394},
+   {1, 0, 0, 0}},
+  {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d},
+   {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7},
+   {1, 0, 0, 0}},
+  {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24},
+   {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881},
+   {1, 0, 0, 0}},
+  {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984},
+   {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369},
+   {1, 0, 0, 0}},
+  {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3},
+   {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60},
+   {1, 0, 0, 0}},
+  {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057},
+   {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9},
+   {1, 0, 0, 0}},
+  {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9},
+   {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc},
+   {1, 0, 0, 0}},
+  {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58},
+   {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558},
+   {1, 0, 0, 0}}},
+ {{{0, 0, 0, 0},
+   {0, 0, 0, 0},
+   {0, 0, 0, 0}},
+  {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31},
+   {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d},
+   {1, 0, 0, 0}},
+  {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3},
+   {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a},
+   {1, 0, 0, 0}},
+  {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33},
+   {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100},
+   {1, 0, 0, 0}},
+  {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5},
+   {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea},
+   {1, 0, 0, 0}},
+  {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be},
+   {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51},
+   {1, 0, 0, 0}},
+  {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1},
+   {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb},
+   {1, 0, 0, 0}},
+  {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233},
+   {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def},
+   {1, 0, 0, 0}},
+  {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae},
+   {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45},
+   {1, 0, 0, 0}},
+  {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e},
+   {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb},
+   {1, 0, 0, 0}},
+  {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de},
+   {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3},
+   {1, 0, 0, 0}},
+  {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05},
+   {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58},
+   {1, 0, 0, 0}},
+  {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb},
+   {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0},
+   {1, 0, 0, 0}},
+  {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9},
+   {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea},
+   {1, 0, 0, 0}},
+  {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba},
+   {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405},
+   {1, 0, 0, 0}},
+  {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e},
+   {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e},
+   {1, 0, 0, 0}}}};
+
+/* Precomputation for the group generator. */
+typedef struct {
+	felem g_pre_comp[2][16][3];
+	int references;
+} NISTP224_PRE_COMP;
+
+const EC_METHOD *EC_GFp_nistp224_method(void)
+	{
+	static const EC_METHOD ret = {
+		EC_FLAGS_DEFAULT_OCT,
+		NID_X9_62_prime_field,
+		ec_GFp_nistp224_group_init,
+		ec_GFp_simple_group_finish,
+		ec_GFp_simple_group_clear_finish,
+		ec_GFp_nist_group_copy,
+		ec_GFp_nistp224_group_set_curve,
+		ec_GFp_simple_group_get_curve,
+		ec_GFp_simple_group_get_degree,
+		ec_GFp_simple_group_check_discriminant,
+		ec_GFp_simple_point_init,
+		ec_GFp_simple_point_finish,
+		ec_GFp_simple_point_clear_finish,
+		ec_GFp_simple_point_copy,
+		ec_GFp_simple_point_set_to_infinity,
+		ec_GFp_simple_set_Jprojective_coordinates_GFp,
+		ec_GFp_simple_get_Jprojective_coordinates_GFp,
+		ec_GFp_simple_point_set_affine_coordinates,
+		ec_GFp_nistp224_point_get_affine_coordinates,
+		0 /* point_set_compressed_coordinates */,
+		0 /* point2oct */,
+		0 /* oct2point */,
+		ec_GFp_simple_add,
+		ec_GFp_simple_dbl,
+		ec_GFp_simple_invert,
+		ec_GFp_simple_is_at_infinity,
+		ec_GFp_simple_is_on_curve,
+		ec_GFp_simple_cmp,
+		ec_GFp_simple_make_affine,
+		ec_GFp_simple_points_make_affine,
+		ec_GFp_nistp224_points_mul,
+		ec_GFp_nistp224_precompute_mult,
+		ec_GFp_nistp224_have_precompute_mult,
+		ec_GFp_nist_field_mul,
+		ec_GFp_nist_field_sqr,
+		0 /* field_div */,
+		0 /* field_encode */,
+		0 /* field_decode */,
+		0 /* field_set_to_one */ };
+
+	return &ret;
+	}
+
+/* Helper functions to convert field elements to/from internal representation */
+static void bin28_to_felem(felem out, const u8 in[28])
+	{
+	out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff;
+	out[1] = (*((const uint64_t *)(in+7))) & 0x00ffffffffffffff;
+	out[2] = (*((const uint64_t *)(in+14))) & 0x00ffffffffffffff;
+	out[3] = (*((const uint64_t *)(in+21))) & 0x00ffffffffffffff;
+	}
+
+static void felem_to_bin28(u8 out[28], const felem in)
+	{
+	unsigned i;
+	for (i = 0; i < 7; ++i)
+		{
+		out[i]	  = in[0]>>(8*i);
+		out[i+7]  = in[1]>>(8*i);
+		out[i+14] = in[2]>>(8*i);
+		out[i+21] = in[3]>>(8*i);
+		}
+	}
+
+/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
+static void flip_endian(u8 *out, const u8 *in, unsigned len)
+	{
+	unsigned i;
+	for (i = 0; i < len; ++i)
+		out[i] = in[len-1-i];
+	}
+
+/* From OpenSSL BIGNUM to internal representation */
+static int BN_to_felem(felem out, const BIGNUM *bn)
+	{
+	felem_bytearray b_in;
+	felem_bytearray b_out;
+	unsigned num_bytes;
+
+	/* BN_bn2bin eats leading zeroes */
+	memset(b_out, 0, sizeof b_out);
+	num_bytes = BN_num_bytes(bn);
+	if (num_bytes > sizeof b_out)
+		{
+		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
+		return 0;
+		}
+	if (BN_is_negative(bn))
+		{
+		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
+		return 0;
+		}
+	num_bytes = BN_bn2bin(bn, b_in);
+	flip_endian(b_out, b_in, num_bytes);
+	bin28_to_felem(out, b_out);
+	return 1;
+	}
+
+/* From internal representation to OpenSSL BIGNUM */
+static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
+	{
+	felem_bytearray b_in, b_out;
+	felem_to_bin28(b_in, in);
+	flip_endian(b_out, b_in, sizeof b_out);
+	return BN_bin2bn(b_out, sizeof b_out, out);
+	}
+
+/******************************************************************************/
+/*				FIELD OPERATIONS
+ *
+ * Field operations, using the internal representation of field elements.
+ * NB! These operations are specific to our point multiplication and cannot be
+ * expected to be correct in general - e.g., multiplication with a large scalar
+ * will cause an overflow.
+ *
+ */
+
+static void felem_one(felem out)
+	{
+	out[0] = 1;
+	out[1] = 0;
+	out[2] = 0;
+	out[3] = 0;
+	}
+
+static void felem_assign(felem out, const felem in)
+	{
+	out[0] = in[0];
+	out[1] = in[1];
+	out[2] = in[2];
+	out[3] = in[3];
+	}
+
+/* Sum two field elements: out += in */
+static void felem_sum(felem out, const felem in)
+	{
+	out[0] += in[0];
+	out[1] += in[1];
+	out[2] += in[2];
+	out[3] += in[3];
+	}
+
+/* Get negative value: out = -in */
+/* Assumes in[i] < 2^57 */
+static void felem_neg(felem out, const felem in)
+	{
+	static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
+	static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
+	static const limb two58m42m2 = (((limb) 1) << 58) -
+	    (((limb) 1) << 42) - (((limb) 1) << 2);
+
+	/* Set to 0 mod 2^224-2^96+1 to ensure out > in */
+	out[0] = two58p2 - in[0];
+	out[1] = two58m42m2 - in[1];
+	out[2] = two58m2 - in[2];
+	out[3] = two58m2 - in[3];
+	}
+
+/* Subtract field elements: out -= in */
+/* Assumes in[i] < 2^57 */
+static void felem_diff(felem out, const felem in)
+	{
+	static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2);
+	static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2);
+	static const limb two58m42m2 = (((limb) 1) << 58) -
+	    (((limb) 1) << 42) - (((limb) 1) << 2);
+
+	/* Add 0 mod 2^224-2^96+1 to ensure out > in */
+	out[0] += two58p2;
+	out[1] += two58m42m2;
+	out[2] += two58m2;
+	out[3] += two58m2;
+
+	out[0] -= in[0];
+	out[1] -= in[1];
+	out[2] -= in[2];
+	out[3] -= in[3];
+	}
+
+/* Subtract in unreduced 128-bit mode: out -= in */
+/* Assumes in[i] < 2^119 */
+static void widefelem_diff(widefelem out, const widefelem in)
+	{
+	static const widelimb two120 = ((widelimb) 1) << 120;
+	static const widelimb two120m64 = (((widelimb) 1) << 120) -
+		(((widelimb) 1) << 64);
+	static const widelimb two120m104m64 = (((widelimb) 1) << 120) -
+		(((widelimb) 1) << 104) - (((widelimb) 1) << 64);
+
+	/* Add 0 mod 2^224-2^96+1 to ensure out > in */
+	out[0] += two120;
+	out[1] += two120m64;
+	out[2] += two120m64;
+	out[3] += two120;
+	out[4] += two120m104m64;
+	out[5] += two120m64;
+	out[6] += two120m64;
+
+	out[0] -= in[0];
+	out[1] -= in[1];
+	out[2] -= in[2];
+	out[3] -= in[3];
+	out[4] -= in[4];
+	out[5] -= in[5];
+	out[6] -= in[6];
+	}
+
+/* Subtract in mixed mode: out128 -= in64 */
+/* in[i] < 2^63 */
+static void felem_diff_128_64(widefelem out, const felem in)
+	{
+	static const widelimb two64p8 = (((widelimb) 1) << 64) +
+		(((widelimb) 1) << 8);
+	static const widelimb two64m8 = (((widelimb) 1) << 64) -
+		(((widelimb) 1) << 8);
+	static const widelimb two64m48m8 = (((widelimb) 1) << 64) -
+		(((widelimb) 1) << 48) - (((widelimb) 1) << 8);
+
+	/* Add 0 mod 2^224-2^96+1 to ensure out > in */
+	out[0] += two64p8;
+	out[1] += two64m48m8;
+	out[2] += two64m8;
+	out[3] += two64m8;
+
+	out[0] -= in[0];
+	out[1] -= in[1];
+	out[2] -= in[2];
+	out[3] -= in[3];
+	}
+
+/* Multiply a field element by a scalar: out = out * scalar
+ * The scalars we actually use are small, so results fit without overflow */
+static void felem_scalar(felem out, const limb scalar)
+	{
+	out[0] *= scalar;
+	out[1] *= scalar;
+	out[2] *= scalar;
+	out[3] *= scalar;
+	}
+
+/* Multiply an unreduced field element by a scalar: out = out * scalar
+ * The scalars we actually use are small, so results fit without overflow */
+static void widefelem_scalar(widefelem out, const widelimb scalar)
+	{
+	out[0] *= scalar;
+	out[1] *= scalar;
+	out[2] *= scalar;
+	out[3] *= scalar;
+	out[4] *= scalar;
+	out[5] *= scalar;
+	out[6] *= scalar;
+	}
+
+/* Square a field element: out = in^2 */
+static void felem_square(widefelem out, const felem in)
+	{
+	limb tmp0, tmp1, tmp2;
+	tmp0 = 2 * in[0]; tmp1 = 2 * in[1]; tmp2 = 2 * in[2];
+	out[0] = ((widelimb) in[0]) * in[0];
+	out[1] = ((widelimb) in[0]) * tmp1;
+	out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1];
+	out[3] = ((widelimb) in[3]) * tmp0 +
+		((widelimb) in[1]) * tmp2;
+	out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2];
+	out[5] = ((widelimb) in[3]) * tmp2;
+	out[6] = ((widelimb) in[3]) * in[3];
+	}
+
+/* Multiply two field elements: out = in1 * in2 */
+static void felem_mul(widefelem out, const felem in1, const felem in2)
+	{
+	out[0] = ((widelimb) in1[0]) * in2[0];
+	out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0];
+	out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] +
+		((widelimb) in1[2]) * in2[0];
+	out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] +
+		((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0];
+	out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] +
+		((widelimb) in1[3]) * in2[1];
+	out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2];
+	out[6] = ((widelimb) in1[3]) * in2[3];
+	}
+
+/* Reduce seven 128-bit coefficients to four 64-bit coefficients.
+ * Requires in[i] < 2^126,
+ * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */
+static void felem_reduce(felem out, const widefelem in)
+	{
+	static const widelimb two127p15 = (((widelimb) 1) << 127) +
+		(((widelimb) 1) << 15);
+	static const widelimb two127m71 = (((widelimb) 1) << 127) -
+		(((widelimb) 1) << 71);
+	static const widelimb two127m71m55 = (((widelimb) 1) << 127) -
+		(((widelimb) 1) << 71) - (((widelimb) 1) << 55);
+	widelimb output[5];
+
+	/* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */
+	output[0] = in[0] + two127p15;
+	output[1] = in[1] + two127m71m55;
+	output[2] = in[2] + two127m71;
+	output[3] = in[3];
+	output[4] = in[4];
+
+	/* Eliminate in[4], in[5], in[6] */
+	output[4] += in[6] >> 16;
+	output[3] += (in[6] & 0xffff) << 40;
+	output[2] -= in[6];
+
+	output[3] += in[5] >> 16;
+	output[2] += (in[5] & 0xffff) << 40;
+	output[1] -= in[5];
+
+	output[2] += output[4] >> 16;
+	output[1] += (output[4] & 0xffff) << 40;
+	output[0] -= output[4];
+
+	/* Carry 2 -> 3 -> 4 */
+	output[3] += output[2] >> 56;
+	output[2] &= 0x00ffffffffffffff;
+
+	output[4] = output[3] >> 56;
+	output[3] &= 0x00ffffffffffffff;
+
+	/* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */
+
+	/* Eliminate output[4] */
+	output[2] += output[4] >> 16;
+	/* output[2] < 2^56 + 2^56 = 2^57 */
+	output[1] += (output[4] & 0xffff) << 40;
+	output[0] -= output[4];
+
+	/* Carry 0 -> 1 -> 2 -> 3 */
+	output[1] += output[0] >> 56;
+	out[0] = output[0] & 0x00ffffffffffffff;
+
+	output[2] += output[1] >> 56;
+	/* output[2] < 2^57 + 2^72 */
+	out[1] = output[1] & 0x00ffffffffffffff;
+	output[3] += output[2] >> 56;
+	/* output[3] <= 2^56 + 2^16 */
+	out[2] = output[2] & 0x00ffffffffffffff;
+
+	/* out[0] < 2^56, out[1] < 2^56, out[2] < 2^56,
+	 * out[3] <= 2^56 + 2^16 (due to final carry),
+	 * so out < 2*p */
+	out[3] = output[3];
+	}
+
+static void felem_square_reduce(felem out, const felem in)
+	{
+	widefelem tmp;
+	felem_square(tmp, in);
+	felem_reduce(out, tmp);
+	}
+
+static void felem_mul_reduce(felem out, const felem in1, const felem in2)
+	{
+	widefelem tmp;
+	felem_mul(tmp, in1, in2);
+	felem_reduce(out, tmp);
+	}
+
+/* Reduce to unique minimal representation.
+ * Requires 0 <= in < 2*p (always call felem_reduce first) */
+static void felem_contract(felem out, const felem in)
+	{
+	static const int64_t two56 = ((limb) 1) << 56;
+	/* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */
+	/* if in > p , reduce in = in - 2^224 + 2^96 - 1 */
+	int64_t tmp[4], a;
+	tmp[0] = in[0];
+	tmp[1] = in[1];
+	tmp[2] = in[2];
+	tmp[3] = in[3];
+	/* Case 1: a = 1 iff in >= 2^224 */
+	a = (in[3] >> 56);
+	tmp[0] -= a;
+	tmp[1] += a << 40;
+	tmp[3] &= 0x00ffffffffffffff;
+	/* Case 2: a = 0 iff p <= in < 2^224, i.e.,
+	 * the high 128 bits are all 1 and the lower part is non-zero */
+	a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) |
+		(((int64_t)(in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63);
+	a &= 0x00ffffffffffffff;
+	/* turn a into an all-one mask (if a = 0) or an all-zero mask */
+	a = (a - 1) >> 63;
+	/* subtract 2^224 - 2^96 + 1 if a is all-one*/
+	tmp[3] &= a ^ 0xffffffffffffffff;
+	tmp[2] &= a ^ 0xffffffffffffffff;
+	tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff;
+	tmp[0] -= 1 & a;
+
+	/* eliminate negative coefficients: if tmp[0] is negative, tmp[1] must
+	 * be non-zero, so we only need one step */
+	a = tmp[0] >> 63;
+	tmp[0] += two56 & a;
+	tmp[1] -= 1 & a;
+
+	/* carry 1 -> 2 -> 3 */
+	tmp[2] += tmp[1] >> 56;
+	tmp[1] &= 0x00ffffffffffffff;
+
+	tmp[3] += tmp[2] >> 56;
+	tmp[2] &= 0x00ffffffffffffff;
+
+	/* Now 0 <= out < p */
+	out[0] = tmp[0];
+	out[1] = tmp[1];
+	out[2] = tmp[2];
+	out[3] = tmp[3];
+	}
+
+/* Zero-check: returns 1 if input is 0, and 0 otherwise.
+ * We know that field elements are reduced to in < 2^225,
+ * so we only need to check three cases: 0, 2^224 - 2^96 + 1,
+ * and 2^225 - 2^97 + 2 */
+static limb felem_is_zero(const felem in)
+	{
+	limb zero, two224m96p1, two225m97p2;
+
+	zero = in[0] | in[1] | in[2] | in[3];
+	zero = (((int64_t)(zero) - 1) >> 63) & 1;
+	two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000)
+		| (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff);
+	two224m96p1 = (((int64_t)(two224m96p1) - 1) >> 63) & 1;
+	two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000)
+		| (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff);
+	two225m97p2 = (((int64_t)(two225m97p2) - 1) >> 63) & 1;
+	return (zero | two224m96p1 | two225m97p2);
+	}
+
+static limb felem_is_zero_int(const felem in)
+	{
+	return (int) (felem_is_zero(in) & ((limb)1));
+	}
+
+/* Invert a field element */
+/* Computation chain copied from djb's code */
+static void felem_inv(felem out, const felem in)
+	{
+	felem ftmp, ftmp2, ftmp3, ftmp4;
+	widefelem tmp;
+	unsigned i;
+
+	felem_square(tmp, in); felem_reduce(ftmp, tmp);		/* 2 */
+	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^2 - 1 */
+	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2 */
+	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 1 */
+	felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp);	/* 2^4 - 2 */
+	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);	/* 2^5 - 4 */
+	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);	/* 2^6 - 8 */
+	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp);	/* 2^6 - 1 */
+	felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp);	/* 2^7 - 2 */
+	for (i = 0; i < 5; ++i)					/* 2^12 - 2^6 */
+		{
+		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
+		}
+	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp2, tmp);	/* 2^12 - 1 */
+	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^13 - 2 */
+	for (i = 0; i < 11; ++i)				/* 2^24 - 2^12 */
+		{
+		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);
+		}
+	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp2, tmp); /* 2^24 - 1 */
+	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^25 - 2 */
+	for (i = 0; i < 23; ++i)				/* 2^48 - 2^24 */
+		{
+		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);
+		}
+	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^48 - 1 */
+	felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp);	/* 2^49 - 2 */
+	for (i = 0; i < 47; ++i)				/* 2^96 - 2^48 */
+		{
+		felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);
+		}
+	felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp); /* 2^96 - 1 */
+	felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp);	/* 2^97 - 2 */
+	for (i = 0; i < 23; ++i)				/* 2^120 - 2^24 */
+		{
+		felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);
+		}
+	felem_mul(tmp, ftmp2, ftmp4); felem_reduce(ftmp2, tmp); /* 2^120 - 1 */
+	for (i = 0; i < 6; ++i)					/* 2^126 - 2^6 */
+		{
+		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
+		}
+	felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp);	/* 2^126 - 1 */
+	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^127 - 2 */
+	felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp);	/* 2^127 - 1 */
+	for (i = 0; i < 97; ++i)				/* 2^224 - 2^97 */
+		{
+		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
+		}
+	felem_mul(tmp, ftmp, ftmp3); felem_reduce(out, tmp);	/* 2^224 - 2^96 - 1 */
+	}
+
+/* Copy in constant time:
+ * if icopy == 1, copy in to out,
+ * if icopy == 0, copy out to itself. */
+static void
+copy_conditional(felem out, const felem in, limb icopy)
+	{
+	unsigned i;
+	/* icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one */
+	const limb copy = -icopy;
+	for (i = 0; i < 4; ++i)
+		{
+		const limb tmp = copy & (in[i] ^ out[i]);
+		out[i] ^= tmp;
+		}
+	}
+
+/******************************************************************************/
+/*			 ELLIPTIC CURVE POINT OPERATIONS
+ *
+ * Points are represented in Jacobian projective coordinates:
+ * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3),
+ * or to the point at infinity if Z == 0.
+ *
+ */
+
+/* Double an elliptic curve point:
+ * (X', Y', Z') = 2 * (X, Y, Z), where
+ * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2
+ * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2
+ * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z
+ * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
+ * while x_out == y_in is not (maybe this works, but it's not tested). */
+static void
+point_double(felem x_out, felem y_out, felem z_out,
+             const felem x_in, const felem y_in, const felem z_in)
+	{
+	widefelem tmp, tmp2;
+	felem delta, gamma, beta, alpha, ftmp, ftmp2;
+
+	felem_assign(ftmp, x_in);
+	felem_assign(ftmp2, x_in);
+
+	/* delta = z^2 */
+	felem_square(tmp, z_in);
+	felem_reduce(delta, tmp);
+
+	/* gamma = y^2 */
+	felem_square(tmp, y_in);
+	felem_reduce(gamma, tmp);
+
+	/* beta = x*gamma */
+	felem_mul(tmp, x_in, gamma);
+	felem_reduce(beta, tmp);
+
+	/* alpha = 3*(x-delta)*(x+delta) */
+	felem_diff(ftmp, delta);
+	/* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */
+	felem_sum(ftmp2, delta);
+	/* ftmp2[i] < 2^57 + 2^57 = 2^58 */
+	felem_scalar(ftmp2, 3);
+	/* ftmp2[i] < 3 * 2^58 < 2^60 */
+	felem_mul(tmp, ftmp, ftmp2);
+	/* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */
+	felem_reduce(alpha, tmp);
+
+	/* x' = alpha^2 - 8*beta */
+	felem_square(tmp, alpha);
+	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
+	felem_assign(ftmp, beta);
+	felem_scalar(ftmp, 8);
+	/* ftmp[i] < 8 * 2^57 = 2^60 */
+	felem_diff_128_64(tmp, ftmp);
+	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
+	felem_reduce(x_out, tmp);
+
+	/* z' = (y + z)^2 - gamma - delta */
+	felem_sum(delta, gamma);
+	/* delta[i] < 2^57 + 2^57 = 2^58 */
+	felem_assign(ftmp, y_in);
+	felem_sum(ftmp, z_in);
+	/* ftmp[i] < 2^57 + 2^57 = 2^58 */
+	felem_square(tmp, ftmp);
+	/* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */
+	felem_diff_128_64(tmp, delta);
+	/* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */
+	felem_reduce(z_out, tmp);
+
+	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
+	felem_scalar(beta, 4);
+	/* beta[i] < 4 * 2^57 = 2^59 */
+	felem_diff(beta, x_out);
+	/* beta[i] < 2^59 + 2^58 + 2 < 2^60 */
+	felem_mul(tmp, alpha, beta);
+	/* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */
+	felem_square(tmp2, gamma);
+	/* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */
+	widefelem_scalar(tmp2, 8);
+	/* tmp2[i] < 8 * 2^116 = 2^119 */
+	widefelem_diff(tmp, tmp2);
+	/* tmp[i] < 2^119 + 2^120 < 2^121 */
+	felem_reduce(y_out, tmp);
+	}
+
+/* Add two elliptic curve points:
+ * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where
+ * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -
+ * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2
+ * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) -
+ *        Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3
+ * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2)
+ *
+ * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0.
+ */
+
+/* This function is not entirely constant-time:
+ * it includes a branch for checking whether the two input points are equal,
+ * (while not equal to the point at infinity).
+ * This case never happens during single point multiplication,
+ * so there is no timing leak for ECDH or ECDSA signing. */
+static void point_add(felem x3, felem y3, felem z3,
+	const felem x1, const felem y1, const felem z1,
+	const int mixed, const felem x2, const felem y2, const felem z2)
+	{
+	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out;
+	widefelem tmp, tmp2;
+	limb z1_is_zero, z2_is_zero, x_equal, y_equal;
+
+	if (!mixed)
+		{
+		/* ftmp2 = z2^2 */
+		felem_square(tmp, z2);
+		felem_reduce(ftmp2, tmp);
+
+		/* ftmp4 = z2^3 */
+		felem_mul(tmp, ftmp2, z2);
+		felem_reduce(ftmp4, tmp);
+
+		/* ftmp4 = z2^3*y1 */
+		felem_mul(tmp2, ftmp4, y1);
+		felem_reduce(ftmp4, tmp2);
+
+		/* ftmp2 = z2^2*x1 */
+		felem_mul(tmp2, ftmp2, x1);
+		felem_reduce(ftmp2, tmp2);
+		}
+	else
+		{
+		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
+
+		/* ftmp4 = z2^3*y1 */
+		felem_assign(ftmp4, y1);
+
+		/* ftmp2 = z2^2*x1 */
+		felem_assign(ftmp2, x1);
+		}
+
+	/* ftmp = z1^2 */
+	felem_square(tmp, z1);
+	felem_reduce(ftmp, tmp);
+
+	/* ftmp3 = z1^3 */
+	felem_mul(tmp, ftmp, z1);
+	felem_reduce(ftmp3, tmp);
+
+	/* tmp = z1^3*y2 */
+	felem_mul(tmp, ftmp3, y2);
+	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
+
+	/* ftmp3 = z1^3*y2 - z2^3*y1 */
+	felem_diff_128_64(tmp, ftmp4);
+	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
+	felem_reduce(ftmp3, tmp);
+
+	/* tmp = z1^2*x2 */
+	felem_mul(tmp, ftmp, x2);
+	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
+
+	/* ftmp = z1^2*x2 - z2^2*x1 */
+	felem_diff_128_64(tmp, ftmp2);
+	/* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */
+	felem_reduce(ftmp, tmp);
+
+	/* the formulae are incorrect if the points are equal
+	 * so we check for this and do doubling if this happens */
+	x_equal = felem_is_zero(ftmp);
+	y_equal = felem_is_zero(ftmp3);
+	z1_is_zero = felem_is_zero(z1);
+	z2_is_zero = felem_is_zero(z2);
+	/* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */
+	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
+		{
+		point_double(x3, y3, z3, x1, y1, z1);
+		return;
+		}
+
+	/* ftmp5 = z1*z2 */
+	if (!mixed)
+		{
+		felem_mul(tmp, z1, z2);
+		felem_reduce(ftmp5, tmp);
+		}
+	else
+		{
+		/* special case z2 = 0 is handled later */
+		felem_assign(ftmp5, z1);
+		}
+
+	/* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */
+	felem_mul(tmp, ftmp, ftmp5);
+	felem_reduce(z_out, tmp);
+
+	/* ftmp = (z1^2*x2 - z2^2*x1)^2 */
+	felem_assign(ftmp5, ftmp);
+	felem_square(tmp, ftmp);
+	felem_reduce(ftmp, tmp);
+
+	/* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */
+	felem_mul(tmp, ftmp, ftmp5);
+	felem_reduce(ftmp5, tmp);
+
+	/* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
+	felem_mul(tmp, ftmp2, ftmp);
+	felem_reduce(ftmp2, tmp);
+
+	/* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
+	felem_mul(tmp, ftmp4, ftmp5);
+	/* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */
+
+	/* tmp2 = (z1^3*y2 - z2^3*y1)^2 */
+	felem_square(tmp2, ftmp3);
+	/* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */
+
+	/* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */
+	felem_diff_128_64(tmp2, ftmp5);
+	/* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */
+
+	/* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
+	felem_assign(ftmp5, ftmp2);
+	felem_scalar(ftmp5, 2);
+	/* ftmp5[i] < 2 * 2^57 = 2^58 */
+
+	/* x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 -
+	   2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */
+	felem_diff_128_64(tmp2, ftmp5);
+	/* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */
+	felem_reduce(x_out, tmp2);
+
+	/* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */
+	felem_diff(ftmp2, x_out);
+	/* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */
+
+	/* tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) */
+	felem_mul(tmp2, ftmp3, ftmp2);
+	/* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */
+
+	/* y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) -
+	   z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */
+	widefelem_diff(tmp2, tmp);
+	/* tmp2[i] < 2^118 + 2^120 < 2^121 */
+	felem_reduce(y_out, tmp2);
+
+	/* the result (x_out, y_out, z_out) is incorrect if one of the inputs is
+	 * the point at infinity, so we need to check for this separately */
+
+	/* if point 1 is at infinity, copy point 2 to output, and vice versa */
+	copy_conditional(x_out, x2, z1_is_zero);
+	copy_conditional(x_out, x1, z2_is_zero);
+	copy_conditional(y_out, y2, z1_is_zero);
+	copy_conditional(y_out, y1, z2_is_zero);
+	copy_conditional(z_out, z2, z1_is_zero);
+	copy_conditional(z_out, z1, z2_is_zero);
+	felem_assign(x3, x_out);
+	felem_assign(y3, y_out);
+	felem_assign(z3, z_out);
+	}
+
+/* select_point selects the |idx|th point from a precomputation table and
+ * copies it to out. */
+static void select_point(const u64 idx, unsigned int size, const felem pre_comp[/*size*/][3], felem out[3])
+	{
+	unsigned i, j;
+	limb *outlimbs = &out[0][0];
+	memset(outlimbs, 0, 3 * sizeof(felem));
+
+	for (i = 0; i < size; i++)
+		{
+		const limb *inlimbs = &pre_comp[i][0][0];
+		u64 mask = i ^ idx;
+		mask |= mask >> 4;
+		mask |= mask >> 2;
+		mask |= mask >> 1;
+		mask &= 1;
+		mask--;
+		for (j = 0; j < 4 * 3; j++)
+			outlimbs[j] |= inlimbs[j] & mask;
+		}
+	}
+
+/* get_bit returns the |i|th bit in |in| */
+static char get_bit(const felem_bytearray in, unsigned i)
+	{
+	if (i >= 224)
+		return 0;
+	return (in[i >> 3] >> (i & 7)) & 1;
+	}
+
+/* Interleaved point multiplication using precomputed point multiples:
+ * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
+ * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
+ * of the generator, using certain (large) precomputed multiples in g_pre_comp.
+ * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
+static void batch_mul(felem x_out, felem y_out, felem z_out,
+	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
+	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[2][16][3])
+	{
+	int i, skip;
+	unsigned num;
+	unsigned gen_mul = (g_scalar != NULL);
+	felem nq[3], tmp[4];
+	u64 bits;
+	u8 sign, digit;
+
+	/* set nq to the point at infinity */
+	memset(nq, 0, 3 * sizeof(felem));
+
+	/* Loop over all scalars msb-to-lsb, interleaving additions
+	 * of multiples of the generator (two in each of the last 28 rounds)
+	 * and additions of other points multiples (every 5th round).
+	 */
+	skip = 1; /* save two point operations in the first round */
+	for (i = (num_points ? 220 : 27); i >= 0; --i)
+		{
+		/* double */
+		if (!skip)
+			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
+
+		/* add multiples of the generator */
+		if (gen_mul && (i <= 27))
+			{
+			/* first, look 28 bits upwards */
+			bits = get_bit(g_scalar, i + 196) << 3;
+			bits |= get_bit(g_scalar, i + 140) << 2;
+			bits |= get_bit(g_scalar, i + 84) << 1;
+			bits |= get_bit(g_scalar, i + 28);
+			/* select the point to add, in constant time */
+			select_point(bits, 16, g_pre_comp[1], tmp);
+
+			if (!skip)
+				{
+				point_add(nq[0], nq[1], nq[2],
+					nq[0], nq[1], nq[2],
+					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
+				}
+			else
+				{
+				memcpy(nq, tmp, 3 * sizeof(felem));
+				skip = 0;
+				}
+
+			/* second, look at the current position */
+			bits = get_bit(g_scalar, i + 168) << 3;
+			bits |= get_bit(g_scalar, i + 112) << 2;
+			bits |= get_bit(g_scalar, i + 56) << 1;
+			bits |= get_bit(g_scalar, i);
+			/* select the point to add, in constant time */
+			select_point(bits, 16, g_pre_comp[0], tmp);
+			point_add(nq[0], nq[1], nq[2],
+				nq[0], nq[1], nq[2],
+				1 /* mixed */, tmp[0], tmp[1], tmp[2]);
+			}
+
+		/* do other additions every 5 doublings */
+		if (num_points && (i % 5 == 0))
+			{
+			/* loop over all scalars */
+			for (num = 0; num < num_points; ++num)
+				{
+				bits = get_bit(scalars[num], i + 4) << 5;
+				bits |= get_bit(scalars[num], i + 3) << 4;
+				bits |= get_bit(scalars[num], i + 2) << 3;
+				bits |= get_bit(scalars[num], i + 1) << 2;
+				bits |= get_bit(scalars[num], i) << 1;
+				bits |= get_bit(scalars[num], i - 1);
+				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
+
+				/* select the point to add or subtract */
+				select_point(digit, 17, pre_comp[num], tmp);
+				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
+				copy_conditional(tmp[1], tmp[3], sign);
+
+				if (!skip)
+					{
+					point_add(nq[0], nq[1], nq[2],
+						nq[0], nq[1], nq[2],
+						mixed, tmp[0], tmp[1], tmp[2]);
+					}
+				else
+					{
+					memcpy(nq, tmp, 3 * sizeof(felem));
+					skip = 0;
+					}
+				}
+			}
+		}
+	felem_assign(x_out, nq[0]);
+	felem_assign(y_out, nq[1]);
+	felem_assign(z_out, nq[2]);
+	}
+
+/******************************************************************************/
+/*		       FUNCTIONS TO MANAGE PRECOMPUTATION
+ */
+
+static NISTP224_PRE_COMP *nistp224_pre_comp_new()
+	{
+	NISTP224_PRE_COMP *ret = NULL;
+	ret = (NISTP224_PRE_COMP *) OPENSSL_malloc(sizeof *ret);
+	if (!ret)
+		{
+		ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
+		return ret;
+		}
+	memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
+	ret->references = 1;
+	return ret;
+	}
+
+static void *nistp224_pre_comp_dup(void *src_)
+	{
+	NISTP224_PRE_COMP *src = src_;
+
+	/* no need to actually copy, these objects never change! */
+	CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
+
+	return src_;
+	}
+
+static void nistp224_pre_comp_free(void *pre_)
+	{
+	int i;
+	NISTP224_PRE_COMP *pre = pre_;
+
+	if (!pre)
+		return;
+
+	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
+	if (i > 0)
+		return;
+
+	OPENSSL_free(pre);
+	}
+
+static void nistp224_pre_comp_clear_free(void *pre_)
+	{
+	int i;
+	NISTP224_PRE_COMP *pre = pre_;
+
+	if (!pre)
+		return;
+
+	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
+	if (i > 0)
+		return;
+
+	OPENSSL_cleanse(pre, sizeof *pre);
+	OPENSSL_free(pre);
+	}
+
+/******************************************************************************/
+/*			   OPENSSL EC_METHOD FUNCTIONS
+ */
+
+int ec_GFp_nistp224_group_init(EC_GROUP *group)
+	{
+	int ret;
+	ret = ec_GFp_simple_group_init(group);
+	group->a_is_minus3 = 1;
+	return ret;
+	}
+
+int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p,
+	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
+	{
+	int ret = 0;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *curve_p, *curve_a, *curve_b;
+
+	if (ctx == NULL)
+		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
+	BN_CTX_start(ctx);
+	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
+		((curve_a = BN_CTX_get(ctx)) == NULL) ||
+		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
+	BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p);
+	BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a);
+	BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b);
+	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
+		(BN_cmp(curve_b, b)))
+		{
+		ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE,
+			EC_R_WRONG_CURVE_PARAMETERS);
+		goto err;
+		}
+	group->field_mod_func = BN_nist_mod_224;
+	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
+err:
+	BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+	}
+
+/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
+ * (X', Y') = (X/Z^2, Y/Z^3) */
+int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group,
+	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
+	{
+	felem z1, z2, x_in, y_in, x_out, y_out;
+	widefelem tmp;
+
+	if (EC_POINT_is_at_infinity(group, point))
+		{
+		ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
+			EC_R_POINT_AT_INFINITY);
+		return 0;
+		}
+	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
+		(!BN_to_felem(z1, &point->Z))) return 0;
+	felem_inv(z2, z1);
+	felem_square(tmp, z2); felem_reduce(z1, tmp);
+	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
+	felem_contract(x_out, x_in);
+	if (x != NULL)
+		{
+		if (!felem_to_BN(x, x_out)) {
+		ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
+			ERR_R_BN_LIB);
+		return 0;
+		}
+		}
+	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
+	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
+	felem_contract(y_out, y_in);
+	if (y != NULL)
+		{
+		if (!felem_to_BN(y, y_out)) {
+		ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES,
+			ERR_R_BN_LIB);
+		return 0;
+		}
+		}
+	return 1;
+	}
+
+static void make_points_affine(size_t num, felem points[/*num*/][3], felem tmp_felems[/*num+1*/])
+	{
+	/* Runs in constant time, unless an input is the point at infinity
+	 * (which normally shouldn't happen). */
+	ec_GFp_nistp_points_make_affine_internal(
+		num,
+		points,
+		sizeof(felem),
+		tmp_felems,
+		(void (*)(void *)) felem_one,
+		(int (*)(const void *)) felem_is_zero_int,
+		(void (*)(void *, const void *)) felem_assign,
+		(void (*)(void *, const void *)) felem_square_reduce,
+		(void (*)(void *, const void *, const void *)) felem_mul_reduce,
+		(void (*)(void *, const void *)) felem_inv,
+		(void (*)(void *, const void *)) felem_contract);
+	}
+
+/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
+ * Result is stored in r (r can equal one of the inputs). */
+int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r,
+	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
+	const BIGNUM *scalars[], BN_CTX *ctx)
+	{
+	int ret = 0;
+	int j;
+	unsigned i;
+	int mixed = 0;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *x, *y, *z, *tmp_scalar;
+	felem_bytearray g_secret;
+	felem_bytearray *secrets = NULL;
+	felem (*pre_comp)[17][3] = NULL;
+	felem *tmp_felems = NULL;
+	felem_bytearray tmp;
+	unsigned num_bytes;
+	int have_pre_comp = 0;
+	size_t num_points = num;
+	felem x_in, y_in, z_in, x_out, y_out, z_out;
+	NISTP224_PRE_COMP *pre = NULL;
+	const felem (*g_pre_comp)[16][3] = NULL;
+	EC_POINT *generator = NULL;
+	const EC_POINT *p = NULL;
+	const BIGNUM *p_scalar = NULL;
+
+	if (ctx == NULL)
+		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
+	BN_CTX_start(ctx);
+	if (((x = BN_CTX_get(ctx)) == NULL) ||
+		((y = BN_CTX_get(ctx)) == NULL) ||
+		((z = BN_CTX_get(ctx)) == NULL) ||
+		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
+		goto err;
+
+	if (scalar != NULL)
+		{
+		pre = EC_EX_DATA_get_data(group->extra_data,
+			nistp224_pre_comp_dup, nistp224_pre_comp_free,
+			nistp224_pre_comp_clear_free);
+		if (pre)
+			/* we have precomputation, try to use it */
+			g_pre_comp = (const felem (*)[16][3]) pre->g_pre_comp;
+		else
+			/* try to use the standard precomputation */
+			g_pre_comp = &gmul[0];
+		generator = EC_POINT_new(group);
+		if (generator == NULL)
+			goto err;
+		/* get the generator from precomputation */
+		if (!felem_to_BN(x, g_pre_comp[0][1][0]) ||
+			!felem_to_BN(y, g_pre_comp[0][1][1]) ||
+			!felem_to_BN(z, g_pre_comp[0][1][2]))
+			{
+			ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
+			goto err;
+			}
+		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
+				generator, x, y, z, ctx))
+			goto err;
+		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
+			/* precomputation matches generator */
+			have_pre_comp = 1;
+		else
+			/* we don't have valid precomputation:
+			 * treat the generator as a random point */
+			num_points = num_points + 1;
+		}
+
+	if (num_points > 0)
+		{
+		if (num_points >= 3)
+			{
+			/* unless we precompute multiples for just one or two points,
+			 * converting those into affine form is time well spent  */
+			mixed = 1;
+			}
+		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
+		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
+		if (mixed)
+			tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
+		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
+			{
+			ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE);
+			goto err;
+			}
+
+		/* we treat NULL scalars as 0, and NULL points as points at infinity,
+		 * i.e., they contribute nothing to the linear combination */
+		memset(secrets, 0, num_points * sizeof(felem_bytearray));
+		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
+		for (i = 0; i < num_points; ++i)
+			{
+			if (i == num)
+				/* the generator */
+				{
+				p = EC_GROUP_get0_generator(group);
+				p_scalar = scalar;
+				}
+			else
+				/* the i^th point */
+				{
+				p = points[i];
+				p_scalar = scalars[i];
+				}
+			if ((p_scalar != NULL) && (p != NULL))
+				{
+				/* reduce scalar to 0 <= scalar < 2^224 */
+				if ((BN_num_bits(p_scalar) > 224) || (BN_is_negative(p_scalar)))
+					{
+					/* this is an unusual input, and we don't guarantee
+					 * constant-timeness */
+					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
+						{
+						ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
+						goto err;
+						}
+					num_bytes = BN_bn2bin(tmp_scalar, tmp);
+					}
+				else
+					num_bytes = BN_bn2bin(p_scalar, tmp);
+				flip_endian(secrets[i], tmp, num_bytes);
+				/* precompute multiples */
+				if ((!BN_to_felem(x_out, &p->X)) ||
+					(!BN_to_felem(y_out, &p->Y)) ||
+					(!BN_to_felem(z_out, &p->Z))) goto err;
+				felem_assign(pre_comp[i][1][0], x_out);
+				felem_assign(pre_comp[i][1][1], y_out);
+				felem_assign(pre_comp[i][1][2], z_out);
+				for (j = 2; j <= 16; ++j)
+					{
+					if (j & 1)
+						{
+						point_add(
+							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
+							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
+							0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
+						}
+					else
+						{
+						point_double(
+							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
+							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
+						}
+					}
+				}
+			}
+		if (mixed)
+			make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
+		}
+
+	/* the scalar for the generator */
+	if ((scalar != NULL) && (have_pre_comp))
+		{
+		memset(g_secret, 0, sizeof g_secret);
+		/* reduce scalar to 0 <= scalar < 2^224 */
+		if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar)))
+			{
+			/* this is an unusual input, and we don't guarantee
+			 * constant-timeness */
+			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
+				{
+				ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
+				goto err;
+				}
+			num_bytes = BN_bn2bin(tmp_scalar, tmp);
+			}
+		else
+			num_bytes = BN_bn2bin(scalar, tmp);
+		flip_endian(g_secret, tmp, num_bytes);
+		/* do the multiplication with generator precomputation*/
+		batch_mul(x_out, y_out, z_out,
+			(const felem_bytearray (*)) secrets, num_points,
+			g_secret,
+			mixed, (const felem (*)[17][3]) pre_comp,
+			g_pre_comp);
+		}
+	else
+		/* do the multiplication without generator precomputation */
+		batch_mul(x_out, y_out, z_out,
+			(const felem_bytearray (*)) secrets, num_points,
+			NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
+	/* reduce the output to its unique minimal representation */
+	felem_contract(x_in, x_out);
+	felem_contract(y_in, y_out);
+	felem_contract(z_in, z_out);
+	if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
+		(!felem_to_BN(z, z_in)))
+		{
+		ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB);
+		goto err;
+		}
+	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
+
+err:
+	BN_CTX_end(ctx);
+	if (generator != NULL)
+		EC_POINT_free(generator);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	if (secrets != NULL)
+		OPENSSL_free(secrets);
+	if (pre_comp != NULL)
+		OPENSSL_free(pre_comp);
+	if (tmp_felems != NULL)
+		OPENSSL_free(tmp_felems);
+	return ret;
+	}
+
+int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
+	{
+	int ret = 0;
+	NISTP224_PRE_COMP *pre = NULL;
+	int i, j;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *x, *y;
+	EC_POINT *generator = NULL;
+	felem tmp_felems[32];
+
+	/* throw away old precomputation */
+	EC_EX_DATA_free_data(&group->extra_data, nistp224_pre_comp_dup,
+		nistp224_pre_comp_free, nistp224_pre_comp_clear_free);
+	if (ctx == NULL)
+		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
+	BN_CTX_start(ctx);
+	if (((x = BN_CTX_get(ctx)) == NULL) ||
+		((y = BN_CTX_get(ctx)) == NULL))
+		goto err;
+	/* get the generator */
+	if (group->generator == NULL) goto err;
+	generator = EC_POINT_new(group);
+	if (generator == NULL)
+		goto err;
+	BN_bin2bn(nistp224_curve_params[3], sizeof (felem_bytearray), x);
+	BN_bin2bn(nistp224_curve_params[4], sizeof (felem_bytearray), y);
+	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
+		goto err;
+	if ((pre = nistp224_pre_comp_new()) == NULL)
+		goto err;
+	/* if the generator is the standard one, use built-in precomputation */
+	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
+		{
+		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
+		ret = 1;
+		goto err;
+		}
+	if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) ||
+		(!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) ||
+		(!BN_to_felem(pre->g_pre_comp[0][1][2], &group->generator->Z)))
+		goto err;
+	/* compute 2^56*G, 2^112*G, 2^168*G for the first table,
+	 * 2^28*G, 2^84*G, 2^140*G, 2^196*G for the second one
+	 */
+	for (i = 1; i <= 8; i <<= 1)
+		{
+		point_double(
+			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
+			pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
+		for (j = 0; j < 27; ++j)
+			{
+			point_double(
+				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
+				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
+			}
+		if (i == 8)
+			break;
+		point_double(
+			pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
+			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
+		for (j = 0; j < 27; ++j)
+			{
+			point_double(
+				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
+				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]);
+			}
+		}
+	for (i = 0; i < 2; i++)
+		{
+		/* g_pre_comp[i][0] is the point at infinity */
+		memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
+		/* the remaining multiples */
+		/* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */
+		point_add(
+			pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1],
+			pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0],
+			pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
+			0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
+			pre->g_pre_comp[i][2][2]);
+		/* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */
+		point_add(
+			pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1],
+			pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0],
+			pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
+			0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
+			pre->g_pre_comp[i][2][2]);
+		/* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */
+		point_add(
+			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1],
+			pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0],
+			pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
+			0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1],
+			pre->g_pre_comp[i][4][2]);
+		/* 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G */
+		point_add(
+			pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1],
+			pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0],
+			pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
+			0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1],
+			pre->g_pre_comp[i][2][2]);
+		for (j = 1; j < 8; ++j)
+			{
+			/* odd multiples: add G resp. 2^28*G */
+			point_add(
+				pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1],
+				pre->g_pre_comp[i][2*j+1][2], pre->g_pre_comp[i][2*j][0],
+				pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2],
+				0, pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1],
+				pre->g_pre_comp[i][1][2]);
+			}
+		}
+	make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems);
+
+	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup,
+			nistp224_pre_comp_free, nistp224_pre_comp_clear_free))
+		goto err;
+	ret = 1;
+	pre = NULL;
+ err:
+	BN_CTX_end(ctx);
+	if (generator != NULL)
+		EC_POINT_free(generator);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	if (pre)
+		nistp224_pre_comp_free(pre);
+	return ret;
+	}
+
+int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group)
+	{
+	if (EC_EX_DATA_get_data(group->extra_data, nistp224_pre_comp_dup,
+			nistp224_pre_comp_free, nistp224_pre_comp_clear_free)
+		!= NULL)
+		return 1;
+	else
+		return 0;
+	}
+
+#else
+static void *dummy=&dummy;
+#endif
diff --git a/src/lib/libcrypto/ec/ecp_nistp256.c b/src/lib/libcrypto/ec/ecp_nistp256.c
new file mode 100644
index 0000000000..4bc0f5dce0
--- /dev/null
+++ b/src/lib/libcrypto/ec/ecp_nistp256.c
@@ -0,0 +1,2171 @@
+/* crypto/ec/ecp_nistp256.c */
+/*
+ * Written by Adam Langley (Google) for the OpenSSL project
+ */
+/* Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ *
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication
+ *
+ * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
+ * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
+ * work which got its smarts from Daniel J. Bernstein's work on the same.
+ */
+
+#include <openssl/opensslconf.h>
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+
+#ifndef OPENSSL_SYS_VMS
+#include <stdint.h>
+#else
+#include <inttypes.h>
+#endif
+
+#include <string.h>
+#include <openssl/err.h>
+#include "ec_lcl.h"
+
+#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+  /* even with gcc, the typedef won't work for 32-bit platforms */
+  typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
+  typedef __int128_t int128_t;
+#else
+  #error "Need GCC 3.1 or later to define type uint128_t"
+#endif
+
+typedef uint8_t u8;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int64_t s64;
+
+/* The underlying field.
+ *
+ * P256 operates over GF(2^256-2^224+2^192+2^96-1). We can serialise an element
+ * of this field into 32 bytes. We call this an felem_bytearray. */
+
+typedef u8 felem_bytearray[32];
+
+/* These are the parameters of P256, taken from FIPS 186-3, page 86. These
+ * values are big-endian. */
+static const felem_bytearray nistp256_curve_params[5] = {
+	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,       /* p */
+	 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+	{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,       /* a = -3 */
+	 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc},      /* b */
+	{0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
+	 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
+	 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
+	 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b},
+	{0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47,       /* x */
+	 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
+	 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0,
+	 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96},
+	{0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b,       /* y */
+	 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16,
+	 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce,
+	 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5}
+};
+
+/* The representation of field elements.
+ * ------------------------------------
+ *
+ * We represent field elements with either four 128-bit values, eight 128-bit
+ * values, or four 64-bit values. The field element represented is:
+ *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192  (mod p)
+ * or:
+ *   v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512  (mod p)
+ *
+ * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits
+ * apart, but are 128-bits wide, the most significant bits of each limb overlap
+ * with the least significant bits of the next.
+ *
+ * A field element with four limbs is an 'felem'. One with eight limbs is a
+ * 'longfelem'
+ *
+ * A field element with four, 64-bit values is called a 'smallfelem'. Small
+ * values are used as intermediate values before multiplication.
+ */
+
+#define NLIMBS 4
+
+typedef uint128_t limb;
+typedef limb felem[NLIMBS];
+typedef limb longfelem[NLIMBS * 2];
+typedef u64 smallfelem[NLIMBS];
+
+/* This is the value of the prime as four 64-bit words, little-endian. */
+static const u64 kPrime[4] = { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul };
+static const limb bottom32bits = 0xffffffff;
+static const u64 bottom63bits = 0x7ffffffffffffffful;
+
+/* bin32_to_felem takes a little-endian byte array and converts it into felem
+ * form. This assumes that the CPU is little-endian. */
+static void bin32_to_felem(felem out, const u8 in[32])
+	{
+	out[0] = *((u64*) &in[0]);
+	out[1] = *((u64*) &in[8]);
+	out[2] = *((u64*) &in[16]);
+	out[3] = *((u64*) &in[24]);
+	}
+
+/* smallfelem_to_bin32 takes a smallfelem and serialises into a little endian,
+ * 32 byte array. This assumes that the CPU is little-endian. */
+static void smallfelem_to_bin32(u8 out[32], const smallfelem in)
+	{
+	*((u64*) &out[0]) = in[0];
+	*((u64*) &out[8]) = in[1];
+	*((u64*) &out[16]) = in[2];
+	*((u64*) &out[24]) = in[3];
+	}
+
+/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
+static void flip_endian(u8 *out, const u8 *in, unsigned len)
+	{
+	unsigned i;
+	for (i = 0; i < len; ++i)
+		out[i] = in[len-1-i];
+	}
+
+/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
+static int BN_to_felem(felem out, const BIGNUM *bn)
+	{
+	felem_bytearray b_in;
+	felem_bytearray b_out;
+	unsigned num_bytes;
+
+	/* BN_bn2bin eats leading zeroes */
+	memset(b_out, 0, sizeof b_out);
+	num_bytes = BN_num_bytes(bn);
+	if (num_bytes > sizeof b_out)
+		{
+		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
+		return 0;
+		}
+	if (BN_is_negative(bn))
+		{
+		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
+		return 0;
+		}
+	num_bytes = BN_bn2bin(bn, b_in);
+	flip_endian(b_out, b_in, num_bytes);
+	bin32_to_felem(out, b_out);
+	return 1;
+	}
+
+/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
+static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in)
+	{
+	felem_bytearray b_in, b_out;
+	smallfelem_to_bin32(b_in, in);
+	flip_endian(b_out, b_in, sizeof b_out);
+	return BN_bin2bn(b_out, sizeof b_out, out);
+	}
+
+
+/* Field operations
+ * ---------------- */
+
+static void smallfelem_one(smallfelem out)
+	{
+	out[0] = 1;
+	out[1] = 0;
+	out[2] = 0;
+	out[3] = 0;
+	}
+
+static void smallfelem_assign(smallfelem out, const smallfelem in)
+	{
+	out[0] = in[0];
+	out[1] = in[1];
+	out[2] = in[2];
+	out[3] = in[3];
+	}
+
+static void felem_assign(felem out, const felem in)
+	{
+	out[0] = in[0];
+	out[1] = in[1];
+	out[2] = in[2];
+	out[3] = in[3];
+	}
+
+/* felem_sum sets out = out + in. */
+static void felem_sum(felem out, const felem in)
+	{
+	out[0] += in[0];
+	out[1] += in[1];
+	out[2] += in[2];
+	out[3] += in[3];
+	}
+
+/* felem_small_sum sets out = out + in. */
+static void felem_small_sum(felem out, const smallfelem in)
+	{
+	out[0] += in[0];
+	out[1] += in[1];
+	out[2] += in[2];
+	out[3] += in[3];
+	}
+
+/* felem_scalar sets out = out * scalar */
+static void felem_scalar(felem out, const u64 scalar)
+	{
+	out[0] *= scalar;
+	out[1] *= scalar;
+	out[2] *= scalar;
+	out[3] *= scalar;
+	}
+
+/* longfelem_scalar sets out = out * scalar */
+static void longfelem_scalar(longfelem out, const u64 scalar)
+	{
+	out[0] *= scalar;
+	out[1] *= scalar;
+	out[2] *= scalar;
+	out[3] *= scalar;
+	out[4] *= scalar;
+	out[5] *= scalar;
+	out[6] *= scalar;
+	out[7] *= scalar;
+	}
+
+#define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9)
+#define two105 (((limb)1) << 105)
+#define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9)
+
+/* zero105 is 0 mod p */
+static const felem zero105 = { two105m41m9, two105, two105m41p9, two105m41p9 };
+
+/* smallfelem_neg sets |out| to |-small|
+ * On exit:
+ *   out[i] < out[i] + 2^105
+ */
+static void smallfelem_neg(felem out, const smallfelem small)
+	{
+	/* In order to prevent underflow, we subtract from 0 mod p. */
+	out[0] = zero105[0] - small[0];
+	out[1] = zero105[1] - small[1];
+	out[2] = zero105[2] - small[2];
+	out[3] = zero105[3] - small[3];
+	}
+
+/* felem_diff subtracts |in| from |out|
+ * On entry:
+ *   in[i] < 2^104
+ * On exit:
+ *   out[i] < out[i] + 2^105
+ */
+static void felem_diff(felem out, const felem in)
+	{
+	/* In order to prevent underflow, we add 0 mod p before subtracting. */
+	out[0] += zero105[0];
+	out[1] += zero105[1];
+	out[2] += zero105[2];
+	out[3] += zero105[3];
+
+	out[0] -= in[0];
+	out[1] -= in[1];
+	out[2] -= in[2];
+	out[3] -= in[3];
+	}
+
+#define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11)
+#define two107 (((limb)1) << 107)
+#define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11)
+
+/* zero107 is 0 mod p */
+static const felem zero107 = { two107m43m11, two107, two107m43p11, two107m43p11 };
+
+/* An alternative felem_diff for larger inputs |in|
+ * felem_diff_zero107 subtracts |in| from |out|
+ * On entry:
+ *   in[i] < 2^106
+ * On exit:
+ *   out[i] < out[i] + 2^107
+ */
+static void felem_diff_zero107(felem out, const felem in)
+	{
+	/* In order to prevent underflow, we add 0 mod p before subtracting. */
+	out[0] += zero107[0];
+	out[1] += zero107[1];
+	out[2] += zero107[2];
+	out[3] += zero107[3];
+
+	out[0] -= in[0];
+	out[1] -= in[1];
+	out[2] -= in[2];
+	out[3] -= in[3];
+	}
+
+/* longfelem_diff subtracts |in| from |out|
+ * On entry:
+ *   in[i] < 7*2^67
+ * On exit:
+ *   out[i] < out[i] + 2^70 + 2^40
+ */
+static void longfelem_diff(longfelem out, const longfelem in)
+	{
+	static const limb two70m8p6 = (((limb)1) << 70) - (((limb)1) << 8) + (((limb)1) << 6);
+	static const limb two70p40 = (((limb)1) << 70) + (((limb)1) << 40);
+	static const limb two70 = (((limb)1) << 70);
+	static const limb two70m40m38p6 = (((limb)1) << 70) - (((limb)1) << 40) - (((limb)1) << 38) + (((limb)1) << 6);
+	static const limb two70m6 = (((limb)1) << 70) - (((limb)1) << 6);
+
+	/* add 0 mod p to avoid underflow */
+	out[0] += two70m8p6;
+	out[1] += two70p40;
+	out[2] += two70;
+	out[3] += two70m40m38p6;
+	out[4] += two70m6;
+	out[5] += two70m6;
+	out[6] += two70m6;
+	out[7] += two70m6;
+
+	/* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */
+	out[0] -= in[0];
+	out[1] -= in[1];
+	out[2] -= in[2];
+	out[3] -= in[3];
+	out[4] -= in[4];
+	out[5] -= in[5];
+	out[6] -= in[6];
+	out[7] -= in[7];
+	}
+
+#define two64m0 (((limb)1) << 64) - 1
+#define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1
+#define two64m46 (((limb)1) << 64) - (((limb)1) << 46)
+#define two64m32 (((limb)1) << 64) - (((limb)1) << 32)
+
+/* zero110 is 0 mod p */
+static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 };
+
+/* felem_shrink converts an felem into a smallfelem. The result isn't quite
+ * minimal as the value may be greater than p.
+ *
+ * On entry:
+ *   in[i] < 2^109
+ * On exit:
+ *   out[i] < 2^64
+ */
+static void felem_shrink(smallfelem out, const felem in)
+	{
+	felem tmp;
+	u64 a, b, mask;
+	s64 high, low;
+	static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */
+
+	/* Carry 2->3 */
+	tmp[3] = zero110[3] + in[3] + ((u64) (in[2] >> 64));
+	/* tmp[3] < 2^110 */
+
+	tmp[2] = zero110[2] + (u64) in[2];
+	tmp[0] = zero110[0] + in[0];
+	tmp[1] = zero110[1] + in[1];
+	/* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */
+
+	/* We perform two partial reductions where we eliminate the
+	 * high-word of tmp[3]. We don't update the other words till the end.
+	 */
+	a = tmp[3] >> 64; /* a < 2^46 */
+	tmp[3] = (u64) tmp[3];
+	tmp[3] -= a;
+	tmp[3] += ((limb)a) << 32;
+	/* tmp[3] < 2^79 */
+
+	b = a;
+	a = tmp[3] >> 64; /* a < 2^15 */
+	b += a; /* b < 2^46 + 2^15 < 2^47 */
+	tmp[3] = (u64) tmp[3];
+	tmp[3] -= a;
+	tmp[3] += ((limb)a) << 32;
+	/* tmp[3] < 2^64 + 2^47 */
+
+	/* This adjusts the other two words to complete the two partial
+	 * reductions. */
+	tmp[0] += b;
+	tmp[1] -= (((limb)b) << 32);
+
+	/* In order to make space in tmp[3] for the carry from 2 -> 3, we
+	 * conditionally subtract kPrime if tmp[3] is large enough. */
+	high = tmp[3] >> 64;
+	/* As tmp[3] < 2^65, high is either 1 or 0 */
+	high <<= 63;
+	high >>= 63;
+	/* high is:
+	 *   all ones   if the high word of tmp[3] is 1
+	 *   all zeros  if the high word of tmp[3] if 0 */
+	low = tmp[3];
+	mask = low >> 63;
+	/* mask is:
+	 *   all ones   if the MSB of low is 1
+	 *   all zeros  if the MSB of low if 0 */
+	low &= bottom63bits;
+	low -= kPrime3Test;
+	/* if low was greater than kPrime3Test then the MSB is zero */
+	low = ~low;
+	low >>= 63;
+	/* low is:
+	 *   all ones   if low was > kPrime3Test
+	 *   all zeros  if low was <= kPrime3Test */
+	mask = (mask & low) | high;
+	tmp[0] -= mask & kPrime[0];
+	tmp[1] -= mask & kPrime[1];
+	/* kPrime[2] is zero, so omitted */
+	tmp[3] -= mask & kPrime[3];
+	/* tmp[3] < 2**64 - 2**32 + 1 */
+
+	tmp[1] += ((u64) (tmp[0] >> 64)); tmp[0] = (u64) tmp[0];
+	tmp[2] += ((u64) (tmp[1] >> 64)); tmp[1] = (u64) tmp[1];
+	tmp[3] += ((u64) (tmp[2] >> 64)); tmp[2] = (u64) tmp[2];
+	/* tmp[i] < 2^64 */
+
+	out[0] = tmp[0];
+	out[1] = tmp[1];
+	out[2] = tmp[2];
+	out[3] = tmp[3];
+	}
+
+/* smallfelem_expand converts a smallfelem to an felem */
+static void smallfelem_expand(felem out, const smallfelem in)
+	{
+	out[0] = in[0];
+	out[1] = in[1];
+	out[2] = in[2];
+	out[3] = in[3];
+	}
+
+/* smallfelem_square sets |out| = |small|^2
+ * On entry:
+ *   small[i] < 2^64
+ * On exit:
+ *   out[i] < 7 * 2^64 < 2^67
+ */
+static void smallfelem_square(longfelem out, const smallfelem small)
+	{
+	limb a;
+	u64 high, low;
+
+	a = ((uint128_t) small[0]) * small[0];
+	low = a;
+	high = a >> 64;
+	out[0] = low;
+	out[1] = high;
+
+	a = ((uint128_t) small[0]) * small[1];
+	low = a;
+	high = a >> 64;
+	out[1] += low;
+	out[1] += low;
+	out[2] = high;
+
+	a = ((uint128_t) small[0]) * small[2];
+	low = a;
+	high = a >> 64;
+	out[2] += low;
+	out[2] *= 2;
+	out[3] = high;
+
+	a = ((uint128_t) small[0]) * small[3];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[4] = high;
+
+	a = ((uint128_t) small[1]) * small[2];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[3] *= 2;
+	out[4] += high;
+
+	a = ((uint128_t) small[1]) * small[1];
+	low = a;
+	high = a >> 64;
+	out[2] += low;
+	out[3] += high;
+
+	a = ((uint128_t) small[1]) * small[3];
+	low = a;
+	high = a >> 64;
+	out[4] += low;
+	out[4] *= 2;
+	out[5] = high;
+
+	a = ((uint128_t) small[2]) * small[3];
+	low = a;
+	high = a >> 64;
+	out[5] += low;
+	out[5] *= 2;
+	out[6] = high;
+	out[6] += high;
+
+	a = ((uint128_t) small[2]) * small[2];
+	low = a;
+	high = a >> 64;
+	out[4] += low;
+	out[5] += high;
+
+	a = ((uint128_t) small[3]) * small[3];
+	low = a;
+	high = a >> 64;
+	out[6] += low;
+	out[7] = high;
+	}
+
+/* felem_square sets |out| = |in|^2
+ * On entry:
+ *   in[i] < 2^109
+ * On exit:
+ *   out[i] < 7 * 2^64 < 2^67
+ */
+static void felem_square(longfelem out, const felem in)
+	{
+	u64 small[4];
+	felem_shrink(small, in);
+	smallfelem_square(out, small);
+	}
+
+/* smallfelem_mul sets |out| = |small1| * |small2|
+ * On entry:
+ *   small1[i] < 2^64
+ *   small2[i] < 2^64
+ * On exit:
+ *   out[i] < 7 * 2^64 < 2^67
+ */
+static void smallfelem_mul(longfelem out, const smallfelem small1, const smallfelem small2)
+	{
+	limb a;
+	u64 high, low;
+
+	a = ((uint128_t) small1[0]) * small2[0];
+	low = a;
+	high = a >> 64;
+	out[0] = low;
+	out[1] = high;
+
+
+	a = ((uint128_t) small1[0]) * small2[1];
+	low = a;
+	high = a >> 64;
+	out[1] += low;
+	out[2] = high;
+
+	a = ((uint128_t) small1[1]) * small2[0];
+	low = a;
+	high = a >> 64;
+	out[1] += low;
+	out[2] += high;
+
+
+	a = ((uint128_t) small1[0]) * small2[2];
+	low = a;
+	high = a >> 64;
+	out[2] += low;
+	out[3] = high;
+
+	a = ((uint128_t) small1[1]) * small2[1];
+	low = a;
+	high = a >> 64;
+	out[2] += low;
+	out[3] += high;
+
+	a = ((uint128_t) small1[2]) * small2[0];
+	low = a;
+	high = a >> 64;
+	out[2] += low;
+	out[3] += high;
+
+
+	a = ((uint128_t) small1[0]) * small2[3];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[4] = high;
+
+	a = ((uint128_t) small1[1]) * small2[2];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[4] += high;
+
+	a = ((uint128_t) small1[2]) * small2[1];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[4] += high;
+
+	a = ((uint128_t) small1[3]) * small2[0];
+	low = a;
+	high = a >> 64;
+	out[3] += low;
+	out[4] += high;
+
+
+	a = ((uint128_t) small1[1]) * small2[3];
+	low = a;
+	high = a >> 64;
+	out[4] += low;
+	out[5] = high;
+
+	a = ((uint128_t) small1[2]) * small2[2];
+	low = a;
+	high = a >> 64;
+	out[4] += low;
+	out[5] += high;
+
+	a = ((uint128_t) small1[3]) * small2[1];
+	low = a;
+	high = a >> 64;
+	out[4] += low;
+	out[5] += high;
+
+
+	a = ((uint128_t) small1[2]) * small2[3];
+	low = a;
+	high = a >> 64;
+	out[5] += low;
+	out[6] = high;
+
+	a = ((uint128_t) small1[3]) * small2[2];
+	low = a;
+	high = a >> 64;
+	out[5] += low;
+	out[6] += high;
+
+
+	a = ((uint128_t) small1[3]) * small2[3];
+	low = a;
+	high = a >> 64;
+	out[6] += low;
+	out[7] = high;
+	}
+
+/* felem_mul sets |out| = |in1| * |in2|
+ * On entry:
+ *   in1[i] < 2^109
+ *   in2[i] < 2^109
+ * On exit:
+ *   out[i] < 7 * 2^64 < 2^67
+ */
+static void felem_mul(longfelem out, const felem in1, const felem in2)
+	{
+	smallfelem small1, small2;
+	felem_shrink(small1, in1);
+	felem_shrink(small2, in2);
+	smallfelem_mul(out, small1, small2);
+	}
+
+/* felem_small_mul sets |out| = |small1| * |in2|
+ * On entry:
+ *   small1[i] < 2^64
+ *   in2[i] < 2^109
+ * On exit:
+ *   out[i] < 7 * 2^64 < 2^67
+ */
+static void felem_small_mul(longfelem out, const smallfelem small1, const felem in2)
+	{
+	smallfelem small2;
+	felem_shrink(small2, in2);
+	smallfelem_mul(out, small1, small2);
+	}
+
+#define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4)
+#define two100 (((limb)1) << 100)
+#define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4)
+/* zero100 is 0 mod p */
+static const felem zero100 = { two100m36m4, two100, two100m36p4, two100m36p4 };
+
+/* Internal function for the different flavours of felem_reduce.
+ * felem_reduce_ reduces the higher coefficients in[4]-in[7].
+ * On entry:
+ *   out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7] 
+ *   out[1] >= in[7] + 2^32*in[4]
+ *   out[2] >= in[5] + 2^32*in[5]
+ *   out[3] >= in[4] + 2^32*in[5] + 2^32*in[6]
+ * On exit:
+ *   out[0] <= out[0] + in[4] + 2^32*in[5]
+ *   out[1] <= out[1] + in[5] + 2^33*in[6]
+ *   out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7]
+ *   out[3] <= out[3] + 2^32*in[4] + 3*in[7]
+ */
+static void felem_reduce_(felem out, const longfelem in)
+	{
+	int128_t c;
+	/* combine common terms from below */
+	c = in[4] + (in[5] << 32);
+	out[0] += c;
+	out[3] -= c;
+
+	c = in[5] - in[7];
+	out[1] += c;
+	out[2] -= c;
+
+	/* the remaining terms */
+	/* 256: [(0,1),(96,-1),(192,-1),(224,1)] */
+	out[1] -= (in[4] << 32);
+	out[3] += (in[4] << 32);
+
+	/* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */
+	out[2] -= (in[5] << 32);
+
+	/* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */
+	out[0] -= in[6];
+	out[0] -= (in[6] << 32);
+	out[1] += (in[6] << 33);
+	out[2] += (in[6] * 2);
+	out[3] -= (in[6] << 32);
+
+	/* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */
+	out[0] -= in[7];
+	out[0] -= (in[7] << 32);
+	out[2] += (in[7] << 33);
+	out[3] += (in[7] * 3);
+	}
+
+/* felem_reduce converts a longfelem into an felem.
+ * To be called directly after felem_square or felem_mul.
+ * On entry:
+ *   in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64
+ *   in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64
+ * On exit:
+ *   out[i] < 2^101
+ */
+static void felem_reduce(felem out, const longfelem in)
+	{
+	out[0] = zero100[0] + in[0];
+	out[1] = zero100[1] + in[1];
+	out[2] = zero100[2] + in[2];
+	out[3] = zero100[3] + in[3];
+
+	felem_reduce_(out, in);
+
+	/* out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0
+	 * out[1] > 2^100 - 2^64 - 7*2^96 > 0
+	 * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0
+	 * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0
+	 *
+	 * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101
+	 * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101
+	 * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101
+	 * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101
+	 */
+	}
+
+/* felem_reduce_zero105 converts a larger longfelem into an felem.
+ * On entry:
+ *   in[0] < 2^71
+ * On exit:
+ *   out[i] < 2^106
+ */
+static void felem_reduce_zero105(felem out, const longfelem in)
+	{
+	out[0] = zero105[0] + in[0];
+	out[1] = zero105[1] + in[1];
+	out[2] = zero105[2] + in[2];
+	out[3] = zero105[3] + in[3];
+
+	felem_reduce_(out, in);
+
+	/* out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0
+	 * out[1] > 2^105 - 2^71 - 2^103 > 0
+	 * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0
+	 * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0
+	 *
+	 * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
+	 * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106
+	 * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106
+	 * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106
+	 */
+	}
+
+/* subtract_u64 sets *result = *result - v and *carry to one if the subtraction
+ * underflowed. */
+static void subtract_u64(u64* result, u64* carry, u64 v)
+	{
+	uint128_t r = *result;
+	r -= v;
+	*carry = (r >> 64) & 1;
+	*result = (u64) r;
+	}
+
+/* felem_contract converts |in| to its unique, minimal representation.
+ * On entry:
+ *   in[i] < 2^109
+ */
+static void felem_contract(smallfelem out, const felem in)
+	{
+	unsigned i;
+	u64 all_equal_so_far = 0, result = 0, carry;
+
+	felem_shrink(out, in);
+	/* small is minimal except that the value might be > p */
+
+	all_equal_so_far--;
+	/* We are doing a constant time test if out >= kPrime. We need to
+	 * compare each u64, from most-significant to least significant. For
+	 * each one, if all words so far have been equal (m is all ones) then a
+	 * non-equal result is the answer. Otherwise we continue. */
+	for (i = 3; i < 4; i--)
+		{
+		u64 equal;
+		uint128_t a = ((uint128_t) kPrime[i]) - out[i];
+		/* if out[i] > kPrime[i] then a will underflow and the high
+		 * 64-bits will all be set. */
+		result |= all_equal_so_far & ((u64) (a >> 64));
+
+		/* if kPrime[i] == out[i] then |equal| will be all zeros and
+		 * the decrement will make it all ones. */
+		equal = kPrime[i] ^ out[i];
+		equal--;
+		equal &= equal << 32;
+		equal &= equal << 16;
+		equal &= equal << 8;
+		equal &= equal << 4;
+		equal &= equal << 2;
+		equal &= equal << 1;
+		equal = ((s64) equal) >> 63;
+
+		all_equal_so_far &= equal;
+		}
+
+	/* if all_equal_so_far is still all ones then the two values are equal
+	 * and so out >= kPrime is true. */
+	result |= all_equal_so_far;
+
+	/* if out >= kPrime then we subtract kPrime. */
+	subtract_u64(&out[0], &carry, result & kPrime[0]);
+	subtract_u64(&out[1], &carry, carry);
+	subtract_u64(&out[2], &carry, carry);
+	subtract_u64(&out[3], &carry, carry);
+
+	subtract_u64(&out[1], &carry, result & kPrime[1]);
+	subtract_u64(&out[2], &carry, carry);
+	subtract_u64(&out[3], &carry, carry);
+
+	subtract_u64(&out[2], &carry, result & kPrime[2]);
+	subtract_u64(&out[3], &carry, carry);
+
+	subtract_u64(&out[3], &carry, result & kPrime[3]);
+	}
+
+static void smallfelem_square_contract(smallfelem out, const smallfelem in)
+	{
+	longfelem longtmp;
+	felem tmp;
+
+	smallfelem_square(longtmp, in);
+	felem_reduce(tmp, longtmp);
+	felem_contract(out, tmp);
+	}
+
+static void smallfelem_mul_contract(smallfelem out, const smallfelem in1, const smallfelem in2)
+	{
+	longfelem longtmp;
+	felem tmp;
+
+	smallfelem_mul(longtmp, in1, in2);
+	felem_reduce(tmp, longtmp);
+	felem_contract(out, tmp);
+	}
+
+/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
+ * otherwise.
+ * On entry:
+ *   small[i] < 2^64
+ */
+static limb smallfelem_is_zero(const smallfelem small)
+	{
+	limb result;
+	u64 is_p;
+
+	u64 is_zero = small[0] | small[1] | small[2] | small[3];
+	is_zero--;
+	is_zero &= is_zero << 32;
+	is_zero &= is_zero << 16;
+	is_zero &= is_zero << 8;
+	is_zero &= is_zero << 4;
+	is_zero &= is_zero << 2;
+	is_zero &= is_zero << 1;
+	is_zero = ((s64) is_zero) >> 63;
+
+	is_p = (small[0] ^ kPrime[0]) |
+	       (small[1] ^ kPrime[1]) |
+	       (small[2] ^ kPrime[2]) |
+	       (small[3] ^ kPrime[3]);
+	is_p--;
+	is_p &= is_p << 32;
+	is_p &= is_p << 16;
+	is_p &= is_p << 8;
+	is_p &= is_p << 4;
+	is_p &= is_p << 2;
+	is_p &= is_p << 1;
+	is_p = ((s64) is_p) >> 63;
+
+	is_zero |= is_p;
+
+	result = is_zero;
+	result |= ((limb) is_zero) << 64;
+	return result;
+	}
+
+static int smallfelem_is_zero_int(const smallfelem small)
+	{
+	return (int) (smallfelem_is_zero(small) & ((limb)1));
+	}
+
+/* felem_inv calculates |out| = |in|^{-1}
+ *
+ * Based on Fermat's Little Theorem:
+ *   a^p = a (mod p)
+ *   a^{p-1} = 1 (mod p)
+ *   a^{p-2} = a^{-1} (mod p)
+ */
+static void felem_inv(felem out, const felem in)
+	{
+	felem ftmp, ftmp2;
+	/* each e_I will hold |in|^{2^I - 1} */
+	felem e2, e4, e8, e16, e32, e64;
+	longfelem tmp;
+	unsigned i;
+
+	felem_square(tmp, in); felem_reduce(ftmp, tmp);			/* 2^1 */
+	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);		/* 2^2 - 2^0 */
+	felem_assign(e2, ftmp);
+	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^3 - 2^1 */
+	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^4 - 2^2 */
+	felem_mul(tmp, ftmp, e2); felem_reduce(ftmp, tmp);		/* 2^4 - 2^0 */
+	felem_assign(e4, ftmp);
+	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^5 - 2^1 */
+	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^6 - 2^2 */
+	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^7 - 2^3 */
+	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);		/* 2^8 - 2^4 */
+	felem_mul(tmp, ftmp, e4); felem_reduce(ftmp, tmp);		/* 2^8 - 2^0 */
+	felem_assign(e8, ftmp);
+	for (i = 0; i < 8; i++) {
+		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
+	}								/* 2^16 - 2^8 */
+	felem_mul(tmp, ftmp, e8); felem_reduce(ftmp, tmp);		/* 2^16 - 2^0 */
+	felem_assign(e16, ftmp);
+	for (i = 0; i < 16; i++) {
+		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
+	}								/* 2^32 - 2^16 */
+	felem_mul(tmp, ftmp, e16); felem_reduce(ftmp, tmp);		/* 2^32 - 2^0 */
+	felem_assign(e32, ftmp);
+	for (i = 0; i < 32; i++) {
+		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
+	}								/* 2^64 - 2^32 */
+	felem_assign(e64, ftmp);
+	felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp);		/* 2^64 - 2^32 + 2^0 */
+	for (i = 0; i < 192; i++) {
+		felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);
+	}								/* 2^256 - 2^224 + 2^192 */
+
+	felem_mul(tmp, e64, e32); felem_reduce(ftmp2, tmp);		/* 2^64 - 2^0 */
+	for (i = 0; i < 16; i++) {
+		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
+	}								/* 2^80 - 2^16 */
+	felem_mul(tmp, ftmp2, e16); felem_reduce(ftmp2, tmp);		/* 2^80 - 2^0 */
+	for (i = 0; i < 8; i++) {
+		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
+	}								/* 2^88 - 2^8 */
+	felem_mul(tmp, ftmp2, e8); felem_reduce(ftmp2, tmp);		/* 2^88 - 2^0 */
+	for (i = 0; i < 4; i++) {
+		felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);
+	}								/* 2^92 - 2^4 */
+	felem_mul(tmp, ftmp2, e4); felem_reduce(ftmp2, tmp);		/* 2^92 - 2^0 */
+	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^93 - 2^1 */
+	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^94 - 2^2 */
+	felem_mul(tmp, ftmp2, e2); felem_reduce(ftmp2, tmp);		/* 2^94 - 2^0 */
+	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^95 - 2^1 */
+	felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp);		/* 2^96 - 2^2 */
+	felem_mul(tmp, ftmp2, in); felem_reduce(ftmp2, tmp);		/* 2^96 - 3 */
+
+	felem_mul(tmp, ftmp2, ftmp); felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */
+	}
+
+static void smallfelem_inv_contract(smallfelem out, const smallfelem in)
+	{
+	felem tmp;
+
+	smallfelem_expand(tmp, in);
+	felem_inv(tmp, tmp);
+	felem_contract(out, tmp);
+	}
+
+/* Group operations
+ * ----------------
+ *
+ * Building on top of the field operations we have the operations on the
+ * elliptic curve group itself. Points on the curve are represented in Jacobian
+ * coordinates */
+
+/* point_double calculates 2*(x_in, y_in, z_in)
+ *
+ * The method is taken from:
+ *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
+ *
+ * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
+ * while x_out == y_in is not (maybe this works, but it's not tested). */
+static void
+point_double(felem x_out, felem y_out, felem z_out,
+	     const felem x_in, const felem y_in, const felem z_in)
+	{
+	longfelem tmp, tmp2;
+	felem delta, gamma, beta, alpha, ftmp, ftmp2;
+	smallfelem small1, small2;
+
+	felem_assign(ftmp, x_in);
+	/* ftmp[i] < 2^106 */
+	felem_assign(ftmp2, x_in);
+	/* ftmp2[i] < 2^106 */
+
+	/* delta = z^2 */
+	felem_square(tmp, z_in);
+	felem_reduce(delta, tmp);
+	/* delta[i] < 2^101 */
+
+	/* gamma = y^2 */
+	felem_square(tmp, y_in);
+	felem_reduce(gamma, tmp);
+	/* gamma[i] < 2^101 */
+	felem_shrink(small1, gamma);
+
+	/* beta = x*gamma */
+	felem_small_mul(tmp, small1, x_in);
+	felem_reduce(beta, tmp);
+	/* beta[i] < 2^101 */
+
+	/* alpha = 3*(x-delta)*(x+delta) */
+	felem_diff(ftmp, delta);
+	/* ftmp[i] < 2^105 + 2^106 < 2^107 */
+	felem_sum(ftmp2, delta);
+	/* ftmp2[i] < 2^105 + 2^106 < 2^107 */
+	felem_scalar(ftmp2, 3);
+	/* ftmp2[i] < 3 * 2^107 < 2^109 */
+	felem_mul(tmp, ftmp, ftmp2);
+	felem_reduce(alpha, tmp);
+	/* alpha[i] < 2^101 */
+	felem_shrink(small2, alpha);
+
+	/* x' = alpha^2 - 8*beta */
+	smallfelem_square(tmp, small2);
+	felem_reduce(x_out, tmp);
+	felem_assign(ftmp, beta);
+	felem_scalar(ftmp, 8);
+	/* ftmp[i] < 8 * 2^101 = 2^104 */
+	felem_diff(x_out, ftmp);
+	/* x_out[i] < 2^105 + 2^101 < 2^106 */
+
+	/* z' = (y + z)^2 - gamma - delta */
+	felem_sum(delta, gamma);
+	/* delta[i] < 2^101 + 2^101 = 2^102 */
+	felem_assign(ftmp, y_in);
+	felem_sum(ftmp, z_in);
+	/* ftmp[i] < 2^106 + 2^106 = 2^107 */
+	felem_square(tmp, ftmp);
+	felem_reduce(z_out, tmp);
+	felem_diff(z_out, delta);
+	/* z_out[i] < 2^105 + 2^101 < 2^106 */
+
+	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
+	felem_scalar(beta, 4);
+	/* beta[i] < 4 * 2^101 = 2^103 */
+	felem_diff_zero107(beta, x_out);
+	/* beta[i] < 2^107 + 2^103 < 2^108 */
+	felem_small_mul(tmp, small2, beta);
+	/* tmp[i] < 7 * 2^64 < 2^67 */
+	smallfelem_square(tmp2, small1);
+	/* tmp2[i] < 7 * 2^64 */
+	longfelem_scalar(tmp2, 8);
+	/* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */
+	longfelem_diff(tmp, tmp2);
+	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
+	felem_reduce_zero105(y_out, tmp);
+	/* y_out[i] < 2^106 */
+	}
+
+/* point_double_small is the same as point_double, except that it operates on
+ * smallfelems */
+static void
+point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out,
+		   const smallfelem x_in, const smallfelem y_in, const smallfelem z_in)
+	{
+	felem felem_x_out, felem_y_out, felem_z_out;
+	felem felem_x_in, felem_y_in, felem_z_in;
+
+	smallfelem_expand(felem_x_in, x_in);
+	smallfelem_expand(felem_y_in, y_in);
+	smallfelem_expand(felem_z_in, z_in);
+	point_double(felem_x_out, felem_y_out, felem_z_out,
+		     felem_x_in, felem_y_in, felem_z_in);
+	felem_shrink(x_out, felem_x_out);
+	felem_shrink(y_out, felem_y_out);
+	felem_shrink(z_out, felem_z_out);
+	}
+
+/* copy_conditional copies in to out iff mask is all ones. */
+static void
+copy_conditional(felem out, const felem in, limb mask)
+	{
+	unsigned i;
+	for (i = 0; i < NLIMBS; ++i)
+		{
+		const limb tmp = mask & (in[i] ^ out[i]);
+		out[i] ^= tmp;
+		}
+	}
+
+/* copy_small_conditional copies in to out iff mask is all ones. */
+static void
+copy_small_conditional(felem out, const smallfelem in, limb mask)
+	{
+	unsigned i;
+	const u64 mask64 = mask;
+	for (i = 0; i < NLIMBS; ++i)
+		{
+		out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask);
+		}
+	}
+
+/* point_add calcuates (x1, y1, z1) + (x2, y2, z2)
+ *
+ * The method is taken from:
+ *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
+ * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
+ *
+ * This function includes a branch for checking whether the two input points
+ * are equal, (while not equal to the point at infinity). This case never
+ * happens during single point multiplication, so there is no timing leak for
+ * ECDH or ECDSA signing. */
+static void point_add(felem x3, felem y3, felem z3,
+	const felem x1, const felem y1, const felem z1,
+	const int mixed, const smallfelem x2, const smallfelem y2, const smallfelem z2)
+	{
+	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
+	longfelem tmp, tmp2;
+	smallfelem small1, small2, small3, small4, small5;
+	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
+
+	felem_shrink(small3, z1);
+
+	z1_is_zero = smallfelem_is_zero(small3);
+	z2_is_zero = smallfelem_is_zero(z2);
+
+	/* ftmp = z1z1 = z1**2 */
+	smallfelem_square(tmp, small3);
+	felem_reduce(ftmp, tmp);
+	/* ftmp[i] < 2^101 */
+	felem_shrink(small1, ftmp);
+
+	if(!mixed)
+		{
+		/* ftmp2 = z2z2 = z2**2 */
+		smallfelem_square(tmp, z2);
+		felem_reduce(ftmp2, tmp);
+		/* ftmp2[i] < 2^101 */
+		felem_shrink(small2, ftmp2);
+
+		felem_shrink(small5, x1);
+
+		/* u1 = ftmp3 = x1*z2z2 */
+		smallfelem_mul(tmp, small5, small2);
+		felem_reduce(ftmp3, tmp);
+		/* ftmp3[i] < 2^101 */
+
+		/* ftmp5 = z1 + z2 */
+		felem_assign(ftmp5, z1);
+		felem_small_sum(ftmp5, z2);
+		/* ftmp5[i] < 2^107 */
+
+		/* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */
+		felem_square(tmp, ftmp5);
+		felem_reduce(ftmp5, tmp);
+		/* ftmp2 = z2z2 + z1z1 */
+		felem_sum(ftmp2, ftmp);
+		/* ftmp2[i] < 2^101 + 2^101 = 2^102 */
+		felem_diff(ftmp5, ftmp2);
+		/* ftmp5[i] < 2^105 + 2^101 < 2^106 */
+
+		/* ftmp2 = z2 * z2z2 */
+		smallfelem_mul(tmp, small2, z2);
+		felem_reduce(ftmp2, tmp);
+
+		/* s1 = ftmp2 = y1 * z2**3 */
+		felem_mul(tmp, y1, ftmp2);
+		felem_reduce(ftmp6, tmp);
+		/* ftmp6[i] < 2^101 */
+		}
+	else
+		{
+		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
+
+		/* u1 = ftmp3 = x1*z2z2 */
+		felem_assign(ftmp3, x1);
+		/* ftmp3[i] < 2^106 */
+
+		/* ftmp5 = 2z1z2 */
+		felem_assign(ftmp5, z1);
+		felem_scalar(ftmp5, 2);
+		/* ftmp5[i] < 2*2^106 = 2^107 */
+
+		/* s1 = ftmp2 = y1 * z2**3 */
+		felem_assign(ftmp6, y1);
+		/* ftmp6[i] < 2^106 */
+		}
+
+	/* u2 = x2*z1z1 */
+	smallfelem_mul(tmp, x2, small1);
+	felem_reduce(ftmp4, tmp);
+
+	/* h = ftmp4 = u2 - u1 */
+	felem_diff_zero107(ftmp4, ftmp3);
+	/* ftmp4[i] < 2^107 + 2^101 < 2^108 */
+	felem_shrink(small4, ftmp4);
+
+	x_equal = smallfelem_is_zero(small4);
+
+	/* z_out = ftmp5 * h */
+	felem_small_mul(tmp, small4, ftmp5);
+	felem_reduce(z_out, tmp);
+	/* z_out[i] < 2^101 */
+
+	/* ftmp = z1 * z1z1 */
+	smallfelem_mul(tmp, small1, small3);
+	felem_reduce(ftmp, tmp);
+
+	/* s2 = tmp = y2 * z1**3 */
+	felem_small_mul(tmp, y2, ftmp);
+	felem_reduce(ftmp5, tmp);
+
+	/* r = ftmp5 = (s2 - s1)*2 */
+	felem_diff_zero107(ftmp5, ftmp6);
+	/* ftmp5[i] < 2^107 + 2^107 = 2^108*/
+	felem_scalar(ftmp5, 2);
+	/* ftmp5[i] < 2^109 */
+	felem_shrink(small1, ftmp5);
+	y_equal = smallfelem_is_zero(small1);
+
+	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
+		{
+		point_double(x3, y3, z3, x1, y1, z1);
+		return;
+		}
+
+	/* I = ftmp = (2h)**2 */
+	felem_assign(ftmp, ftmp4);
+	felem_scalar(ftmp, 2);
+	/* ftmp[i] < 2*2^108 = 2^109 */
+	felem_square(tmp, ftmp);
+	felem_reduce(ftmp, tmp);
+
+	/* J = ftmp2 = h * I */
+	felem_mul(tmp, ftmp4, ftmp);
+	felem_reduce(ftmp2, tmp);
+
+	/* V = ftmp4 = U1 * I */
+	felem_mul(tmp, ftmp3, ftmp);
+	felem_reduce(ftmp4, tmp);
+
+	/* x_out = r**2 - J - 2V */
+	smallfelem_square(tmp, small1);
+	felem_reduce(x_out, tmp);
+	felem_assign(ftmp3, ftmp4);
+	felem_scalar(ftmp4, 2);
+	felem_sum(ftmp4, ftmp2);
+	/* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */
+	felem_diff(x_out, ftmp4);
+	/* x_out[i] < 2^105 + 2^101 */
+
+	/* y_out = r(V-x_out) - 2 * s1 * J */
+	felem_diff_zero107(ftmp3, x_out);
+	/* ftmp3[i] < 2^107 + 2^101 < 2^108 */
+	felem_small_mul(tmp, small1, ftmp3);
+	felem_mul(tmp2, ftmp6, ftmp2);
+	longfelem_scalar(tmp2, 2);
+	/* tmp2[i] < 2*2^67 = 2^68 */
+	longfelem_diff(tmp, tmp2);
+	/* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */
+	felem_reduce_zero105(y_out, tmp);
+	/* y_out[i] < 2^106 */
+
+	copy_small_conditional(x_out, x2, z1_is_zero);
+	copy_conditional(x_out, x1, z2_is_zero);
+	copy_small_conditional(y_out, y2, z1_is_zero);
+	copy_conditional(y_out, y1, z2_is_zero);
+	copy_small_conditional(z_out, z2, z1_is_zero);
+	copy_conditional(z_out, z1, z2_is_zero);
+	felem_assign(x3, x_out);
+	felem_assign(y3, y_out);
+	felem_assign(z3, z_out);
+	}
+
+/* point_add_small is the same as point_add, except that it operates on
+ * smallfelems */
+static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3,
+			    smallfelem x1, smallfelem y1, smallfelem z1,
+			    smallfelem x2, smallfelem y2, smallfelem z2)
+	{
+	felem felem_x3, felem_y3, felem_z3;
+	felem felem_x1, felem_y1, felem_z1;
+	smallfelem_expand(felem_x1, x1);
+	smallfelem_expand(felem_y1, y1);
+	smallfelem_expand(felem_z1, z1);
+	point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0, x2, y2, z2);
+	felem_shrink(x3, felem_x3);
+	felem_shrink(y3, felem_y3);
+	felem_shrink(z3, felem_z3);
+	}
+
+/* Base point pre computation
+ * --------------------------
+ *
+ * Two different sorts of precomputed tables are used in the following code.
+ * Each contain various points on the curve, where each point is three field
+ * elements (x, y, z).
+ *
+ * For the base point table, z is usually 1 (0 for the point at infinity).
+ * This table has 2 * 16 elements, starting with the following:
+ * index | bits    | point
+ * ------+---------+------------------------------
+ *     0 | 0 0 0 0 | 0G
+ *     1 | 0 0 0 1 | 1G
+ *     2 | 0 0 1 0 | 2^64G
+ *     3 | 0 0 1 1 | (2^64 + 1)G
+ *     4 | 0 1 0 0 | 2^128G
+ *     5 | 0 1 0 1 | (2^128 + 1)G
+ *     6 | 0 1 1 0 | (2^128 + 2^64)G
+ *     7 | 0 1 1 1 | (2^128 + 2^64 + 1)G
+ *     8 | 1 0 0 0 | 2^192G
+ *     9 | 1 0 0 1 | (2^192 + 1)G
+ *    10 | 1 0 1 0 | (2^192 + 2^64)G
+ *    11 | 1 0 1 1 | (2^192 + 2^64 + 1)G
+ *    12 | 1 1 0 0 | (2^192 + 2^128)G
+ *    13 | 1 1 0 1 | (2^192 + 2^128 + 1)G
+ *    14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G
+ *    15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G
+ * followed by a copy of this with each element multiplied by 2^32.
+ *
+ * The reason for this is so that we can clock bits into four different
+ * locations when doing simple scalar multiplies against the base point,
+ * and then another four locations using the second 16 elements.
+ *
+ * Tables for other points have table[i] = iG for i in 0 .. 16. */
+
+/* gmul is the table of precomputed base points */
+static const smallfelem gmul[2][16][3] =
+{{{{0, 0, 0, 0},
+   {0, 0, 0, 0},
+   {0, 0, 0, 0}},
+  {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2, 0x6b17d1f2e12c4247},
+   {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16, 0x4fe342e2fe1a7f9b},
+   {1, 0, 0, 0}},
+  {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de, 0x0fa822bc2811aaa5},
+   {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b, 0xbff44ae8f5dba80d},
+   {1, 0, 0, 0}},
+  {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789, 0x300a4bbc89d6726f},
+   {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f, 0x72aac7e0d09b4644},
+   {1, 0, 0, 0}},
+  {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e, 0x447d739beedb5e67},
+   {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7, 0x2d4825ab834131ee},
+   {1, 0, 0, 0}},
+  {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60, 0xef9519328a9c72ff},
+   {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c, 0x611e9fc37dbb2c9b},
+   {1, 0, 0, 0}},
+  {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf, 0x550663797b51f5d8},
+   {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5, 0x157164848aecb851},
+   {1, 0, 0, 0}},
+  {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391, 0xeb5d7745b21141ea},
+   {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee, 0xeafd72ebdbecc17b},
+   {1, 0, 0, 0}},
+  {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5, 0xa6d39677a7849276},
+   {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf, 0x674f84749b0b8816},
+   {1, 0, 0, 0}},
+  {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb, 0x4e769e7672c9ddad},
+   {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281, 0x42b99082de830663},
+   {1, 0, 0, 0}},
+  {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478, 0x78878ef61c6ce04d},
+   {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def, 0xb6cb3f5d7b72c321},
+   {1, 0, 0, 0}},
+  {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae, 0x0c88bc4d716b1287},
+   {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa, 0xdd5ddea3f3901dc6},
+   {1, 0, 0, 0}},
+  {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3, 0x68f344af6b317466},
+   {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3, 0x31b9c405f8540a20},
+   {1, 0, 0, 0}},
+  {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0, 0x4052bf4b6f461db9},
+   {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8, 0xfecf4d5190b0fc61},
+   {1, 0, 0, 0}},
+  {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a, 0x1eddbae2c802e41a},
+   {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0, 0x43104d86560ebcfc},
+   {1, 0, 0, 0}},
+  {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a, 0xb48e26b484f7a21c},
+   {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668, 0xfac015404d4d3dab},
+   {1, 0, 0, 0}}},
+ {{{0, 0, 0, 0},
+   {0, 0, 0, 0},
+   {0, 0, 0, 0}},
+  {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da, 0x7fe36b40af22af89},
+   {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1, 0xe697d45825b63624},
+   {1, 0, 0, 0}},
+  {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902, 0x4a5b506612a677a6},
+   {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40, 0xeb13461ceac089f1},
+   {1, 0, 0, 0}},
+  {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857, 0x0781b8291c6a220a},
+   {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434, 0x690cde8df0151593},
+   {1, 0, 0, 0}},
+  {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326, 0x8a535f566ec73617},
+   {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf, 0x0455c08468b08bd7},
+   {1, 0, 0, 0}},
+  {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279, 0x06bada7ab77f8276},
+   {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70, 0x5b476dfd0e6cb18a},
+   {1, 0, 0, 0}},
+  {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8, 0x3e29864e8a2ec908},
+   {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed, 0x239b90ea3dc31e7e},
+   {1, 0, 0, 0}},
+  {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4, 0x820f4dd949f72ff7},
+   {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3, 0x140406ec783a05ec},
+   {1, 0, 0, 0}},
+  {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe, 0x68f6b8542783dfee},
+   {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028, 0xcbe1feba92e40ce6},
+   {1, 0, 0, 0}},
+  {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927, 0xd0b2f94d2f420109},
+   {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a, 0x971459828b0719e5},
+   {1, 0, 0, 0}},
+  {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687, 0x961610004a866aba},
+   {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c, 0x7acb9fadcee75e44},
+   {1, 0, 0, 0}},
+  {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea, 0x24eb9acca333bf5b},
+   {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d, 0x69f891c5acd079cc},
+   {1, 0, 0, 0}},
+  {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514, 0xe51f547c5972a107},
+   {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06, 0x1c309a2b25bb1387},
+   {1, 0, 0, 0}},
+  {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828, 0x20b87b8aa2c4e503},
+   {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044, 0xf5c6fa49919776be},
+   {1, 0, 0, 0}},
+  {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56, 0x1ed7d1b9332010b9},
+   {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24, 0x3a2b03f03217257a},
+   {1, 0, 0, 0}},
+  {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b, 0x15fee545c78dd9f6},
+   {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb, 0x4ab5b6b2b8753f81},
+   {1, 0, 0, 0}}}};
+
+/* select_point selects the |idx|th point from a precomputation table and
+ * copies it to out. */
+static void select_point(const u64 idx, unsigned int size, const smallfelem pre_comp[16][3], smallfelem out[3])
+	{
+	unsigned i, j;
+	u64 *outlimbs = &out[0][0];
+	memset(outlimbs, 0, 3 * sizeof(smallfelem));
+
+	for (i = 0; i < size; i++)
+		{
+		const u64 *inlimbs = (u64*) &pre_comp[i][0][0];
+		u64 mask = i ^ idx;
+		mask |= mask >> 4;
+		mask |= mask >> 2;
+		mask |= mask >> 1;
+		mask &= 1;
+		mask--;
+		for (j = 0; j < NLIMBS * 3; j++)
+			outlimbs[j] |= inlimbs[j] & mask;
+		}
+	}
+
+/* get_bit returns the |i|th bit in |in| */
+static char get_bit(const felem_bytearray in, int i)
+	{
+	if ((i < 0) || (i >= 256))
+		return 0;
+	return (in[i >> 3] >> (i & 7)) & 1;
+	}
+
+/* Interleaved point multiplication using precomputed point multiples:
+ * The small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[],
+ * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
+ * of the generator, using certain (large) precomputed multiples in g_pre_comp.
+ * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
+static void batch_mul(felem x_out, felem y_out, felem z_out,
+	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
+	const int mixed, const smallfelem pre_comp[][17][3], const smallfelem g_pre_comp[2][16][3])
+	{
+	int i, skip;
+	unsigned num, gen_mul = (g_scalar != NULL);
+	felem nq[3], ftmp;
+	smallfelem tmp[3];
+	u64 bits;
+	u8 sign, digit;
+
+	/* set nq to the point at infinity */
+	memset(nq, 0, 3 * sizeof(felem));
+
+	/* Loop over all scalars msb-to-lsb, interleaving additions
+	 * of multiples of the generator (two in each of the last 32 rounds)
+	 * and additions of other points multiples (every 5th round).
+	 */
+	skip = 1; /* save two point operations in the first round */
+	for (i = (num_points ? 255 : 31); i >= 0; --i)
+		{
+		/* double */
+		if (!skip)
+			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
+
+		/* add multiples of the generator */
+		if (gen_mul && (i <= 31))
+			{
+			/* first, look 32 bits upwards */
+			bits = get_bit(g_scalar, i + 224) << 3;
+			bits |= get_bit(g_scalar, i + 160) << 2;
+			bits |= get_bit(g_scalar, i + 96) << 1;
+			bits |= get_bit(g_scalar, i + 32);
+			/* select the point to add, in constant time */
+			select_point(bits, 16, g_pre_comp[1], tmp);
+
+			if (!skip)
+				{
+				point_add(nq[0], nq[1], nq[2],
+					nq[0], nq[1], nq[2],
+					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
+				}
+			else
+				{
+				smallfelem_expand(nq[0], tmp[0]);
+				smallfelem_expand(nq[1], tmp[1]);
+				smallfelem_expand(nq[2], tmp[2]);
+				skip = 0;
+				}
+
+			/* second, look at the current position */
+			bits = get_bit(g_scalar, i + 192) << 3;
+			bits |= get_bit(g_scalar, i + 128) << 2;
+			bits |= get_bit(g_scalar, i + 64) << 1;
+			bits |= get_bit(g_scalar, i);
+			/* select the point to add, in constant time */
+			select_point(bits, 16, g_pre_comp[0], tmp);
+			point_add(nq[0], nq[1], nq[2],
+				nq[0], nq[1], nq[2],
+				1 /* mixed */, tmp[0], tmp[1], tmp[2]);
+			}
+
+		/* do other additions every 5 doublings */
+		if (num_points && (i % 5 == 0))
+			{
+			/* loop over all scalars */
+			for (num = 0; num < num_points; ++num)
+				{
+				bits = get_bit(scalars[num], i + 4) << 5;
+				bits |= get_bit(scalars[num], i + 3) << 4;
+				bits |= get_bit(scalars[num], i + 2) << 3;
+				bits |= get_bit(scalars[num], i + 1) << 2;
+				bits |= get_bit(scalars[num], i) << 1;
+				bits |= get_bit(scalars[num], i - 1);
+				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
+
+				/* select the point to add or subtract, in constant time */
+				select_point(digit, 17, pre_comp[num], tmp);
+				smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative point */
+				copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1));
+				felem_contract(tmp[1], ftmp);
+
+				if (!skip)
+					{
+					point_add(nq[0], nq[1], nq[2],
+						nq[0], nq[1], nq[2],
+						mixed, tmp[0], tmp[1], tmp[2]);
+					}
+				else
+					{
+					smallfelem_expand(nq[0], tmp[0]);
+					smallfelem_expand(nq[1], tmp[1]);
+					smallfelem_expand(nq[2], tmp[2]);
+					skip = 0;
+					}
+				}
+			}
+		}
+	felem_assign(x_out, nq[0]);
+	felem_assign(y_out, nq[1]);
+	felem_assign(z_out, nq[2]);
+	}
+
+/* Precomputation for the group generator. */
+typedef struct {
+	smallfelem g_pre_comp[2][16][3];
+	int references;
+} NISTP256_PRE_COMP;
+
+const EC_METHOD *EC_GFp_nistp256_method(void)
+	{
+	static const EC_METHOD ret = {
+		EC_FLAGS_DEFAULT_OCT,
+		NID_X9_62_prime_field,
+		ec_GFp_nistp256_group_init,
+		ec_GFp_simple_group_finish,
+		ec_GFp_simple_group_clear_finish,
+		ec_GFp_nist_group_copy,
+		ec_GFp_nistp256_group_set_curve,
+		ec_GFp_simple_group_get_curve,
+		ec_GFp_simple_group_get_degree,
+		ec_GFp_simple_group_check_discriminant,
+		ec_GFp_simple_point_init,
+		ec_GFp_simple_point_finish,
+		ec_GFp_simple_point_clear_finish,
+		ec_GFp_simple_point_copy,
+		ec_GFp_simple_point_set_to_infinity,
+		ec_GFp_simple_set_Jprojective_coordinates_GFp,
+		ec_GFp_simple_get_Jprojective_coordinates_GFp,
+		ec_GFp_simple_point_set_affine_coordinates,
+		ec_GFp_nistp256_point_get_affine_coordinates,
+		0 /* point_set_compressed_coordinates */,
+		0 /* point2oct */,
+		0 /* oct2point */,
+		ec_GFp_simple_add,
+		ec_GFp_simple_dbl,
+		ec_GFp_simple_invert,
+		ec_GFp_simple_is_at_infinity,
+		ec_GFp_simple_is_on_curve,
+		ec_GFp_simple_cmp,
+		ec_GFp_simple_make_affine,
+		ec_GFp_simple_points_make_affine,
+		ec_GFp_nistp256_points_mul,
+		ec_GFp_nistp256_precompute_mult,
+		ec_GFp_nistp256_have_precompute_mult,
+		ec_GFp_nist_field_mul,
+		ec_GFp_nist_field_sqr,
+		0 /* field_div */,
+		0 /* field_encode */,
+		0 /* field_decode */,
+		0 /* field_set_to_one */ };
+
+	return &ret;
+	}
+
+/******************************************************************************/
+/*		       FUNCTIONS TO MANAGE PRECOMPUTATION
+ */
+
+static NISTP256_PRE_COMP *nistp256_pre_comp_new()
+	{
+	NISTP256_PRE_COMP *ret = NULL;
+	ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret);
+	if (!ret)
+		{
+		ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
+		return ret;
+		}
+	memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
+	ret->references = 1;
+	return ret;
+	}
+
+static void *nistp256_pre_comp_dup(void *src_)
+	{
+	NISTP256_PRE_COMP *src = src_;
+
+	/* no need to actually copy, these objects never change! */
+	CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
+
+	return src_;
+	}
+
+static void nistp256_pre_comp_free(void *pre_)
+	{
+	int i;
+	NISTP256_PRE_COMP *pre = pre_;
+
+	if (!pre)
+		return;
+
+	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
+	if (i > 0)
+		return;
+
+	OPENSSL_free(pre);
+	}
+
+static void nistp256_pre_comp_clear_free(void *pre_)
+	{
+	int i;
+	NISTP256_PRE_COMP *pre = pre_;
+
+	if (!pre)
+		return;
+
+	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
+	if (i > 0)
+		return;
+
+	OPENSSL_cleanse(pre, sizeof *pre);
+	OPENSSL_free(pre);
+	}
+
+/******************************************************************************/
+/*			   OPENSSL EC_METHOD FUNCTIONS
+ */
+
+int ec_GFp_nistp256_group_init(EC_GROUP *group)
+	{
+	int ret;
+	ret = ec_GFp_simple_group_init(group);
+	group->a_is_minus3 = 1;
+	return ret;
+	}
+
+int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p,
+	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
+	{
+	int ret = 0;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *curve_p, *curve_a, *curve_b;
+
+	if (ctx == NULL)
+		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
+	BN_CTX_start(ctx);
+	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
+		((curve_a = BN_CTX_get(ctx)) == NULL) ||
+		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
+	BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p);
+	BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a);
+	BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b);
+	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
+		(BN_cmp(curve_b, b)))
+		{
+		ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE,
+			EC_R_WRONG_CURVE_PARAMETERS);
+		goto err;
+		}
+	group->field_mod_func = BN_nist_mod_256;
+	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
+err:
+	BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+	}
+
+/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
+ * (X', Y') = (X/Z^2, Y/Z^3) */
+int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group,
+	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
+	{
+	felem z1, z2, x_in, y_in;
+	smallfelem x_out, y_out;
+	longfelem tmp;
+
+	if (EC_POINT_is_at_infinity(group, point))
+		{
+		ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
+			EC_R_POINT_AT_INFINITY);
+		return 0;
+		}
+	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
+		(!BN_to_felem(z1, &point->Z))) return 0;
+	felem_inv(z2, z1);
+	felem_square(tmp, z2); felem_reduce(z1, tmp);
+	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
+	felem_contract(x_out, x_in);
+	if (x != NULL)
+		{
+		if (!smallfelem_to_BN(x, x_out)) {
+		ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
+			ERR_R_BN_LIB);
+		return 0;
+		}
+		}
+	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
+	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
+	felem_contract(y_out, y_in);
+	if (y != NULL)
+		{
+		if (!smallfelem_to_BN(y, y_out))
+			{
+			ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES,
+				ERR_R_BN_LIB);
+			return 0;
+			}
+		}
+	return 1;
+	}
+
+static void make_points_affine(size_t num, smallfelem points[/* num */][3], smallfelem tmp_smallfelems[/* num+1 */])
+	{
+	/* Runs in constant time, unless an input is the point at infinity
+	 * (which normally shouldn't happen). */
+	ec_GFp_nistp_points_make_affine_internal(
+		num,
+		points,
+		sizeof(smallfelem),
+		tmp_smallfelems,
+		(void (*)(void *)) smallfelem_one,
+		(int (*)(const void *)) smallfelem_is_zero_int,
+		(void (*)(void *, const void *)) smallfelem_assign,
+		(void (*)(void *, const void *)) smallfelem_square_contract,
+		(void (*)(void *, const void *, const void *)) smallfelem_mul_contract,
+		(void (*)(void *, const void *)) smallfelem_inv_contract,
+		(void (*)(void *, const void *)) smallfelem_assign /* nothing to contract */);
+	}
+
+/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
+ * Result is stored in r (r can equal one of the inputs). */
+int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r,
+	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
+	const BIGNUM *scalars[], BN_CTX *ctx)
+	{
+	int ret = 0;
+	int j;
+	int mixed = 0;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *x, *y, *z, *tmp_scalar;
+	felem_bytearray g_secret;
+	felem_bytearray *secrets = NULL;
+	smallfelem (*pre_comp)[17][3] = NULL;
+	smallfelem *tmp_smallfelems = NULL;
+	felem_bytearray tmp;
+	unsigned i, num_bytes;
+	int have_pre_comp = 0;
+	size_t num_points = num;
+	smallfelem x_in, y_in, z_in;
+	felem x_out, y_out, z_out;
+	NISTP256_PRE_COMP *pre = NULL;
+	const smallfelem (*g_pre_comp)[16][3] = NULL;
+	EC_POINT *generator = NULL;
+	const EC_POINT *p = NULL;
+	const BIGNUM *p_scalar = NULL;
+
+	if (ctx == NULL)
+		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
+	BN_CTX_start(ctx);
+	if (((x = BN_CTX_get(ctx)) == NULL) ||
+		((y = BN_CTX_get(ctx)) == NULL) ||
+		((z = BN_CTX_get(ctx)) == NULL) ||
+		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
+		goto err;
+
+	if (scalar != NULL)
+		{
+		pre = EC_EX_DATA_get_data(group->extra_data,
+			nistp256_pre_comp_dup, nistp256_pre_comp_free,
+			nistp256_pre_comp_clear_free);
+		if (pre)
+			/* we have precomputation, try to use it */
+			g_pre_comp = (const smallfelem (*)[16][3]) pre->g_pre_comp;
+		else
+			/* try to use the standard precomputation */
+			g_pre_comp = &gmul[0];
+		generator = EC_POINT_new(group);
+		if (generator == NULL)
+			goto err;
+		/* get the generator from precomputation */
+		if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) ||
+			!smallfelem_to_BN(y, g_pre_comp[0][1][1]) ||
+			!smallfelem_to_BN(z, g_pre_comp[0][1][2]))
+			{
+			ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
+			goto err;
+			}
+		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
+				generator, x, y, z, ctx))
+			goto err;
+		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
+			/* precomputation matches generator */
+			have_pre_comp = 1;
+		else
+			/* we don't have valid precomputation:
+			 * treat the generator as a random point */
+			num_points++;
+		}
+	if (num_points > 0)
+		{
+		if (num_points >= 3)
+			{
+			/* unless we precompute multiples for just one or two points,
+			 * converting those into affine form is time well spent  */
+			mixed = 1;
+			}
+		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
+		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem));
+		if (mixed)
+			tmp_smallfelems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem));
+		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_smallfelems == NULL)))
+			{
+			ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE);
+			goto err;
+			}
+
+		/* we treat NULL scalars as 0, and NULL points as points at infinity,
+		 * i.e., they contribute nothing to the linear combination */
+		memset(secrets, 0, num_points * sizeof(felem_bytearray));
+		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem));
+		for (i = 0; i < num_points; ++i)
+			{
+			if (i == num)
+				/* we didn't have a valid precomputation, so we pick
+				 * the generator */
+				{
+				p = EC_GROUP_get0_generator(group);
+				p_scalar = scalar;
+				}
+			else
+				/* the i^th point */
+				{
+				p = points[i];
+				p_scalar = scalars[i];
+				}
+			if ((p_scalar != NULL) && (p != NULL))
+				{
+				/* reduce scalar to 0 <= scalar < 2^256 */
+				if ((BN_num_bits(p_scalar) > 256) || (BN_is_negative(p_scalar)))
+					{
+					/* this is an unusual input, and we don't guarantee
+					 * constant-timeness */
+					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
+						{
+						ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
+						goto err;
+						}
+					num_bytes = BN_bn2bin(tmp_scalar, tmp);
+					}
+				else
+					num_bytes = BN_bn2bin(p_scalar, tmp);
+				flip_endian(secrets[i], tmp, num_bytes);
+				/* precompute multiples */
+				if ((!BN_to_felem(x_out, &p->X)) ||
+					(!BN_to_felem(y_out, &p->Y)) ||
+					(!BN_to_felem(z_out, &p->Z))) goto err;
+				felem_shrink(pre_comp[i][1][0], x_out);
+				felem_shrink(pre_comp[i][1][1], y_out);
+				felem_shrink(pre_comp[i][1][2], z_out);
+				for (j = 2; j <= 16; ++j)
+					{
+					if (j & 1)
+						{
+						point_add_small(
+							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
+							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
+							pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
+						}
+					else
+						{
+						point_double_small(
+							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
+							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
+						}
+					}
+				}
+			}
+		if (mixed)
+			make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems);
+		}
+
+	/* the scalar for the generator */
+	if ((scalar != NULL) && (have_pre_comp))
+		{
+		memset(g_secret, 0, sizeof(g_secret));
+		/* reduce scalar to 0 <= scalar < 2^256 */
+		if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar)))
+			{
+			/* this is an unusual input, and we don't guarantee
+			 * constant-timeness */
+			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
+				{
+				ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
+				goto err;
+				}
+			num_bytes = BN_bn2bin(tmp_scalar, tmp);
+			}
+		else
+			num_bytes = BN_bn2bin(scalar, tmp);
+		flip_endian(g_secret, tmp, num_bytes);
+		/* do the multiplication with generator precomputation*/
+		batch_mul(x_out, y_out, z_out,
+			(const felem_bytearray (*)) secrets, num_points,
+			g_secret,
+			mixed, (const smallfelem (*)[17][3]) pre_comp,
+			g_pre_comp);
+		}
+	else
+		/* do the multiplication without generator precomputation */
+		batch_mul(x_out, y_out, z_out,
+			(const felem_bytearray (*)) secrets, num_points,
+			NULL, mixed, (const smallfelem (*)[17][3]) pre_comp, NULL);
+	/* reduce the output to its unique minimal representation */
+	felem_contract(x_in, x_out);
+	felem_contract(y_in, y_out);
+	felem_contract(z_in, z_out);
+	if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) ||
+		(!smallfelem_to_BN(z, z_in)))
+		{
+		ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB);
+		goto err;
+		}
+	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
+
+err:
+	BN_CTX_end(ctx);
+	if (generator != NULL)
+		EC_POINT_free(generator);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	if (secrets != NULL)
+		OPENSSL_free(secrets);
+	if (pre_comp != NULL)
+		OPENSSL_free(pre_comp);
+	if (tmp_smallfelems != NULL)
+		OPENSSL_free(tmp_smallfelems);
+	return ret;
+	}
+
+int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
+	{
+	int ret = 0;
+	NISTP256_PRE_COMP *pre = NULL;
+	int i, j;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *x, *y;
+	EC_POINT *generator = NULL;
+	smallfelem tmp_smallfelems[32];
+	felem x_tmp, y_tmp, z_tmp;
+
+	/* throw away old precomputation */
+	EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup,
+		nistp256_pre_comp_free, nistp256_pre_comp_clear_free);
+	if (ctx == NULL)
+		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
+	BN_CTX_start(ctx);
+	if (((x = BN_CTX_get(ctx)) == NULL) ||
+		((y = BN_CTX_get(ctx)) == NULL))
+		goto err;
+	/* get the generator */
+	if (group->generator == NULL) goto err;
+	generator = EC_POINT_new(group);
+	if (generator == NULL)
+		goto err;
+	BN_bin2bn(nistp256_curve_params[3], sizeof (felem_bytearray), x);
+	BN_bin2bn(nistp256_curve_params[4], sizeof (felem_bytearray), y);
+	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
+		goto err;
+	if ((pre = nistp256_pre_comp_new()) == NULL)
+		goto err;
+	/* if the generator is the standard one, use built-in precomputation */
+	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
+		{
+		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
+		ret = 1;
+		goto err;
+		}
+	if ((!BN_to_felem(x_tmp, &group->generator->X)) ||
+		(!BN_to_felem(y_tmp, &group->generator->Y)) ||
+		(!BN_to_felem(z_tmp, &group->generator->Z)))
+		goto err;
+	felem_shrink(pre->g_pre_comp[0][1][0], x_tmp);
+	felem_shrink(pre->g_pre_comp[0][1][1], y_tmp);
+	felem_shrink(pre->g_pre_comp[0][1][2], z_tmp);
+	/* compute 2^64*G, 2^128*G, 2^192*G for the first table,
+	 * 2^32*G, 2^96*G, 2^160*G, 2^224*G for the second one
+	 */
+	for (i = 1; i <= 8; i <<= 1)
+		{
+		point_double_small(
+			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
+			pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]);
+		for (j = 0; j < 31; ++j)
+			{
+			point_double_small(
+				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2],
+				pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
+			}
+		if (i == 8)
+			break;
+		point_double_small(
+			pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
+			pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]);
+		for (j = 0; j < 31; ++j)
+			{
+			point_double_small(
+				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2],
+				pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]);
+			}
+		}
+	for (i = 0; i < 2; i++)
+		{
+		/* g_pre_comp[i][0] is the point at infinity */
+		memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0]));
+		/* the remaining multiples */
+		/* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */
+		point_add_small(
+			pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], pre->g_pre_comp[i][6][2],
+			pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2],
+			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
+		/* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */
+		point_add_small(
+			pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], pre->g_pre_comp[i][10][2],
+			pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
+			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
+		/* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */
+		point_add_small(
+			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
+			pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2],
+			pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2]);
+		/* 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G */
+		point_add_small(
+			pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], pre->g_pre_comp[i][14][2],
+			pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2],
+			pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]);
+		for (j = 1; j < 8; ++j)
+			{
+			/* odd multiples: add G resp. 2^32*G */
+			point_add_small(
+				pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1], pre->g_pre_comp[i][2*j+1][2],
+				pre->g_pre_comp[i][2*j][0], pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2],
+				pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], pre->g_pre_comp[i][1][2]);
+			}
+		}
+	make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems);
+
+	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup,
+			nistp256_pre_comp_free, nistp256_pre_comp_clear_free))
+		goto err;
+	ret = 1;
+	pre = NULL;
+ err:
+	BN_CTX_end(ctx);
+	if (generator != NULL)
+		EC_POINT_free(generator);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	if (pre)
+		nistp256_pre_comp_free(pre);
+	return ret;
+	}
+
+int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group)
+	{
+	if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup,
+			nistp256_pre_comp_free, nistp256_pre_comp_clear_free)
+		!= NULL)
+		return 1;
+	else
+		return 0;
+	}
+#else
+static void *dummy=&dummy;
+#endif
diff --git a/src/lib/libcrypto/ec/ecp_nistp521.c b/src/lib/libcrypto/ec/ecp_nistp521.c
new file mode 100644
index 0000000000..178b655f7f
--- /dev/null
+++ b/src/lib/libcrypto/ec/ecp_nistp521.c
@@ -0,0 +1,2025 @@
+/* crypto/ec/ecp_nistp521.c */
+/*
+ * Written by Adam Langley (Google) for the OpenSSL project
+ */
+/* Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ *
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*
+ * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
+ *
+ * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
+ * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
+ * work which got its smarts from Daniel J. Bernstein's work on the same.
+ */
+
+#include <openssl/opensslconf.h>
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+
+#ifndef OPENSSL_SYS_VMS
+#include <stdint.h>
+#else
+#include <inttypes.h>
+#endif
+
+#include <string.h>
+#include <openssl/err.h>
+#include "ec_lcl.h"
+
+#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
+  /* even with gcc, the typedef won't work for 32-bit platforms */
+  typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
+#else
+  #error "Need GCC 3.1 or later to define type uint128_t"
+#endif
+
+typedef uint8_t u8;
+typedef uint64_t u64;
+typedef int64_t s64;
+
+/* The underlying field.
+ *
+ * P521 operates over GF(2^521-1). We can serialise an element of this field
+ * into 66 bytes where the most significant byte contains only a single bit. We
+ * call this an felem_bytearray. */
+
+typedef u8 felem_bytearray[66];
+
+/* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
+ * These values are big-endian. */
+static const felem_bytearray nistp521_curve_params[5] =
+	{
+	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* p */
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff},
+	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* a = -3 */
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	 0xff, 0xfc},
+	{0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c,  /* b */
+	 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
+	 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
+	 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
+	 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
+	 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
+	 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
+	 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
+	 0x3f, 0x00},
+	{0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04,  /* x */
+	 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
+	 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
+	 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
+	 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
+	 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
+	 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
+	 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
+	 0xbd, 0x66},
+	{0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b,  /* y */
+	 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
+	 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
+	 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
+	 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
+	 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
+	 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
+	 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
+	 0x66, 0x50}
+	};
+
+/* The representation of field elements.
+ * ------------------------------------
+ *
+ * We represent field elements with nine values. These values are either 64 or
+ * 128 bits and the field element represented is:
+ *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
+ * Each of the nine values is called a 'limb'. Since the limbs are spaced only
+ * 58 bits apart, but are greater than 58 bits in length, the most significant
+ * bits of each limb overlap with the least significant bits of the next.
+ *
+ * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
+ * 'largefelem' */
+
+#define NLIMBS 9
+
+typedef uint64_t limb;
+typedef limb felem[NLIMBS];
+typedef uint128_t largefelem[NLIMBS];
+
+static const limb bottom57bits = 0x1ffffffffffffff;
+static const limb bottom58bits = 0x3ffffffffffffff;
+
+/* bin66_to_felem takes a little-endian byte array and converts it into felem
+ * form. This assumes that the CPU is little-endian. */
+static void bin66_to_felem(felem out, const u8 in[66])
+	{
+	out[0] = (*((limb*) &in[0])) & bottom58bits;
+	out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits;
+	out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits;
+	out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits;
+	out[4] = (*((limb*) &in[29])) & bottom58bits;
+	out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits;
+	out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits;
+	out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits;
+	out[8] = (*((limb*) &in[58])) & bottom57bits;
+	}
+
+/* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
+ * array. This assumes that the CPU is little-endian. */
+static void felem_to_bin66(u8 out[66], const felem in)
+	{
+	memset(out, 0, 66);
+	(*((limb*) &out[0])) = in[0];
+	(*((limb*) &out[7])) |= in[1] << 2;
+	(*((limb*) &out[14])) |= in[2] << 4;
+	(*((limb*) &out[21])) |= in[3] << 6;
+	(*((limb*) &out[29])) = in[4];
+	(*((limb*) &out[36])) |= in[5] << 2;
+	(*((limb*) &out[43])) |= in[6] << 4;
+	(*((limb*) &out[50])) |= in[7] << 6;
+	(*((limb*) &out[58])) = in[8];
+	}
+
+/* To preserve endianness when using BN_bn2bin and BN_bin2bn */
+static void flip_endian(u8 *out, const u8 *in, unsigned len)
+	{
+	unsigned i;
+	for (i = 0; i < len; ++i)
+		out[i] = in[len-1-i];
+	}
+
+/* BN_to_felem converts an OpenSSL BIGNUM into an felem */
+static int BN_to_felem(felem out, const BIGNUM *bn)
+	{
+	felem_bytearray b_in;
+	felem_bytearray b_out;
+	unsigned num_bytes;
+
+	/* BN_bn2bin eats leading zeroes */
+	memset(b_out, 0, sizeof b_out);
+	num_bytes = BN_num_bytes(bn);
+	if (num_bytes > sizeof b_out)
+		{
+		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
+		return 0;
+		}
+	if (BN_is_negative(bn))
+		{
+		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
+		return 0;
+		}
+	num_bytes = BN_bn2bin(bn, b_in);
+	flip_endian(b_out, b_in, num_bytes);
+	bin66_to_felem(out, b_out);
+	return 1;
+	}
+
+/* felem_to_BN converts an felem into an OpenSSL BIGNUM */
+static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
+	{
+	felem_bytearray b_in, b_out;
+	felem_to_bin66(b_in, in);
+	flip_endian(b_out, b_in, sizeof b_out);
+	return BN_bin2bn(b_out, sizeof b_out, out);
+	}
+
+
+/* Field operations
+ * ---------------- */
+
+static void felem_one(felem out)
+	{
+	out[0] = 1;
+	out[1] = 0;
+	out[2] = 0;
+	out[3] = 0;
+	out[4] = 0;
+	out[5] = 0;
+	out[6] = 0;
+	out[7] = 0;
+	out[8] = 0;
+	}
+
+static void felem_assign(felem out, const felem in)
+	{
+	out[0] = in[0];
+	out[1] = in[1];
+	out[2] = in[2];
+	out[3] = in[3];
+	out[4] = in[4];
+	out[5] = in[5];
+	out[6] = in[6];
+	out[7] = in[7];
+	out[8] = in[8];
+	}
+
+/* felem_sum64 sets out = out + in. */
+static void felem_sum64(felem out, const felem in)
+	{
+	out[0] += in[0];
+	out[1] += in[1];
+	out[2] += in[2];
+	out[3] += in[3];
+	out[4] += in[4];
+	out[5] += in[5];
+	out[6] += in[6];
+	out[7] += in[7];
+	out[8] += in[8];
+	}
+
+/* felem_scalar sets out = in * scalar */
+static void felem_scalar(felem out, const felem in, limb scalar)
+	{
+	out[0] = in[0] * scalar;
+	out[1] = in[1] * scalar;
+	out[2] = in[2] * scalar;
+	out[3] = in[3] * scalar;
+	out[4] = in[4] * scalar;
+	out[5] = in[5] * scalar;
+	out[6] = in[6] * scalar;
+	out[7] = in[7] * scalar;
+	out[8] = in[8] * scalar;
+	}
+
+/* felem_scalar64 sets out = out * scalar */
+static void felem_scalar64(felem out, limb scalar)
+	{
+	out[0] *= scalar;
+	out[1] *= scalar;
+	out[2] *= scalar;
+	out[3] *= scalar;
+	out[4] *= scalar;
+	out[5] *= scalar;
+	out[6] *= scalar;
+	out[7] *= scalar;
+	out[8] *= scalar;
+	}
+
+/* felem_scalar128 sets out = out * scalar */
+static void felem_scalar128(largefelem out, limb scalar)
+	{
+	out[0] *= scalar;
+	out[1] *= scalar;
+	out[2] *= scalar;
+	out[3] *= scalar;
+	out[4] *= scalar;
+	out[5] *= scalar;
+	out[6] *= scalar;
+	out[7] *= scalar;
+	out[8] *= scalar;
+	}
+
+/* felem_neg sets |out| to |-in|
+ * On entry:
+ *   in[i] < 2^59 + 2^14
+ * On exit:
+ *   out[i] < 2^62
+ */
+static void felem_neg(felem out, const felem in)
+	{
+	/* In order to prevent underflow, we subtract from 0 mod p. */
+	static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
+	static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
+
+	out[0] = two62m3 - in[0];
+	out[1] = two62m2 - in[1];
+	out[2] = two62m2 - in[2];
+	out[3] = two62m2 - in[3];
+	out[4] = two62m2 - in[4];
+	out[5] = two62m2 - in[5];
+	out[6] = two62m2 - in[6];
+	out[7] = two62m2 - in[7];
+	out[8] = two62m2 - in[8];
+	}
+
+/* felem_diff64 subtracts |in| from |out|
+ * On entry:
+ *   in[i] < 2^59 + 2^14
+ * On exit:
+ *   out[i] < out[i] + 2^62
+ */
+static void felem_diff64(felem out, const felem in)
+	{
+	/* In order to prevent underflow, we add 0 mod p before subtracting. */
+	static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
+	static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
+
+	out[0] += two62m3 - in[0];
+	out[1] += two62m2 - in[1];
+	out[2] += two62m2 - in[2];
+	out[3] += two62m2 - in[3];
+	out[4] += two62m2 - in[4];
+	out[5] += two62m2 - in[5];
+	out[6] += two62m2 - in[6];
+	out[7] += two62m2 - in[7];
+	out[8] += two62m2 - in[8];
+	}
+
+/* felem_diff_128_64 subtracts |in| from |out|
+ * On entry:
+ *   in[i] < 2^62 + 2^17
+ * On exit:
+ *   out[i] < out[i] + 2^63
+ */
+static void felem_diff_128_64(largefelem out, const felem in)
+	{
+	/* In order to prevent underflow, we add 0 mod p before subtracting. */
+	static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5);
+	static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4);
+
+	out[0] += two63m6 - in[0];
+	out[1] += two63m5 - in[1];
+	out[2] += two63m5 - in[2];
+	out[3] += two63m5 - in[3];
+	out[4] += two63m5 - in[4];
+	out[5] += two63m5 - in[5];
+	out[6] += two63m5 - in[6];
+	out[7] += two63m5 - in[7];
+	out[8] += two63m5 - in[8];
+	}
+
+/* felem_diff_128_64 subtracts |in| from |out|
+ * On entry:
+ *   in[i] < 2^126
+ * On exit:
+ *   out[i] < out[i] + 2^127 - 2^69
+ */
+static void felem_diff128(largefelem out, const largefelem in)
+	{
+	/* In order to prevent underflow, we add 0 mod p before subtracting. */
+	static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
+	static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);
+
+	out[0] += (two127m70 - in[0]);
+	out[1] += (two127m69 - in[1]);
+	out[2] += (two127m69 - in[2]);
+	out[3] += (two127m69 - in[3]);
+	out[4] += (two127m69 - in[4]);
+	out[5] += (two127m69 - in[5]);
+	out[6] += (two127m69 - in[6]);
+	out[7] += (two127m69 - in[7]);
+	out[8] += (two127m69 - in[8]);
+	}
+
+/* felem_square sets |out| = |in|^2
+ * On entry:
+ *   in[i] < 2^62
+ * On exit:
+ *   out[i] < 17 * max(in[i]) * max(in[i])
+ */
+static void felem_square(largefelem out, const felem in)
+	{
+	felem inx2, inx4;
+	felem_scalar(inx2, in, 2);
+	felem_scalar(inx4, in, 4);
+
+	/* We have many cases were we want to do
+	 *   in[x] * in[y] +
+	 *   in[y] * in[x]
+	 * This is obviously just
+	 *   2 * in[x] * in[y]
+	 * However, rather than do the doubling on the 128 bit result, we
+	 * double one of the inputs to the multiplication by reading from
+	 * |inx2| */
+
+	out[0] = ((uint128_t) in[0]) * in[0];
+	out[1] = ((uint128_t) in[0]) * inx2[1];
+	out[2] = ((uint128_t) in[0]) * inx2[2] +
+		 ((uint128_t) in[1]) * in[1];
+	out[3] = ((uint128_t) in[0]) * inx2[3] +
+		 ((uint128_t) in[1]) * inx2[2];
+	out[4] = ((uint128_t) in[0]) * inx2[4] +
+		 ((uint128_t) in[1]) * inx2[3] +
+		 ((uint128_t) in[2]) * in[2];
+	out[5] = ((uint128_t) in[0]) * inx2[5] +
+		 ((uint128_t) in[1]) * inx2[4] +
+		 ((uint128_t) in[2]) * inx2[3];
+	out[6] = ((uint128_t) in[0]) * inx2[6] +
+		 ((uint128_t) in[1]) * inx2[5] +
+		 ((uint128_t) in[2]) * inx2[4] +
+		 ((uint128_t) in[3]) * in[3];
+	out[7] = ((uint128_t) in[0]) * inx2[7] +
+		 ((uint128_t) in[1]) * inx2[6] +
+		 ((uint128_t) in[2]) * inx2[5] +
+		 ((uint128_t) in[3]) * inx2[4];
+	out[8] = ((uint128_t) in[0]) * inx2[8] +
+		 ((uint128_t) in[1]) * inx2[7] +
+		 ((uint128_t) in[2]) * inx2[6] +
+		 ((uint128_t) in[3]) * inx2[5] +
+		 ((uint128_t) in[4]) * in[4];
+
+	/* The remaining limbs fall above 2^521, with the first falling at
+	 * 2^522. They correspond to locations one bit up from the limbs
+	 * produced above so we would have to multiply by two to align them.
+	 * Again, rather than operate on the 128-bit result, we double one of
+	 * the inputs to the multiplication. If we want to double for both this
+	 * reason, and the reason above, then we end up multiplying by four. */
+
+	/* 9 */
+	out[0] += ((uint128_t) in[1]) * inx4[8] +
+		  ((uint128_t) in[2]) * inx4[7] +
+		  ((uint128_t) in[3]) * inx4[6] +
+		  ((uint128_t) in[4]) * inx4[5];
+
+	/* 10 */
+	out[1] += ((uint128_t) in[2]) * inx4[8] +
+		  ((uint128_t) in[3]) * inx4[7] +
+		  ((uint128_t) in[4]) * inx4[6] +
+		  ((uint128_t) in[5]) * inx2[5];
+
+	/* 11 */
+	out[2] += ((uint128_t) in[3]) * inx4[8] +
+		  ((uint128_t) in[4]) * inx4[7] +
+		  ((uint128_t) in[5]) * inx4[6];
+
+	/* 12 */
+	out[3] += ((uint128_t) in[4]) * inx4[8] +
+		  ((uint128_t) in[5]) * inx4[7] +
+		  ((uint128_t) in[6]) * inx2[6];
+
+	/* 13 */
+	out[4] += ((uint128_t) in[5]) * inx4[8] +
+		  ((uint128_t) in[6]) * inx4[7];
+
+	/* 14 */
+	out[5] += ((uint128_t) in[6]) * inx4[8] +
+		  ((uint128_t) in[7]) * inx2[7];
+
+	/* 15 */
+	out[6] += ((uint128_t) in[7]) * inx4[8];
+
+	/* 16 */
+	out[7] += ((uint128_t) in[8]) * inx2[8];
+	}
+
+/* felem_mul sets |out| = |in1| * |in2|
+ * On entry:
+ *   in1[i] < 2^64
+ *   in2[i] < 2^63
+ * On exit:
+ *   out[i] < 17 * max(in1[i]) * max(in2[i])
+ */
+static void felem_mul(largefelem out, const felem in1, const felem in2)
+	{
+	felem in2x2;
+	felem_scalar(in2x2, in2, 2);
+
+	out[0] = ((uint128_t) in1[0]) * in2[0];
+
+	out[1] = ((uint128_t) in1[0]) * in2[1] +
+	         ((uint128_t) in1[1]) * in2[0];
+
+	out[2] = ((uint128_t) in1[0]) * in2[2] +
+		 ((uint128_t) in1[1]) * in2[1] +
+	         ((uint128_t) in1[2]) * in2[0];
+
+	out[3] = ((uint128_t) in1[0]) * in2[3] +
+		 ((uint128_t) in1[1]) * in2[2] +
+		 ((uint128_t) in1[2]) * in2[1] +
+		 ((uint128_t) in1[3]) * in2[0];
+
+	out[4] = ((uint128_t) in1[0]) * in2[4] +
+		 ((uint128_t) in1[1]) * in2[3] +
+		 ((uint128_t) in1[2]) * in2[2] +
+		 ((uint128_t) in1[3]) * in2[1] +
+		 ((uint128_t) in1[4]) * in2[0];
+
+	out[5] = ((uint128_t) in1[0]) * in2[5] +
+		 ((uint128_t) in1[1]) * in2[4] +
+		 ((uint128_t) in1[2]) * in2[3] +
+		 ((uint128_t) in1[3]) * in2[2] +
+		 ((uint128_t) in1[4]) * in2[1] +
+		 ((uint128_t) in1[5]) * in2[0];
+
+	out[6] = ((uint128_t) in1[0]) * in2[6] +
+		 ((uint128_t) in1[1]) * in2[5] +
+		 ((uint128_t) in1[2]) * in2[4] +
+		 ((uint128_t) in1[3]) * in2[3] +
+		 ((uint128_t) in1[4]) * in2[2] +
+		 ((uint128_t) in1[5]) * in2[1] +
+		 ((uint128_t) in1[6]) * in2[0];
+
+	out[7] = ((uint128_t) in1[0]) * in2[7] +
+		 ((uint128_t) in1[1]) * in2[6] +
+		 ((uint128_t) in1[2]) * in2[5] +
+		 ((uint128_t) in1[3]) * in2[4] +
+		 ((uint128_t) in1[4]) * in2[3] +
+		 ((uint128_t) in1[5]) * in2[2] +
+		 ((uint128_t) in1[6]) * in2[1] +
+		 ((uint128_t) in1[7]) * in2[0];
+
+	out[8] = ((uint128_t) in1[0]) * in2[8] +
+		 ((uint128_t) in1[1]) * in2[7] +
+		 ((uint128_t) in1[2]) * in2[6] +
+		 ((uint128_t) in1[3]) * in2[5] +
+		 ((uint128_t) in1[4]) * in2[4] +
+		 ((uint128_t) in1[5]) * in2[3] +
+		 ((uint128_t) in1[6]) * in2[2] +
+		 ((uint128_t) in1[7]) * in2[1] +
+		 ((uint128_t) in1[8]) * in2[0];
+
+	/* See comment in felem_square about the use of in2x2 here */
+
+	out[0] += ((uint128_t) in1[1]) * in2x2[8] +
+		  ((uint128_t) in1[2]) * in2x2[7] +
+		  ((uint128_t) in1[3]) * in2x2[6] +
+		  ((uint128_t) in1[4]) * in2x2[5] +
+		  ((uint128_t) in1[5]) * in2x2[4] +
+		  ((uint128_t) in1[6]) * in2x2[3] +
+		  ((uint128_t) in1[7]) * in2x2[2] +
+		  ((uint128_t) in1[8]) * in2x2[1];
+
+	out[1] += ((uint128_t) in1[2]) * in2x2[8] +
+		  ((uint128_t) in1[3]) * in2x2[7] +
+		  ((uint128_t) in1[4]) * in2x2[6] +
+		  ((uint128_t) in1[5]) * in2x2[5] +
+		  ((uint128_t) in1[6]) * in2x2[4] +
+		  ((uint128_t) in1[7]) * in2x2[3] +
+		  ((uint128_t) in1[8]) * in2x2[2];
+
+	out[2] += ((uint128_t) in1[3]) * in2x2[8] +
+		  ((uint128_t) in1[4]) * in2x2[7] +
+		  ((uint128_t) in1[5]) * in2x2[6] +
+		  ((uint128_t) in1[6]) * in2x2[5] +
+		  ((uint128_t) in1[7]) * in2x2[4] +
+		  ((uint128_t) in1[8]) * in2x2[3];
+
+	out[3] += ((uint128_t) in1[4]) * in2x2[8] +
+		  ((uint128_t) in1[5]) * in2x2[7] +
+		  ((uint128_t) in1[6]) * in2x2[6] +
+		  ((uint128_t) in1[7]) * in2x2[5] +
+		  ((uint128_t) in1[8]) * in2x2[4];
+
+	out[4] += ((uint128_t) in1[5]) * in2x2[8] +
+		  ((uint128_t) in1[6]) * in2x2[7] +
+		  ((uint128_t) in1[7]) * in2x2[6] +
+		  ((uint128_t) in1[8]) * in2x2[5];
+
+	out[5] += ((uint128_t) in1[6]) * in2x2[8] +
+		  ((uint128_t) in1[7]) * in2x2[7] +
+		  ((uint128_t) in1[8]) * in2x2[6];
+
+	out[6] += ((uint128_t) in1[7]) * in2x2[8] +
+		  ((uint128_t) in1[8]) * in2x2[7];
+
+	out[7] += ((uint128_t) in1[8]) * in2x2[8];
+	}
+
+static const limb bottom52bits = 0xfffffffffffff;
+
+/* felem_reduce converts a largefelem to an felem.
+ * On entry:
+ *   in[i] < 2^128
+ * On exit:
+ *   out[i] < 2^59 + 2^14
+ */
+static void felem_reduce(felem out, const largefelem in)
+	{
+	u64 overflow1, overflow2;
+
+	out[0] = ((limb) in[0]) & bottom58bits;
+	out[1] = ((limb) in[1]) & bottom58bits;
+	out[2] = ((limb) in[2]) & bottom58bits;
+	out[3] = ((limb) in[3]) & bottom58bits;
+	out[4] = ((limb) in[4]) & bottom58bits;
+	out[5] = ((limb) in[5]) & bottom58bits;
+	out[6] = ((limb) in[6]) & bottom58bits;
+	out[7] = ((limb) in[7]) & bottom58bits;
+	out[8] = ((limb) in[8]) & bottom58bits;
+
+	/* out[i] < 2^58 */
+
+	out[1] += ((limb) in[0]) >> 58;
+	out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
+	/* out[1] < 2^58 + 2^6 + 2^58
+	 *        = 2^59 + 2^6 */
+	out[2] += ((limb) (in[0] >> 64)) >> 52;
+
+	out[2] += ((limb) in[1]) >> 58;
+	out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
+	out[3] += ((limb) (in[1] >> 64)) >> 52;
+
+	out[3] += ((limb) in[2]) >> 58;
+	out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
+	out[4] += ((limb) (in[2] >> 64)) >> 52;
+
+	out[4] += ((limb) in[3]) >> 58;
+	out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
+	out[5] += ((limb) (in[3] >> 64)) >> 52;
+
+	out[5] += ((limb) in[4]) >> 58;
+	out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
+	out[6] += ((limb) (in[4] >> 64)) >> 52;
+
+	out[6] += ((limb) in[5]) >> 58;
+	out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
+	out[7] += ((limb) (in[5] >> 64)) >> 52;
+
+	out[7] += ((limb) in[6]) >> 58;
+	out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
+	out[8] += ((limb) (in[6] >> 64)) >> 52;
+
+	out[8] += ((limb) in[7]) >> 58;
+	out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
+	/* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
+	 *            < 2^59 + 2^13 */
+	overflow1 = ((limb) (in[7] >> 64)) >> 52;
+
+	overflow1 += ((limb) in[8]) >> 58;
+	overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
+	overflow2 = ((limb) (in[8] >> 64)) >> 52;
+
+	overflow1 <<= 1;  /* overflow1 < 2^13 + 2^7 + 2^59 */
+	overflow2 <<= 1;  /* overflow2 < 2^13 */
+
+	out[0] += overflow1;  /* out[0] < 2^60 */
+	out[1] += overflow2;  /* out[1] < 2^59 + 2^6 + 2^13 */
+
+	out[1] += out[0] >> 58; out[0] &= bottom58bits;
+	/* out[0] < 2^58
+	 * out[1] < 2^59 + 2^6 + 2^13 + 2^2
+	 *        < 2^59 + 2^14 */
+	}
+
+static void felem_square_reduce(felem out, const felem in)
+	{
+	largefelem tmp;
+	felem_square(tmp, in);
+	felem_reduce(out, tmp);
+	}
+
+static void felem_mul_reduce(felem out, const felem in1, const felem in2)
+	{
+	largefelem tmp;
+	felem_mul(tmp, in1, in2);
+	felem_reduce(out, tmp);
+	}
+
+/* felem_inv calculates |out| = |in|^{-1}
+ *
+ * Based on Fermat's Little Theorem:
+ *   a^p = a (mod p)
+ *   a^{p-1} = 1 (mod p)
+ *   a^{p-2} = a^{-1} (mod p)
+ */
+static void felem_inv(felem out, const felem in)
+	{
+	felem ftmp, ftmp2, ftmp3, ftmp4;
+	largefelem tmp;
+	unsigned i;
+
+	felem_square(tmp, in); felem_reduce(ftmp, tmp);		/* 2^1 */
+	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^2 - 2^0 */
+	felem_assign(ftmp2, ftmp);
+	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^1 */
+	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^0 */
+	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^4 - 2^1 */
+
+	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^3 - 2^1 */
+	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^2 */
+	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^0 */
+
+	felem_assign(ftmp2, ftmp3);
+	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^5 - 2^1 */
+	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^6 - 2^2 */
+	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^7 - 2^3 */
+	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^4 */
+	felem_assign(ftmp4, ftmp3);
+	felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp);	/* 2^8 - 2^1 */
+	felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);	/* 2^9 - 2^2 */
+	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^0 */
+	felem_assign(ftmp2, ftmp3);
+
+	for (i = 0; i < 8; i++)
+		{
+		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^8 */
+		}
+	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^0 */
+	felem_assign(ftmp2, ftmp3);
+
+	for (i = 0; i < 16; i++)
+		{
+		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^16 */
+		}
+	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^0 */
+	felem_assign(ftmp2, ftmp3);
+
+	for (i = 0; i < 32; i++)
+		{
+		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^32 */
+		}
+	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^0 */
+	felem_assign(ftmp2, ftmp3);
+
+	for (i = 0; i < 64; i++)
+		{
+		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^64 */
+		}
+	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^0 */
+	felem_assign(ftmp2, ftmp3);
+
+	for (i = 0; i < 128; i++)
+		{
+		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^128 */
+		}
+	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^0 */
+	felem_assign(ftmp2, ftmp3);
+
+	for (i = 0; i < 256; i++)
+		{
+		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^256 */
+		}
+	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^0 */
+
+	for (i = 0; i < 9; i++)
+		{
+		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^521 - 2^9 */
+		}
+	felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^2 */
+	felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp);	/* 2^512 - 3 */
+}
+
+/* This is 2^521-1, expressed as an felem */
+static const felem kPrime =
+	{
+	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
+	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
+	0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
+	};
+
+/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
+ * otherwise.
+ * On entry:
+ *   in[i] < 2^59 + 2^14
+ */
+static limb felem_is_zero(const felem in)
+	{
+	felem ftmp;
+	limb is_zero, is_p;
+	felem_assign(ftmp, in);
+
+	ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
+	/* ftmp[8] < 2^57 */
+	ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
+	ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
+	ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
+	ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
+	ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
+	ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
+	ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
+	ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
+	/* ftmp[8] < 2^57 + 4 */
+
+	/* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
+	 * greater than our bound for ftmp[8]. Therefore we only have to check
+	 * if the zero is zero or 2^521-1. */
+
+	is_zero = 0;
+	is_zero |= ftmp[0];
+	is_zero |= ftmp[1];
+	is_zero |= ftmp[2];
+	is_zero |= ftmp[3];
+	is_zero |= ftmp[4];
+	is_zero |= ftmp[5];
+	is_zero |= ftmp[6];
+	is_zero |= ftmp[7];
+	is_zero |= ftmp[8];
+
+	is_zero--;
+	/* We know that ftmp[i] < 2^63, therefore the only way that the top bit
+	 * can be set is if is_zero was 0 before the decrement. */
+	is_zero = ((s64) is_zero) >> 63;
+
+	is_p = ftmp[0] ^ kPrime[0];
+	is_p |= ftmp[1] ^ kPrime[1];
+	is_p |= ftmp[2] ^ kPrime[2];
+	is_p |= ftmp[3] ^ kPrime[3];
+	is_p |= ftmp[4] ^ kPrime[4];
+	is_p |= ftmp[5] ^ kPrime[5];
+	is_p |= ftmp[6] ^ kPrime[6];
+	is_p |= ftmp[7] ^ kPrime[7];
+	is_p |= ftmp[8] ^ kPrime[8];
+
+	is_p--;
+	is_p = ((s64) is_p) >> 63;
+
+	is_zero |= is_p;
+	return is_zero;
+	}
+
+static int felem_is_zero_int(const felem in)
+	{
+	return (int) (felem_is_zero(in) & ((limb)1));
+	}
+
+/* felem_contract converts |in| to its unique, minimal representation.
+ * On entry:
+ *   in[i] < 2^59 + 2^14
+ */
+static void felem_contract(felem out, const felem in)
+	{
+	limb is_p, is_greater, sign;
+	static const limb two58 = ((limb)1) << 58;
+
+	felem_assign(out, in);
+
+	out[0] += out[8] >> 57; out[8] &= bottom57bits;
+	/* out[8] < 2^57 */
+	out[1] += out[0] >> 58; out[0] &= bottom58bits;
+	out[2] += out[1] >> 58; out[1] &= bottom58bits;
+	out[3] += out[2] >> 58; out[2] &= bottom58bits;
+	out[4] += out[3] >> 58; out[3] &= bottom58bits;
+	out[5] += out[4] >> 58; out[4] &= bottom58bits;
+	out[6] += out[5] >> 58; out[5] &= bottom58bits;
+	out[7] += out[6] >> 58; out[6] &= bottom58bits;
+	out[8] += out[7] >> 58; out[7] &= bottom58bits;
+	/* out[8] < 2^57 + 4 */
+
+	/* If the value is greater than 2^521-1 then we have to subtract
+	 * 2^521-1 out. See the comments in felem_is_zero regarding why we
+	 * don't test for other multiples of the prime. */
+
+	/* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */
+
+	is_p = out[0] ^ kPrime[0];
+	is_p |= out[1] ^ kPrime[1];
+	is_p |= out[2] ^ kPrime[2];
+	is_p |= out[3] ^ kPrime[3];
+	is_p |= out[4] ^ kPrime[4];
+	is_p |= out[5] ^ kPrime[5];
+	is_p |= out[6] ^ kPrime[6];
+	is_p |= out[7] ^ kPrime[7];
+	is_p |= out[8] ^ kPrime[8];
+
+	is_p--;
+	is_p &= is_p << 32;
+	is_p &= is_p << 16;
+	is_p &= is_p << 8;
+	is_p &= is_p << 4;
+	is_p &= is_p << 2;
+	is_p &= is_p << 1;
+	is_p = ((s64) is_p) >> 63;
+	is_p = ~is_p;
+
+	/* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
+
+	out[0] &= is_p;
+	out[1] &= is_p;
+	out[2] &= is_p;
+	out[3] &= is_p;
+	out[4] &= is_p;
+	out[5] &= is_p;
+	out[6] &= is_p;
+	out[7] &= is_p;
+	out[8] &= is_p;
+
+	/* In order to test that |out| >= 2^521-1 we need only test if out[8]
+	 * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
+	is_greater = out[8] >> 57;
+	is_greater |= is_greater << 32;
+	is_greater |= is_greater << 16;
+	is_greater |= is_greater << 8;
+	is_greater |= is_greater << 4;
+	is_greater |= is_greater << 2;
+	is_greater |= is_greater << 1;
+	is_greater = ((s64) is_greater) >> 63;
+
+	out[0] -= kPrime[0] & is_greater;
+	out[1] -= kPrime[1] & is_greater;
+	out[2] -= kPrime[2] & is_greater;
+	out[3] -= kPrime[3] & is_greater;
+	out[4] -= kPrime[4] & is_greater;
+	out[5] -= kPrime[5] & is_greater;
+	out[6] -= kPrime[6] & is_greater;
+	out[7] -= kPrime[7] & is_greater;
+	out[8] -= kPrime[8] & is_greater;
+
+	/* Eliminate negative coefficients */
+	sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
+	sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
+	sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
+	sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
+	sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
+	sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
+	sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
+	sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
+	sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
+	sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
+	sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
+	}
+
+/* Group operations
+ * ----------------
+ *
+ * Building on top of the field operations we have the operations on the
+ * elliptic curve group itself. Points on the curve are represented in Jacobian
+ * coordinates */
+
+/* point_double calcuates 2*(x_in, y_in, z_in)
+ *
+ * The method is taken from:
+ *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
+ *
+ * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
+ * while x_out == y_in is not (maybe this works, but it's not tested). */
+static void
+point_double(felem x_out, felem y_out, felem z_out,
+	     const felem x_in, const felem y_in, const felem z_in)
+	{
+	largefelem tmp, tmp2;
+	felem delta, gamma, beta, alpha, ftmp, ftmp2;
+
+	felem_assign(ftmp, x_in);
+	felem_assign(ftmp2, x_in);
+
+	/* delta = z^2 */
+	felem_square(tmp, z_in);
+	felem_reduce(delta, tmp);  /* delta[i] < 2^59 + 2^14 */
+
+	/* gamma = y^2 */
+	felem_square(tmp, y_in);
+	felem_reduce(gamma, tmp);  /* gamma[i] < 2^59 + 2^14 */
+
+	/* beta = x*gamma */
+	felem_mul(tmp, x_in, gamma);
+	felem_reduce(beta, tmp);  /* beta[i] < 2^59 + 2^14 */
+
+	/* alpha = 3*(x-delta)*(x+delta) */
+	felem_diff64(ftmp, delta);
+	/* ftmp[i] < 2^61 */
+	felem_sum64(ftmp2, delta);
+	/* ftmp2[i] < 2^60 + 2^15 */
+	felem_scalar64(ftmp2, 3);
+	/* ftmp2[i] < 3*2^60 + 3*2^15 */
+	felem_mul(tmp, ftmp, ftmp2);
+	/* tmp[i] < 17(3*2^121 + 3*2^76)
+	 *        = 61*2^121 + 61*2^76
+	 *        < 64*2^121 + 64*2^76
+	 *        = 2^127 + 2^82
+	 *        < 2^128 */
+	felem_reduce(alpha, tmp);
+
+	/* x' = alpha^2 - 8*beta */
+	felem_square(tmp, alpha);
+	/* tmp[i] < 17*2^120
+	 *        < 2^125 */
+	felem_assign(ftmp, beta);
+	felem_scalar64(ftmp, 8);
+	/* ftmp[i] < 2^62 + 2^17 */
+	felem_diff_128_64(tmp, ftmp);
+	/* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
+	felem_reduce(x_out, tmp);
+
+	/* z' = (y + z)^2 - gamma - delta */
+	felem_sum64(delta, gamma);
+	/* delta[i] < 2^60 + 2^15 */
+	felem_assign(ftmp, y_in);
+	felem_sum64(ftmp, z_in);
+	/* ftmp[i] < 2^60 + 2^15 */
+	felem_square(tmp, ftmp);
+	/* tmp[i] < 17(2^122)
+	 *        < 2^127 */
+	felem_diff_128_64(tmp, delta);
+	/* tmp[i] < 2^127 + 2^63 */
+	felem_reduce(z_out, tmp);
+
+	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
+	felem_scalar64(beta, 4);
+	/* beta[i] < 2^61 + 2^16 */
+	felem_diff64(beta, x_out);
+	/* beta[i] < 2^61 + 2^60 + 2^16 */
+	felem_mul(tmp, alpha, beta);
+	/* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
+	 *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30) 
+	 *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
+	 *        < 2^128 */
+	felem_square(tmp2, gamma);
+	/* tmp2[i] < 17*(2^59 + 2^14)^2
+	 *         = 17*(2^118 + 2^74 + 2^28) */
+	felem_scalar128(tmp2, 8);
+	/* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
+	 *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
+	 *         < 2^126 */
+	felem_diff128(tmp, tmp2);
+	/* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
+	 *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
+	 *          2^74 + 2^69 + 2^34 + 2^30
+	 *        < 2^128 */
+	felem_reduce(y_out, tmp);
+	}
+
+/* copy_conditional copies in to out iff mask is all ones. */
+static void
+copy_conditional(felem out, const felem in, limb mask)
+	{
+	unsigned i;
+	for (i = 0; i < NLIMBS; ++i)
+		{
+		const limb tmp = mask & (in[i] ^ out[i]);
+		out[i] ^= tmp;
+		}
+	}
+
+/* point_add calcuates (x1, y1, z1) + (x2, y2, z2)
+ *
+ * The method is taken from
+ *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
+ * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
+ *
+ * This function includes a branch for checking whether the two input points
+ * are equal (while not equal to the point at infinity). This case never
+ * happens during single point multiplication, so there is no timing leak for
+ * ECDH or ECDSA signing. */
+static void point_add(felem x3, felem y3, felem z3,
+	const felem x1, const felem y1, const felem z1,
+	const int mixed, const felem x2, const felem y2, const felem z2)
+	{
+	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
+	largefelem tmp, tmp2;
+	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
+
+	z1_is_zero = felem_is_zero(z1);
+	z2_is_zero = felem_is_zero(z2);
+
+	/* ftmp = z1z1 = z1**2 */
+	felem_square(tmp, z1);
+	felem_reduce(ftmp, tmp);
+
+	if (!mixed)
+		{
+		/* ftmp2 = z2z2 = z2**2 */
+		felem_square(tmp, z2);
+		felem_reduce(ftmp2, tmp);
+
+		/* u1 = ftmp3 = x1*z2z2 */
+		felem_mul(tmp, x1, ftmp2);
+		felem_reduce(ftmp3, tmp);
+
+		/* ftmp5 = z1 + z2 */
+		felem_assign(ftmp5, z1);
+		felem_sum64(ftmp5, z2);
+		/* ftmp5[i] < 2^61 */
+
+		/* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
+		felem_square(tmp, ftmp5);
+		/* tmp[i] < 17*2^122 */
+		felem_diff_128_64(tmp, ftmp);
+		/* tmp[i] < 17*2^122 + 2^63 */
+		felem_diff_128_64(tmp, ftmp2);
+		/* tmp[i] < 17*2^122 + 2^64 */
+		felem_reduce(ftmp5, tmp);
+
+		/* ftmp2 = z2 * z2z2 */
+		felem_mul(tmp, ftmp2, z2);
+		felem_reduce(ftmp2, tmp);
+
+		/* s1 = ftmp6 = y1 * z2**3 */
+		felem_mul(tmp, y1, ftmp2);
+		felem_reduce(ftmp6, tmp);
+		}
+	else
+		{
+		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
+
+		/* u1 = ftmp3 = x1*z2z2 */
+		felem_assign(ftmp3, x1);
+
+		/* ftmp5 = 2*z1z2 */
+		felem_scalar(ftmp5, z1, 2);
+
+		/* s1 = ftmp6 = y1 * z2**3 */
+		felem_assign(ftmp6, y1);
+		}
+
+	/* u2 = x2*z1z1 */
+	felem_mul(tmp, x2, ftmp);
+	/* tmp[i] < 17*2^120 */
+
+	/* h = ftmp4 = u2 - u1 */
+	felem_diff_128_64(tmp, ftmp3);
+	/* tmp[i] < 17*2^120 + 2^63 */
+	felem_reduce(ftmp4, tmp);
+
+	x_equal = felem_is_zero(ftmp4);
+
+	/* z_out = ftmp5 * h */
+	felem_mul(tmp, ftmp5, ftmp4);
+	felem_reduce(z_out, tmp);
+
+	/* ftmp = z1 * z1z1 */
+	felem_mul(tmp, ftmp, z1);
+	felem_reduce(ftmp, tmp);
+
+	/* s2 = tmp = y2 * z1**3 */
+	felem_mul(tmp, y2, ftmp);
+	/* tmp[i] < 17*2^120 */
+
+	/* r = ftmp5 = (s2 - s1)*2 */
+	felem_diff_128_64(tmp, ftmp6);
+	/* tmp[i] < 17*2^120 + 2^63 */
+	felem_reduce(ftmp5, tmp);
+	y_equal = felem_is_zero(ftmp5);
+	felem_scalar64(ftmp5, 2);
+	/* ftmp5[i] < 2^61 */
+
+	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
+		{
+		point_double(x3, y3, z3, x1, y1, z1);
+		return;
+		}
+
+	/* I = ftmp = (2h)**2 */
+	felem_assign(ftmp, ftmp4);
+	felem_scalar64(ftmp, 2);
+	/* ftmp[i] < 2^61 */
+	felem_square(tmp, ftmp);
+	/* tmp[i] < 17*2^122 */
+	felem_reduce(ftmp, tmp);
+
+	/* J = ftmp2 = h * I */
+	felem_mul(tmp, ftmp4, ftmp);
+	felem_reduce(ftmp2, tmp);
+
+	/* V = ftmp4 = U1 * I */
+	felem_mul(tmp, ftmp3, ftmp);
+	felem_reduce(ftmp4, tmp);
+
+	/* x_out = r**2 - J - 2V */
+	felem_square(tmp, ftmp5);
+	/* tmp[i] < 17*2^122 */
+	felem_diff_128_64(tmp, ftmp2);
+	/* tmp[i] < 17*2^122 + 2^63 */
+	felem_assign(ftmp3, ftmp4);
+	felem_scalar64(ftmp4, 2);
+	/* ftmp4[i] < 2^61 */
+	felem_diff_128_64(tmp, ftmp4);
+	/* tmp[i] < 17*2^122 + 2^64 */
+	felem_reduce(x_out, tmp);
+
+	/* y_out = r(V-x_out) - 2 * s1 * J */
+	felem_diff64(ftmp3, x_out);
+	/* ftmp3[i] < 2^60 + 2^60
+	 *          = 2^61 */
+	felem_mul(tmp, ftmp5, ftmp3);
+	/* tmp[i] < 17*2^122 */
+	felem_mul(tmp2, ftmp6, ftmp2);
+	/* tmp2[i] < 17*2^120 */
+	felem_scalar128(tmp2, 2);
+	/* tmp2[i] < 17*2^121 */
+	felem_diff128(tmp, tmp2);
+	/* tmp[i] < 2^127 - 2^69 + 17*2^122
+	 *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
+	 *        < 2^127 */
+	felem_reduce(y_out, tmp);
+
+	copy_conditional(x_out, x2, z1_is_zero);
+	copy_conditional(x_out, x1, z2_is_zero);
+	copy_conditional(y_out, y2, z1_is_zero);
+	copy_conditional(y_out, y1, z2_is_zero);
+	copy_conditional(z_out, z2, z1_is_zero);
+	copy_conditional(z_out, z1, z2_is_zero);
+	felem_assign(x3, x_out);
+	felem_assign(y3, y_out);
+	felem_assign(z3, z_out);
+	}
+
+/* Base point pre computation
+ * --------------------------
+ *
+ * Two different sorts of precomputed tables are used in the following code.
+ * Each contain various points on the curve, where each point is three field
+ * elements (x, y, z).
+ *
+ * For the base point table, z is usually 1 (0 for the point at infinity).
+ * This table has 16 elements:
+ * index | bits    | point
+ * ------+---------+------------------------------
+ *     0 | 0 0 0 0 | 0G
+ *     1 | 0 0 0 1 | 1G
+ *     2 | 0 0 1 0 | 2^130G
+ *     3 | 0 0 1 1 | (2^130 + 1)G
+ *     4 | 0 1 0 0 | 2^260G
+ *     5 | 0 1 0 1 | (2^260 + 1)G
+ *     6 | 0 1 1 0 | (2^260 + 2^130)G
+ *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
+ *     8 | 1 0 0 0 | 2^390G
+ *     9 | 1 0 0 1 | (2^390 + 1)G
+ *    10 | 1 0 1 0 | (2^390 + 2^130)G
+ *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
+ *    12 | 1 1 0 0 | (2^390 + 2^260)G
+ *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
+ *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
+ *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
+ *
+ * The reason for this is so that we can clock bits into four different
+ * locations when doing simple scalar multiplies against the base point.
+ *
+ * Tables for other points have table[i] = iG for i in 0 .. 16. */
+
+/* gmul is the table of precomputed base points */
+static const felem gmul[16][3] =
+	{{{0, 0, 0, 0, 0, 0, 0, 0, 0},
+	  {0, 0, 0, 0, 0, 0, 0, 0, 0},
+	  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
+	   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
+	   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
+	  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
+	   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
+	   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
+	   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
+	   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
+	  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
+	   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
+	   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
+	   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
+	   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
+	  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
+	   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
+	   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
+	   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
+	   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
+	  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
+	   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
+	   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
+	   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
+	   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
+	  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
+	   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
+	   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
+	   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
+	   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
+	  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
+	   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
+	   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
+	   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
+	   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
+	  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
+	   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
+	   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
+	   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
+	   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
+	  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
+	   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
+	   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
+	   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
+	   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
+	  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
+	   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
+	   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
+	   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
+	   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
+	  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
+	   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
+	   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
+	   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
+	   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
+	  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
+	   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
+	   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
+	   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
+	   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
+	  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
+	   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
+	   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
+	   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
+	   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
+	  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
+	   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
+	   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
+	   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
+	   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
+	  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
+	   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
+	   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
+	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
+	 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
+	   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
+	   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
+	  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
+	   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
+	   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
+	 {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
+
+/* select_point selects the |idx|th point from a precomputation table and
+ * copies it to out. */
+static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3],
+			 felem out[3])
+	{
+	unsigned i, j;
+	limb *outlimbs = &out[0][0];
+	memset(outlimbs, 0, 3 * sizeof(felem));
+
+	for (i = 0; i < size; i++)
+		{
+		const limb *inlimbs = &pre_comp[i][0][0];
+		limb mask = i ^ idx;
+		mask |= mask >> 4;
+		mask |= mask >> 2;
+		mask |= mask >> 1;
+		mask &= 1;
+		mask--;
+		for (j = 0; j < NLIMBS * 3; j++)
+			outlimbs[j] |= inlimbs[j] & mask;
+		}
+	}
+
+/* get_bit returns the |i|th bit in |in| */
+static char get_bit(const felem_bytearray in, int i)
+	{
+	if (i < 0)
+		return 0;
+	return (in[i >> 3] >> (i & 7)) & 1;
+	}
+
+/* Interleaved point multiplication using precomputed point multiples:
+ * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
+ * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
+ * of the generator, using certain (large) precomputed multiples in g_pre_comp.
+ * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
+static void batch_mul(felem x_out, felem y_out, felem z_out,
+	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
+	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
+	{
+	int i, skip;
+	unsigned num, gen_mul = (g_scalar != NULL);
+	felem nq[3], tmp[4];
+	limb bits;
+	u8 sign, digit;
+
+	/* set nq to the point at infinity */
+	memset(nq, 0, 3 * sizeof(felem));
+
+	/* Loop over all scalars msb-to-lsb, interleaving additions
+	 * of multiples of the generator (last quarter of rounds)
+	 * and additions of other points multiples (every 5th round).
+	 */
+	skip = 1; /* save two point operations in the first round */
+	for (i = (num_points ? 520 : 130); i >= 0; --i)
+		{
+		/* double */
+		if (!skip)
+			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
+
+		/* add multiples of the generator */
+		if (gen_mul && (i <= 130))
+			{
+			bits = get_bit(g_scalar, i + 390) << 3;
+			if (i < 130)
+				{
+				bits |= get_bit(g_scalar, i + 260) << 2;
+				bits |= get_bit(g_scalar, i + 130) << 1;
+				bits |= get_bit(g_scalar, i);
+				}
+			/* select the point to add, in constant time */
+			select_point(bits, 16, g_pre_comp, tmp);
+			if (!skip)
+				{
+				point_add(nq[0], nq[1], nq[2],
+					nq[0], nq[1], nq[2],
+					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
+				}
+			else
+				{
+				memcpy(nq, tmp, 3 * sizeof(felem));
+				skip = 0;
+				}
+			}
+
+		/* do other additions every 5 doublings */
+		if (num_points && (i % 5 == 0))
+			{
+			/* loop over all scalars */
+			for (num = 0; num < num_points; ++num)
+				{
+				bits = get_bit(scalars[num], i + 4) << 5;
+				bits |= get_bit(scalars[num], i + 3) << 4;
+				bits |= get_bit(scalars[num], i + 2) << 3;
+				bits |= get_bit(scalars[num], i + 1) << 2;
+				bits |= get_bit(scalars[num], i) << 1;
+				bits |= get_bit(scalars[num], i - 1);
+				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
+
+				/* select the point to add or subtract, in constant time */
+				select_point(digit, 17, pre_comp[num], tmp);
+				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
+				copy_conditional(tmp[1], tmp[3], (-(limb) sign));
+
+				if (!skip)
+					{
+					point_add(nq[0], nq[1], nq[2],
+						nq[0], nq[1], nq[2],
+						mixed, tmp[0], tmp[1], tmp[2]);
+					}
+				else
+					{
+					memcpy(nq, tmp, 3 * sizeof(felem));
+					skip = 0;
+					}
+				}
+			}
+		}
+	felem_assign(x_out, nq[0]);
+	felem_assign(y_out, nq[1]);
+	felem_assign(z_out, nq[2]);
+	}
+
+
+/* Precomputation for the group generator. */
+typedef struct {
+	felem g_pre_comp[16][3];
+	int references;
+} NISTP521_PRE_COMP;
+
+const EC_METHOD *EC_GFp_nistp521_method(void)
+	{
+	static const EC_METHOD ret = {
+		EC_FLAGS_DEFAULT_OCT,
+		NID_X9_62_prime_field,
+		ec_GFp_nistp521_group_init,
+		ec_GFp_simple_group_finish,
+		ec_GFp_simple_group_clear_finish,
+		ec_GFp_nist_group_copy,
+		ec_GFp_nistp521_group_set_curve,
+		ec_GFp_simple_group_get_curve,
+		ec_GFp_simple_group_get_degree,
+		ec_GFp_simple_group_check_discriminant,
+		ec_GFp_simple_point_init,
+		ec_GFp_simple_point_finish,
+		ec_GFp_simple_point_clear_finish,
+		ec_GFp_simple_point_copy,
+		ec_GFp_simple_point_set_to_infinity,
+		ec_GFp_simple_set_Jprojective_coordinates_GFp,
+		ec_GFp_simple_get_Jprojective_coordinates_GFp,
+		ec_GFp_simple_point_set_affine_coordinates,
+		ec_GFp_nistp521_point_get_affine_coordinates,
+		0 /* point_set_compressed_coordinates */,
+		0 /* point2oct */,
+		0 /* oct2point */,
+		ec_GFp_simple_add,
+		ec_GFp_simple_dbl,
+		ec_GFp_simple_invert,
+		ec_GFp_simple_is_at_infinity,
+		ec_GFp_simple_is_on_curve,
+		ec_GFp_simple_cmp,
+		ec_GFp_simple_make_affine,
+		ec_GFp_simple_points_make_affine,
+		ec_GFp_nistp521_points_mul,
+		ec_GFp_nistp521_precompute_mult,
+		ec_GFp_nistp521_have_precompute_mult,
+		ec_GFp_nist_field_mul,
+		ec_GFp_nist_field_sqr,
+		0 /* field_div */,
+		0 /* field_encode */,
+		0 /* field_decode */,
+		0 /* field_set_to_one */ };
+
+	return &ret;
+	}
+
+
+/******************************************************************************/
+/*		       FUNCTIONS TO MANAGE PRECOMPUTATION
+ */
+
+static NISTP521_PRE_COMP *nistp521_pre_comp_new()
+	{
+	NISTP521_PRE_COMP *ret = NULL;
+	ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
+	if (!ret)
+		{
+		ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
+		return ret;
+		}
+	memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
+	ret->references = 1;
+	return ret;
+	}
+
+static void *nistp521_pre_comp_dup(void *src_)
+	{
+	NISTP521_PRE_COMP *src = src_;
+
+	/* no need to actually copy, these objects never change! */
+	CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
+
+	return src_;
+	}
+
+static void nistp521_pre_comp_free(void *pre_)
+	{
+	int i;
+	NISTP521_PRE_COMP *pre = pre_;
+
+	if (!pre)
+		return;
+
+	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
+	if (i > 0)
+		return;
+
+	OPENSSL_free(pre);
+	}
+
+static void nistp521_pre_comp_clear_free(void *pre_)
+	{
+	int i;
+	NISTP521_PRE_COMP *pre = pre_;
+
+	if (!pre)
+		return;
+
+	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
+	if (i > 0)
+		return;
+
+	OPENSSL_cleanse(pre, sizeof(*pre));
+	OPENSSL_free(pre);
+	}
+
+/******************************************************************************/
+/*			   OPENSSL EC_METHOD FUNCTIONS
+ */
+
+int ec_GFp_nistp521_group_init(EC_GROUP *group)
+	{
+	int ret;
+	ret = ec_GFp_simple_group_init(group);
+	group->a_is_minus3 = 1;
+	return ret;
+	}
+
+int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
+	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
+	{
+	int ret = 0;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *curve_p, *curve_a, *curve_b;
+
+	if (ctx == NULL)
+		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
+	BN_CTX_start(ctx);
+	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
+		((curve_a = BN_CTX_get(ctx)) == NULL) ||
+		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
+	BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
+	BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
+	BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
+	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
+		(BN_cmp(curve_b, b)))
+		{
+		ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
+			EC_R_WRONG_CURVE_PARAMETERS);
+		goto err;
+		}
+	group->field_mod_func = BN_nist_mod_521;
+	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
+err:
+	BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+	}
+
+/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
+ * (X', Y') = (X/Z^2, Y/Z^3) */
+int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
+	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
+	{
+	felem z1, z2, x_in, y_in, x_out, y_out;
+	largefelem tmp;
+
+	if (EC_POINT_is_at_infinity(group, point))
+		{
+		ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
+			EC_R_POINT_AT_INFINITY);
+		return 0;
+		}
+	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
+		(!BN_to_felem(z1, &point->Z))) return 0;
+	felem_inv(z2, z1);
+	felem_square(tmp, z2); felem_reduce(z1, tmp);
+	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
+	felem_contract(x_out, x_in);
+	if (x != NULL)
+		{
+		if (!felem_to_BN(x, x_out))
+			{
+			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
+			return 0;
+			}
+		}
+	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
+	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
+	felem_contract(y_out, y_in);
+	if (y != NULL)
+		{
+		if (!felem_to_BN(y, y_out))
+			{
+			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
+			return 0;
+			}
+		}
+	return 1;
+	}
+
+static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */])
+	{
+	/* Runs in constant time, unless an input is the point at infinity
+	 * (which normally shouldn't happen). */
+	ec_GFp_nistp_points_make_affine_internal(
+		num,
+		points,
+		sizeof(felem),
+		tmp_felems,
+		(void (*)(void *)) felem_one,
+		(int (*)(const void *)) felem_is_zero_int,
+		(void (*)(void *, const void *)) felem_assign,
+		(void (*)(void *, const void *)) felem_square_reduce,
+		(void (*)(void *, const void *, const void *)) felem_mul_reduce,
+		(void (*)(void *, const void *)) felem_inv,
+		(void (*)(void *, const void *)) felem_contract);
+	}
+
+/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values
+ * Result is stored in r (r can equal one of the inputs). */
+int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
+	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
+	const BIGNUM *scalars[], BN_CTX *ctx)
+	{
+	int ret = 0;
+	int j;
+	int mixed = 0;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *x, *y, *z, *tmp_scalar;
+	felem_bytearray g_secret;
+	felem_bytearray *secrets = NULL;
+	felem (*pre_comp)[17][3] = NULL;
+	felem *tmp_felems = NULL;
+	felem_bytearray tmp;
+	unsigned i, num_bytes;
+	int have_pre_comp = 0;
+	size_t num_points = num;
+	felem x_in, y_in, z_in, x_out, y_out, z_out;
+	NISTP521_PRE_COMP *pre = NULL;
+	felem (*g_pre_comp)[3] = NULL;
+	EC_POINT *generator = NULL;
+	const EC_POINT *p = NULL;
+	const BIGNUM *p_scalar = NULL;
+
+	if (ctx == NULL)
+		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
+	BN_CTX_start(ctx);
+	if (((x = BN_CTX_get(ctx)) == NULL) ||
+		((y = BN_CTX_get(ctx)) == NULL) ||
+		((z = BN_CTX_get(ctx)) == NULL) ||
+		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
+		goto err;
+
+	if (scalar != NULL)
+		{
+		pre = EC_EX_DATA_get_data(group->extra_data,
+			nistp521_pre_comp_dup, nistp521_pre_comp_free,
+			nistp521_pre_comp_clear_free);
+		if (pre)
+			/* we have precomputation, try to use it */
+			g_pre_comp = &pre->g_pre_comp[0];
+		else
+			/* try to use the standard precomputation */
+			g_pre_comp = (felem (*)[3]) gmul;
+		generator = EC_POINT_new(group);
+		if (generator == NULL)
+			goto err;
+		/* get the generator from precomputation */
+		if (!felem_to_BN(x, g_pre_comp[1][0]) ||
+			!felem_to_BN(y, g_pre_comp[1][1]) ||
+			!felem_to_BN(z, g_pre_comp[1][2]))
+			{
+			ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
+			goto err;
+			}
+		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
+				generator, x, y, z, ctx))
+			goto err;
+		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
+			/* precomputation matches generator */
+			have_pre_comp = 1;
+		else
+			/* we don't have valid precomputation:
+			 * treat the generator as a random point */
+			num_points++;
+		}
+
+	if (num_points > 0)
+		{
+		if (num_points >= 2)
+			{
+			/* unless we precompute multiples for just one point,
+			 * converting those into affine form is time well spent  */
+			mixed = 1;
+			}
+		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
+		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
+		if (mixed)
+			tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
+		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
+			{
+			ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
+			goto err;
+			}
+
+		/* we treat NULL scalars as 0, and NULL points as points at infinity,
+		 * i.e., they contribute nothing to the linear combination */
+		memset(secrets, 0, num_points * sizeof(felem_bytearray));
+		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
+		for (i = 0; i < num_points; ++i)
+			{
+			if (i == num)
+				/* we didn't have a valid precomputation, so we pick
+				 * the generator */
+				{
+				p = EC_GROUP_get0_generator(group);
+				p_scalar = scalar;
+				}
+			else
+				/* the i^th point */
+				{
+				p = points[i];
+				p_scalar = scalars[i];
+				}
+			if ((p_scalar != NULL) && (p != NULL))
+				{
+				/* reduce scalar to 0 <= scalar < 2^521 */
+				if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar)))
+					{
+					/* this is an unusual input, and we don't guarantee
+					 * constant-timeness */
+					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
+						{
+						ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
+						goto err;
+						}
+					num_bytes = BN_bn2bin(tmp_scalar, tmp);
+					}
+				else
+					num_bytes = BN_bn2bin(p_scalar, tmp);
+				flip_endian(secrets[i], tmp, num_bytes);
+				/* precompute multiples */
+				if ((!BN_to_felem(x_out, &p->X)) ||
+					(!BN_to_felem(y_out, &p->Y)) ||
+					(!BN_to_felem(z_out, &p->Z))) goto err;
+				memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
+				memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
+				memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
+				for (j = 2; j <= 16; ++j)
+					{
+					if (j & 1)
+						{
+						point_add(
+							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
+							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
+							0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
+						}
+					else
+						{
+						point_double(
+							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
+							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
+						}
+					}
+				}
+			}
+		if (mixed)
+			make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
+		}
+
+	/* the scalar for the generator */
+	if ((scalar != NULL) && (have_pre_comp))
+		{
+		memset(g_secret, 0, sizeof(g_secret));
+		/* reduce scalar to 0 <= scalar < 2^521 */
+		if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar)))
+			{
+			/* this is an unusual input, and we don't guarantee
+			 * constant-timeness */
+			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
+				{
+				ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
+				goto err;
+				}
+			num_bytes = BN_bn2bin(tmp_scalar, tmp);
+			}
+		else
+			num_bytes = BN_bn2bin(scalar, tmp);
+		flip_endian(g_secret, tmp, num_bytes);
+		/* do the multiplication with generator precomputation*/
+		batch_mul(x_out, y_out, z_out,
+			(const felem_bytearray (*)) secrets, num_points,
+			g_secret,
+			mixed, (const felem (*)[17][3]) pre_comp,
+			(const felem (*)[3]) g_pre_comp);
+		}
+	else
+		/* do the multiplication without generator precomputation */
+		batch_mul(x_out, y_out, z_out,
+			(const felem_bytearray (*)) secrets, num_points,
+			NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
+	/* reduce the output to its unique minimal representation */
+	felem_contract(x_in, x_out);
+	felem_contract(y_in, y_out);
+	felem_contract(z_in, z_out);
+	if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
+		(!felem_to_BN(z, z_in)))
+		{
+		ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
+		goto err;
+		}
+	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
+
+err:
+	BN_CTX_end(ctx);
+	if (generator != NULL)
+		EC_POINT_free(generator);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	if (secrets != NULL)
+		OPENSSL_free(secrets);
+	if (pre_comp != NULL)
+		OPENSSL_free(pre_comp);
+	if (tmp_felems != NULL)
+		OPENSSL_free(tmp_felems);
+	return ret;
+	}
+
+int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
+	{
+	int ret = 0;
+	NISTP521_PRE_COMP *pre = NULL;
+	int i, j;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *x, *y;
+	EC_POINT *generator = NULL;
+	felem tmp_felems[16];
+
+	/* throw away old precomputation */
+	EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
+		nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
+	if (ctx == NULL)
+		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
+	BN_CTX_start(ctx);
+	if (((x = BN_CTX_get(ctx)) == NULL) ||
+		((y = BN_CTX_get(ctx)) == NULL))
+		goto err;
+	/* get the generator */
+	if (group->generator == NULL) goto err;
+	generator = EC_POINT_new(group);
+	if (generator == NULL)
+		goto err;
+	BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x);
+	BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y);
+	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
+		goto err;
+	if ((pre = nistp521_pre_comp_new()) == NULL)
+		goto err;
+	/* if the generator is the standard one, use built-in precomputation */
+	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
+		{
+		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
+		ret = 1;
+		goto err;
+		}
+	if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
+		(!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
+		(!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
+		goto err;
+	/* compute 2^130*G, 2^260*G, 2^390*G */
+	for (i = 1; i <= 4; i <<= 1)
+		{
+		point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
+			pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
+			pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
+		for (j = 0; j < 129; ++j)
+			{
+			point_double(pre->g_pre_comp[2*i][0],
+				pre->g_pre_comp[2*i][1],
+				pre->g_pre_comp[2*i][2],
+				pre->g_pre_comp[2*i][0],
+				pre->g_pre_comp[2*i][1],
+				pre->g_pre_comp[2*i][2]);
+			}
+		}
+	/* g_pre_comp[0] is the point at infinity */
+	memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
+	/* the remaining multiples */
+	/* 2^130*G + 2^260*G */
+	point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
+		pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
+		pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
+		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
+		pre->g_pre_comp[2][2]);
+	/* 2^130*G + 2^390*G */
+	point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
+		pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
+		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
+		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
+		pre->g_pre_comp[2][2]);
+	/* 2^260*G + 2^390*G */
+	point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
+		pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
+		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
+		0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
+		pre->g_pre_comp[4][2]);
+	/* 2^130*G + 2^260*G + 2^390*G */
+	point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
+		pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
+		pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
+		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
+		pre->g_pre_comp[2][2]);
+	for (i = 1; i < 8; ++i)
+		{
+		/* odd multiples: add G */
+		point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
+			pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
+			pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
+			0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
+			pre->g_pre_comp[1][2]);
+		}
+	make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
+
+	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
+			nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
+		goto err;
+	ret = 1;
+	pre = NULL;
+ err:
+	BN_CTX_end(ctx);
+	if (generator != NULL)
+		EC_POINT_free(generator);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	if (pre)
+		nistp521_pre_comp_free(pre);
+	return ret;
+	}
+
+int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
+	{
+	if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
+			nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
+		!= NULL)
+		return 1;
+	else
+		return 0;
+	}
+
+#else
+static void *dummy=&dummy;
+#endif
diff --git a/src/lib/libcrypto/ec/ecp_nistputil.c b/src/lib/libcrypto/ec/ecp_nistputil.c
new file mode 100644
index 0000000000..c8140c807f
--- /dev/null
+++ b/src/lib/libcrypto/ec/ecp_nistputil.c
@@ -0,0 +1,197 @@
+/* crypto/ec/ecp_nistputil.c */
+/*
+ * Written by Bodo Moeller for the OpenSSL project.
+ */
+/* Copyright 2011 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ *
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <openssl/opensslconf.h>
+#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
+
+/*
+ * Common utility functions for ecp_nistp224.c, ecp_nistp256.c, ecp_nistp521.c.
+ */
+
+#include <stddef.h>
+#include "ec_lcl.h"
+
+/* Convert an array of points into affine coordinates.
+ * (If the point at infinity is found (Z = 0), it remains unchanged.)
+ * This function is essentially an equivalent to EC_POINTs_make_affine(), but
+ * works with the internal representation of points as used by ecp_nistp###.c
+ * rather than with (BIGNUM-based) EC_POINT data structures.
+ *
+ * point_array is the input/output buffer ('num' points in projective form,
+ * i.e. three coordinates each), based on an internal representation of
+ * field elements of size 'felem_size'.
+ *
+ * tmp_felems needs to point to a temporary array of 'num'+1 field elements
+ * for storage of intermediate values.
+ */
+void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array,
+	size_t felem_size, void *tmp_felems,
+	void (*felem_one)(void *out),
+	int (*felem_is_zero)(const void *in),
+	void (*felem_assign)(void *out, const void *in),
+	void (*felem_square)(void *out, const void *in),
+	void (*felem_mul)(void *out, const void *in1, const void *in2),
+	void (*felem_inv)(void *out, const void *in),
+	void (*felem_contract)(void *out, const void *in))
+	{
+	int i = 0;
+
+#define tmp_felem(I) (&((char *)tmp_felems)[(I) * felem_size])
+#define X(I) (&((char *)point_array)[3*(I) * felem_size])
+#define Y(I) (&((char *)point_array)[(3*(I) + 1) * felem_size])
+#define Z(I) (&((char *)point_array)[(3*(I) + 2) * felem_size])
+
+	if (!felem_is_zero(Z(0)))
+		felem_assign(tmp_felem(0), Z(0));
+	else
+		felem_one(tmp_felem(0));
+	for (i = 1; i < (int)num; i++)
+		{
+		if (!felem_is_zero(Z(i)))
+			felem_mul(tmp_felem(i), tmp_felem(i-1), Z(i));
+		else
+			felem_assign(tmp_felem(i), tmp_felem(i-1));
+		}
+	/* Now each tmp_felem(i) is the product of Z(0) .. Z(i), skipping any zero-valued factors:
+	 * if Z(i) = 0, we essentially pretend that Z(i) = 1 */
+
+	felem_inv(tmp_felem(num-1), tmp_felem(num-1));
+	for (i = num - 1; i >= 0; i--)
+		{
+		if (i > 0)
+			/* tmp_felem(i-1) is the product of Z(0) .. Z(i-1),
+			 * tmp_felem(i) is the inverse of the product of Z(0) .. Z(i)
+			 */
+			felem_mul(tmp_felem(num), tmp_felem(i-1), tmp_felem(i)); /* 1/Z(i) */
+		else
+			felem_assign(tmp_felem(num), tmp_felem(0)); /* 1/Z(0) */
+
+		if (!felem_is_zero(Z(i)))
+			{
+			if (i > 0)
+				/* For next iteration, replace tmp_felem(i-1) by its inverse */
+				felem_mul(tmp_felem(i-1), tmp_felem(i), Z(i));
+
+			/* Convert point (X, Y, Z) into affine form (X/(Z^2), Y/(Z^3), 1) */
+			felem_square(Z(i), tmp_felem(num)); /* 1/(Z^2) */
+			felem_mul(X(i), X(i), Z(i)); /* X/(Z^2) */
+			felem_mul(Z(i), Z(i), tmp_felem(num)); /* 1/(Z^3) */
+			felem_mul(Y(i), Y(i), Z(i)); /* Y/(Z^3) */
+			felem_contract(X(i), X(i));
+			felem_contract(Y(i), Y(i));
+			felem_one(Z(i));
+			}
+		else
+			{
+			if (i > 0)
+				/* For next iteration, replace tmp_felem(i-1) by its inverse */
+				felem_assign(tmp_felem(i-1), tmp_felem(i));
+			}
+		}
+	}
+
+/*
+ * This function looks at 5+1 scalar bits (5 current, 1 adjacent less
+ * significant bit), and recodes them into a signed digit for use in fast point
+ * multiplication: the use of signed rather than unsigned digits means that
+ * fewer points need to be precomputed, given that point inversion is easy
+ * (a precomputed point dP makes -dP available as well).
+ *
+ * BACKGROUND:
+ *
+ * Signed digits for multiplication were introduced by Booth ("A signed binary
+ * multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV,
+ * pt. 2 (1951), pp. 236-240), in that case for multiplication of integers.
+ * Booth's original encoding did not generally improve the density of nonzero
+ * digits over the binary representation, and was merely meant to simplify the
+ * handling of signed factors given in two's complement; but it has since been
+ * shown to be the basis of various signed-digit representations that do have
+ * further advantages, including the wNAF, using the following general approach:
+ *
+ * (1) Given a binary representation
+ *
+ *       b_k  ...  b_2  b_1  b_0,
+ *
+ *     of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1
+ *     by using bit-wise subtraction as follows:
+ *
+ *        b_k b_(k-1)  ...  b_2  b_1  b_0
+ *      -     b_k      ...  b_3  b_2  b_1  b_0
+ *       -------------------------------------
+ *        s_k b_(k-1)  ...  s_3  s_2  s_1  s_0
+ *
+ *     A left-shift followed by subtraction of the original value yields a new
+ *     representation of the same value, using signed bits s_i = b_(i+1) - b_i.
+ *     This representation from Booth's paper has since appeared in the
+ *     literature under a variety of different names including "reversed binary
+ *     form", "alternating greedy expansion", "mutual opposite form", and
+ *     "sign-alternating {+-1}-representation".
+ *
+ *     An interesting property is that among the nonzero bits, values 1 and -1
+ *     strictly alternate.
+ *
+ * (2) Various window schemes can be applied to the Booth representation of
+ *     integers: for example, right-to-left sliding windows yield the wNAF
+ *     (a signed-digit encoding independently discovered by various researchers
+ *     in the 1990s), and left-to-right sliding windows yield a left-to-right
+ *     equivalent of the wNAF (independently discovered by various researchers
+ *     around 2004).
+ *
+ * To prevent leaking information through side channels in point multiplication,
+ * we need to recode the given integer into a regular pattern: sliding windows
+ * as in wNAFs won't do, we need their fixed-window equivalent -- which is a few
+ * decades older: we'll be using the so-called "modified Booth encoding" due to
+ * MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49
+ * (1961), pp. 67-91), in a radix-2^5 setting.  That is, we always combine five
+ * signed bits into a signed digit:
+ *
+ *       s_(4j + 4) s_(4j + 3) s_(4j + 2) s_(4j + 1) s_(4j)
+ *
+ * The sign-alternating property implies that the resulting digit values are
+ * integers from -16 to 16.
+ *
+ * Of course, we don't actually need to compute the signed digits s_i as an
+ * intermediate step (that's just a nice way to see how this scheme relates
+ * to the wNAF): a direct computation obtains the recoded digit from the
+ * six bits b_(4j + 4) ... b_(4j - 1).
+ *
+ * This function takes those five bits as an integer (0 .. 63), writing the
+ * recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute
+ * value, in the range 0 .. 8).  Note that this integer essentially provides the
+ * input bits "shifted to the left" by one position: for example, the input to
+ * compute the least significant recoded digit, given that there's no bit b_-1,
+ * has to be b_4 b_3 b_2 b_1 b_0 0.
+ *
+ */
+void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in)
+	{
+	unsigned char s, d;
+
+	s = ~((in >> 5) - 1); /* sets all bits to MSB(in), 'in' seen as 6-bit value */
+	d = (1 << 6) - in - 1;
+	d = (d & s) | (in & ~s);
+	d = (d >> 1) + (d & 1);
+
+	*sign = s & 1;
+	*digit = d;
+	}
+#else
+static void *dummy=&dummy;
+#endif
diff --git a/src/lib/libcrypto/ec/ecp_oct.c b/src/lib/libcrypto/ec/ecp_oct.c
new file mode 100644
index 0000000000..374a0ee731
--- /dev/null
+++ b/src/lib/libcrypto/ec/ecp_oct.c
@@ -0,0 +1,433 @@
+/* crypto/ec/ecp_oct.c */
+/* Includes code written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
+ * for the OpenSSL project. 
+ * Includes code written by Bodo Moeller for the OpenSSL project.
+*/
+/* ====================================================================
+ * Copyright (c) 1998-2002 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ * Portions of this software developed by SUN MICROSYSTEMS, INC.,
+ * and contributed to the OpenSSL project.
+ */
+
+#include <openssl/err.h>
+#include <openssl/symhacks.h>
+
+#include "ec_lcl.h"
+
+int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
+	const BIGNUM *x_, int y_bit, BN_CTX *ctx)
+	{
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *tmp1, *tmp2, *x, *y;
+	int ret = 0;
+
+	/* clear error queue*/
+	ERR_clear_error();
+
+	if (ctx == NULL)
+		{
+		ctx = new_ctx = BN_CTX_new();
+		if (ctx == NULL)
+			return 0;
+		}
+
+	y_bit = (y_bit != 0);
+
+	BN_CTX_start(ctx);
+	tmp1 = BN_CTX_get(ctx);
+	tmp2 = BN_CTX_get(ctx);
+	x = BN_CTX_get(ctx);
+	y = BN_CTX_get(ctx);
+	if (y == NULL) goto err;
+
+	/* Recover y.  We have a Weierstrass equation
+	 *     y^2 = x^3 + a*x + b,
+	 * so  y  is one of the square roots of  x^3 + a*x + b.
+	 */
+
+	/* tmp1 := x^3 */
+	if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
+	if (group->meth->field_decode == 0)
+		{
+		/* field_{sqr,mul} work on standard representation */
+		if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
+		if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
+		}
+	else
+		{
+		if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
+		if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
+		}
+	
+	/* tmp1 := tmp1 + a*x */
+	if (group->a_is_minus3)
+		{
+		if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
+		if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
+		if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+		}
+	else
+		{
+		if (group->meth->field_decode)
+			{
+			if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
+			if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
+			}
+		else
+			{
+			/* field_mul works on standard representation */
+			if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
+			}
+		
+		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+		}
+	
+	/* tmp1 := tmp1 + b */
+	if (group->meth->field_decode)
+		{
+		if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
+		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
+		}
+	else
+		{
+		if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
+		}
+	
+	if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
+		{
+		unsigned long err = ERR_peek_last_error();
+		
+		if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
+			{
+			ERR_clear_error();
+			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+			}
+		else
+			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
+		goto err;
+		}
+
+	if (y_bit != BN_is_odd(y))
+		{
+		if (BN_is_zero(y))
+			{
+			int kron;
+
+			kron = BN_kronecker(x, &group->field, ctx);
+			if (kron == -2) goto err;
+
+			if (kron == 1)
+				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
+			else
+				/* BN_mod_sqrt() should have cought this error (not a square) */
+				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
+			goto err;
+			}
+		if (!BN_usub(y, &group->field, y)) goto err;
+		}
+	if (y_bit != BN_is_odd(y))
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
+		goto err;
+		}
+
+	if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+	ret = 1;
+
+ err:
+	BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+	}
+
+
+size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
+	unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	size_t ret;
+	BN_CTX *new_ctx = NULL;
+	int used_ctx = 0;
+	BIGNUM *x, *y;
+	size_t field_len, i, skip;
+
+	if ((form != POINT_CONVERSION_COMPRESSED)
+		&& (form != POINT_CONVERSION_UNCOMPRESSED)
+		&& (form != POINT_CONVERSION_HYBRID))
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
+		goto err;
+		}
+
+	if (EC_POINT_is_at_infinity(group, point))
+		{
+		/* encodes to a single 0 octet */
+		if (buf != NULL)
+			{
+			if (len < 1)
+				{
+				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+				return 0;
+				}
+			buf[0] = 0;
+			}
+		return 1;
+		}
+
+
+	/* ret := required output buffer length */
+	field_len = BN_num_bytes(&group->field);
+	ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+	/* if 'buf' is NULL, just return required length */
+	if (buf != NULL)
+		{
+		if (len < ret)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
+			goto err;
+			}
+
+		if (ctx == NULL)
+			{
+			ctx = new_ctx = BN_CTX_new();
+			if (ctx == NULL)
+				return 0;
+			}
+
+		BN_CTX_start(ctx);
+		used_ctx = 1;
+		x = BN_CTX_get(ctx);
+		y = BN_CTX_get(ctx);
+		if (y == NULL) goto err;
+
+		if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+
+		if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
+			buf[0] = form + 1;
+		else
+			buf[0] = form;
+	
+		i = 1;
+		
+		skip = field_len - BN_num_bytes(x);
+		if (skip > field_len)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+		while (skip > 0)
+			{
+			buf[i++] = 0;
+			skip--;
+			}
+		skip = BN_bn2bin(x, buf + i);
+		i += skip;
+		if (i != 1 + field_len)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+
+		if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
+			{
+			skip = field_len - BN_num_bytes(y);
+			if (skip > field_len)
+				{
+				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			while (skip > 0)
+				{
+				buf[i++] = 0;
+				skip--;
+				}
+			skip = BN_bn2bin(y, buf + i);
+			i += skip;
+			}
+
+		if (i != ret)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
+			goto err;
+			}
+		}
+	
+	if (used_ctx)
+		BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+
+ err:
+	if (used_ctx)
+		BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return 0;
+	}
+
+
+int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
+	const unsigned char *buf, size_t len, BN_CTX *ctx)
+	{
+	point_conversion_form_t form;
+	int y_bit;
+	BN_CTX *new_ctx = NULL;
+	BIGNUM *x, *y;
+	size_t field_len, enc_len;
+	int ret = 0;
+
+	if (len == 0)
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
+		return 0;
+		}
+	form = buf[0];
+	y_bit = form & 1;
+	form = form & ~1U;
+	if ((form != 0)	&& (form != POINT_CONVERSION_COMPRESSED)
+		&& (form != POINT_CONVERSION_UNCOMPRESSED)
+		&& (form != POINT_CONVERSION_HYBRID))
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+	if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+
+	if (form == 0)
+		{
+		if (len != 1)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+			return 0;
+			}
+
+		return EC_POINT_set_to_infinity(group, point);
+		}
+	
+	field_len = BN_num_bytes(&group->field);
+	enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
+
+	if (len != enc_len)
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		return 0;
+		}
+
+	if (ctx == NULL)
+		{
+		ctx = new_ctx = BN_CTX_new();
+		if (ctx == NULL)
+			return 0;
+		}
+
+	BN_CTX_start(ctx);
+	x = BN_CTX_get(ctx);
+	y = BN_CTX_get(ctx);
+	if (y == NULL) goto err;
+
+	if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
+	if (BN_ucmp(x, &group->field) >= 0)
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+		goto err;
+		}
+
+	if (form == POINT_CONVERSION_COMPRESSED)
+		{
+		if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
+		}
+	else
+		{
+		if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
+		if (BN_ucmp(y, &group->field) >= 0)
+			{
+			ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+			goto err;
+			}
+		if (form == POINT_CONVERSION_HYBRID)
+			{
+			if (y_bit != BN_is_odd(y))
+				{
+				ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
+				goto err;
+				}
+			}
+
+		if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
+		}
+	
+	if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
+		{
+		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
+		goto err;
+		}
+
+	ret = 1;
+	
+ err:
+	BN_CTX_end(ctx);
+	if (new_ctx != NULL)
+		BN_CTX_free(new_ctx);
+	return ret;
+	}
+
diff --git a/src/lib/libcrypto/ec/ecp_smpl.c b/src/lib/libcrypto/ec/ecp_smpl.c
index 66a92e2a90..7cbb321f9a 100644
--- a/src/lib/libcrypto/ec/ecp_smpl.c
+++ b/src/lib/libcrypto/ec/ecp_smpl.c
@@ -65,11 +65,19 @@
 #include <openssl/err.h>
 #include <openssl/symhacks.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 #include "ec_lcl.h"
 
 const EC_METHOD *EC_GFp_simple_method(void)
 	{
+#ifdef OPENSSL_FIPS
+	return fips_ec_gfp_simple_method();
+#else
 	static const EC_METHOD ret = {
+		EC_FLAGS_DEFAULT_OCT,
 		NID_X9_62_prime_field,
 		ec_GFp_simple_group_init,
 		ec_GFp_simple_group_finish,
@@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_simple_method(void)
 		ec_GFp_simple_get_Jprojective_coordinates_GFp,
 		ec_GFp_simple_point_set_affine_coordinates,
 		ec_GFp_simple_point_get_affine_coordinates,
-		ec_GFp_simple_set_compressed_coordinates,
-		ec_GFp_simple_point2oct,
-		ec_GFp_simple_oct2point,
+		0,0,0,
 		ec_GFp_simple_add,
 		ec_GFp_simple_dbl,
 		ec_GFp_simple_invert,
@@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_simple_method(void)
 		0 /* field_set_to_one */ };
 
 	return &ret;
+#endif
 	}
 
 
@@ -633,372 +640,6 @@ int ec_GFp_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_P
 	return ret;
 	}
 
-
-int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point,
-	const BIGNUM *x_, int y_bit, BN_CTX *ctx)
-	{
-	BN_CTX *new_ctx = NULL;
-	BIGNUM *tmp1, *tmp2, *x, *y;
-	int ret = 0;
-
-	/* clear error queue*/
-	ERR_clear_error();
-
-	if (ctx == NULL)
-		{
-		ctx = new_ctx = BN_CTX_new();
-		if (ctx == NULL)
-			return 0;
-		}
-
-	y_bit = (y_bit != 0);
-
-	BN_CTX_start(ctx);
-	tmp1 = BN_CTX_get(ctx);
-	tmp2 = BN_CTX_get(ctx);
-	x = BN_CTX_get(ctx);
-	y = BN_CTX_get(ctx);
-	if (y == NULL) goto err;
-
-	/* Recover y.  We have a Weierstrass equation
-	 *     y^2 = x^3 + a*x + b,
-	 * so  y  is one of the square roots of  x^3 + a*x + b.
-	 */
-
-	/* tmp1 := x^3 */
-	if (!BN_nnmod(x, x_, &group->field,ctx)) goto err;
-	if (group->meth->field_decode == 0)
-		{
-		/* field_{sqr,mul} work on standard representation */
-		if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err;
-		if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err;
-		}
-	else
-		{
-		if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err;
-		if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err;
-		}
-	
-	/* tmp1 := tmp1 + a*x */
-	if (group->a_is_minus3)
-		{
-		if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err;
-		if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err;
-		if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
-		}
-	else
-		{
-		if (group->meth->field_decode)
-			{
-			if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err;
-			if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err;
-			}
-		else
-			{
-			/* field_mul works on standard representation */
-			if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err;
-			}
-		
-		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
-		}
-	
-	/* tmp1 := tmp1 + b */
-	if (group->meth->field_decode)
-		{
-		if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err;
-		if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err;
-		}
-	else
-		{
-		if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err;
-		}
-	
-	if (!BN_mod_sqrt(y, tmp1, &group->field, ctx))
-		{
-		unsigned long err = ERR_peek_last_error();
-		
-		if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE)
-			{
-			ERR_clear_error();
-			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
-			}
-		else
-			ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB);
-		goto err;
-		}
-
-	if (y_bit != BN_is_odd(y))
-		{
-		if (BN_is_zero(y))
-			{
-			int kron;
-
-			kron = BN_kronecker(x, &group->field, ctx);
-			if (kron == -2) goto err;
-
-			if (kron == 1)
-				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT);
-			else
-				/* BN_mod_sqrt() should have cought this error (not a square) */
-				ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT);
-			goto err;
-			}
-		if (!BN_usub(y, &group->field, y)) goto err;
-		}
-	if (y_bit != BN_is_odd(y))
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR);
-		goto err;
-		}
-
-	if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
-	ret = 1;
-
- err:
-	BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-	}
-
-
-size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form,
-	unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	size_t ret;
-	BN_CTX *new_ctx = NULL;
-	int used_ctx = 0;
-	BIGNUM *x, *y;
-	size_t field_len, i, skip;
-
-	if ((form != POINT_CONVERSION_COMPRESSED)
-		&& (form != POINT_CONVERSION_UNCOMPRESSED)
-		&& (form != POINT_CONVERSION_HYBRID))
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM);
-		goto err;
-		}
-
-	if (EC_POINT_is_at_infinity(group, point))
-		{
-		/* encodes to a single 0 octet */
-		if (buf != NULL)
-			{
-			if (len < 1)
-				{
-				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
-				return 0;
-				}
-			buf[0] = 0;
-			}
-		return 1;
-		}
-
-
-	/* ret := required output buffer length */
-	field_len = BN_num_bytes(&group->field);
-	ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
-	/* if 'buf' is NULL, just return required length */
-	if (buf != NULL)
-		{
-		if (len < ret)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL);
-			goto err;
-			}
-
-		if (ctx == NULL)
-			{
-			ctx = new_ctx = BN_CTX_new();
-			if (ctx == NULL)
-				return 0;
-			}
-
-		BN_CTX_start(ctx);
-		used_ctx = 1;
-		x = BN_CTX_get(ctx);
-		y = BN_CTX_get(ctx);
-		if (y == NULL) goto err;
-
-		if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-
-		if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y))
-			buf[0] = form + 1;
-		else
-			buf[0] = form;
-	
-		i = 1;
-		
-		skip = field_len - BN_num_bytes(x);
-		if (skip > field_len)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-		while (skip > 0)
-			{
-			buf[i++] = 0;
-			skip--;
-			}
-		skip = BN_bn2bin(x, buf + i);
-		i += skip;
-		if (i != 1 + field_len)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-
-		if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID)
-			{
-			skip = field_len - BN_num_bytes(y);
-			if (skip > field_len)
-				{
-				ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-				goto err;
-				}
-			while (skip > 0)
-				{
-				buf[i++] = 0;
-				skip--;
-				}
-			skip = BN_bn2bin(y, buf + i);
-			i += skip;
-			}
-
-		if (i != ret)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
-		}
-	
-	if (used_ctx)
-		BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-
- err:
-	if (used_ctx)
-		BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return 0;
-	}
-
-
-int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point,
-	const unsigned char *buf, size_t len, BN_CTX *ctx)
-	{
-	point_conversion_form_t form;
-	int y_bit;
-	BN_CTX *new_ctx = NULL;
-	BIGNUM *x, *y;
-	size_t field_len, enc_len;
-	int ret = 0;
-
-	if (len == 0)
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL);
-		return 0;
-		}
-	form = buf[0];
-	y_bit = form & 1;
-	form = form & ~1U;
-	if ((form != 0)	&& (form != POINT_CONVERSION_COMPRESSED)
-		&& (form != POINT_CONVERSION_UNCOMPRESSED)
-		&& (form != POINT_CONVERSION_HYBRID))
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-	if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit)
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-
-	if (form == 0)
-		{
-		if (len != 1)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-			return 0;
-			}
-
-		return EC_POINT_set_to_infinity(group, point);
-		}
-	
-	field_len = BN_num_bytes(&group->field);
-	enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len;
-
-	if (len != enc_len)
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		return 0;
-		}
-
-	if (ctx == NULL)
-		{
-		ctx = new_ctx = BN_CTX_new();
-		if (ctx == NULL)
-			return 0;
-		}
-
-	BN_CTX_start(ctx);
-	x = BN_CTX_get(ctx);
-	y = BN_CTX_get(ctx);
-	if (y == NULL) goto err;
-
-	if (!BN_bin2bn(buf + 1, field_len, x)) goto err;
-	if (BN_ucmp(x, &group->field) >= 0)
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-		goto err;
-		}
-
-	if (form == POINT_CONVERSION_COMPRESSED)
-		{
-		if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err;
-		}
-	else
-		{
-		if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err;
-		if (BN_ucmp(y, &group->field) >= 0)
-			{
-			ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-			goto err;
-			}
-		if (form == POINT_CONVERSION_HYBRID)
-			{
-			if (y_bit != BN_is_odd(y))
-				{
-				ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING);
-				goto err;
-				}
-			}
-
-		if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err;
-		}
-	
-	if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */
-		{
-		ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE);
-		goto err;
-		}
-
-	ret = 1;
-	
- err:
-	BN_CTX_end(ctx);
-	if (new_ctx != NULL)
-		BN_CTX_free(new_ctx);
-	return ret;
-	}
-
-
 int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx)
 	{
 	int (*field_mul)(const EC_GROUP *, BIGNUM *, const BIGNUM *, const BIGNUM *, BN_CTX *);
diff --git a/src/lib/libcrypto/ecdh/ecdh.h b/src/lib/libcrypto/ecdh/ecdh.h
index b4b58ee65b..8887102c0b 100644
--- a/src/lib/libcrypto/ecdh/ecdh.h
+++ b/src/lib/libcrypto/ecdh/ecdh.h
@@ -109,11 +109,13 @@ void ERR_load_ECDH_strings(void);
 /* Error codes for the ECDH functions. */
 
 /* Function codes. */
+#define ECDH_F_ECDH_CHECK				 102
 #define ECDH_F_ECDH_COMPUTE_KEY				 100
 #define ECDH_F_ECDH_DATA_NEW_METHOD			 101
 
 /* Reason codes. */
 #define ECDH_R_KDF_FAILED				 102
+#define ECDH_R_NON_FIPS_METHOD				 103
 #define ECDH_R_NO_PRIVATE_VALUE				 100
 #define ECDH_R_POINT_ARITHMETIC_FAILURE			 101
 
diff --git a/src/lib/libcrypto/ecdh/ech_err.c b/src/lib/libcrypto/ecdh/ech_err.c
index 6f4b0c9953..3bd247398d 100644
--- a/src/lib/libcrypto/ecdh/ech_err.c
+++ b/src/lib/libcrypto/ecdh/ech_err.c
@@ -1,6 +1,6 @@
 /* crypto/ecdh/ech_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
 
 static ERR_STRING_DATA ECDH_str_functs[]=
 	{
+{ERR_FUNC(ECDH_F_ECDH_CHECK),	"ECDH_CHECK"},
 {ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY),	"ECDH_compute_key"},
 {ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD),	"ECDH_DATA_new_method"},
 {0,NULL}
@@ -78,6 +79,7 @@ static ERR_STRING_DATA ECDH_str_functs[]=
 static ERR_STRING_DATA ECDH_str_reasons[]=
 	{
 {ERR_REASON(ECDH_R_KDF_FAILED)           ,"KDF failed"},
+{ERR_REASON(ECDH_R_NON_FIPS_METHOD)      ,"non fips method"},
 {ERR_REASON(ECDH_R_NO_PRIVATE_VALUE)     ,"no private value"},
 {ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"},
 {0,NULL}
diff --git a/src/lib/libcrypto/ecdh/ech_lib.c b/src/lib/libcrypto/ecdh/ech_lib.c
index 4d8ea03d3d..dadbfd3c49 100644
--- a/src/lib/libcrypto/ecdh/ech_lib.c
+++ b/src/lib/libcrypto/ecdh/ech_lib.c
@@ -73,6 +73,9 @@
 #include <openssl/engine.h>
 #endif
 #include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 
 const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT;
 
@@ -90,7 +93,16 @@ void ECDH_set_default_method(const ECDH_METHOD *meth)
 const ECDH_METHOD *ECDH_get_default_method(void)
 	{
 	if(!default_ECDH_method) 
+		{
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return FIPS_ecdh_openssl();
+		else
+			return ECDH_OpenSSL();
+#else
 		default_ECDH_method = ECDH_OpenSSL();
+#endif
+		}
 	return default_ECDH_method;
 	}
 
@@ -215,6 +227,14 @@ ECDH_DATA *ecdh_check(EC_KEY *key)
 	}
 	else
 		ecdh_data = (ECDH_DATA *)data;
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(ecdh_data->flags & ECDH_FLAG_FIPS_METHOD)
+			&& !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+		{
+		ECDHerr(ECDH_F_ECDH_CHECK, ECDH_R_NON_FIPS_METHOD);
+		return NULL;
+		}
+#endif
 	
 
 	return ecdh_data;
diff --git a/src/lib/libcrypto/ecdh/ech_locl.h b/src/lib/libcrypto/ecdh/ech_locl.h
index f658526a7e..f6cad6a894 100644
--- a/src/lib/libcrypto/ecdh/ech_locl.h
+++ b/src/lib/libcrypto/ecdh/ech_locl.h
@@ -75,6 +75,14 @@ struct ecdh_method
 	char *app_data;
 	};
 
+/* If this flag is set the ECDH method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDH_FLAG_FIPS_METHOD	0x1
+
 typedef struct ecdh_data_st {
 	/* EC_KEY_METH_DATA part */
 	int (*init)(EC_KEY *);
diff --git a/src/lib/libcrypto/ecdsa/ecdsa.h b/src/lib/libcrypto/ecdsa/ecdsa.h
index e61c539812..7fb5254b62 100644
--- a/src/lib/libcrypto/ecdsa/ecdsa.h
+++ b/src/lib/libcrypto/ecdsa/ecdsa.h
@@ -238,6 +238,7 @@ void ERR_load_ECDSA_strings(void);
 /* Error codes for the ECDSA functions. */
 
 /* Function codes. */
+#define ECDSA_F_ECDSA_CHECK				 104
 #define ECDSA_F_ECDSA_DATA_NEW_METHOD			 100
 #define ECDSA_F_ECDSA_DO_SIGN				 101
 #define ECDSA_F_ECDSA_DO_VERIFY				 102
@@ -249,6 +250,7 @@ void ERR_load_ECDSA_strings(void);
 #define ECDSA_R_ERR_EC_LIB				 102
 #define ECDSA_R_MISSING_PARAMETERS			 103
 #define ECDSA_R_NEED_NEW_SETUP_VALUES			 106
+#define ECDSA_R_NON_FIPS_METHOD				 107
 #define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED		 104
 #define ECDSA_R_SIGNATURE_MALLOC_FAILED			 105
 
diff --git a/src/lib/libcrypto/ecdsa/ecs_err.c b/src/lib/libcrypto/ecdsa/ecs_err.c
index 98e38d537f..81542e6d15 100644
--- a/src/lib/libcrypto/ecdsa/ecs_err.c
+++ b/src/lib/libcrypto/ecdsa/ecs_err.c
@@ -1,6 +1,6 @@
 /* crypto/ecdsa/ecs_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -70,6 +70,7 @@
 
 static ERR_STRING_DATA ECDSA_str_functs[]=
 	{
+{ERR_FUNC(ECDSA_F_ECDSA_CHECK),	"ECDSA_CHECK"},
 {ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD),	"ECDSA_DATA_NEW_METHOD"},
 {ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN),	"ECDSA_do_sign"},
 {ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY),	"ECDSA_do_verify"},
@@ -84,6 +85,7 @@ static ERR_STRING_DATA ECDSA_str_reasons[]=
 {ERR_REASON(ECDSA_R_ERR_EC_LIB)          ,"err ec lib"},
 {ERR_REASON(ECDSA_R_MISSING_PARAMETERS)  ,"missing parameters"},
 {ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"},
+{ERR_REASON(ECDSA_R_NON_FIPS_METHOD)     ,"non fips method"},
 {ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"},
 {ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"},
 {0,NULL}
diff --git a/src/lib/libcrypto/ecdsa/ecs_lib.c b/src/lib/libcrypto/ecdsa/ecs_lib.c
index 2ebae3aa27..e477da430b 100644
--- a/src/lib/libcrypto/ecdsa/ecs_lib.c
+++ b/src/lib/libcrypto/ecdsa/ecs_lib.c
@@ -60,6 +60,9 @@
 #endif
 #include <openssl/err.h>
 #include <openssl/bn.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 
 const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT;
 
@@ -77,7 +80,16 @@ void ECDSA_set_default_method(const ECDSA_METHOD *meth)
 const ECDSA_METHOD *ECDSA_get_default_method(void)
 {
 	if(!default_ECDSA_method) 
+		{
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return FIPS_ecdsa_openssl();
+		else
+			return ECDSA_OpenSSL();
+#else
 		default_ECDSA_method = ECDSA_OpenSSL();
+#endif
+		}
 	return default_ECDSA_method;
 }
 
@@ -193,7 +205,14 @@ ECDSA_DATA *ecdsa_check(EC_KEY *key)
 	}
 	else
 		ecdsa_data = (ECDSA_DATA *)data;
-	
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(ecdsa_data->flags & ECDSA_FLAG_FIPS_METHOD)
+			&& !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW))
+		{
+		ECDSAerr(ECDSA_F_ECDSA_CHECK, ECDSA_R_NON_FIPS_METHOD);
+		return NULL;
+		}
+#endif
 
 	return ecdsa_data;
 }
diff --git a/src/lib/libcrypto/ecdsa/ecs_locl.h b/src/lib/libcrypto/ecdsa/ecs_locl.h
index 3a69a840e2..cb3be13cfc 100644
--- a/src/lib/libcrypto/ecdsa/ecs_locl.h
+++ b/src/lib/libcrypto/ecdsa/ecs_locl.h
@@ -82,6 +82,14 @@ struct ecdsa_method
 	char *app_data;
 	};
 
+/* If this flag is set the ECDSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define ECDSA_FLAG_FIPS_METHOD	0x1
+
 typedef struct ecdsa_data_st {
 	/* EC_KEY_METH_DATA part */
 	int (*init)(EC_KEY *);
diff --git a/src/lib/libcrypto/ecdsa/ecs_ossl.c b/src/lib/libcrypto/ecdsa/ecs_ossl.c
index 1bbf328de5..7725935610 100644
--- a/src/lib/libcrypto/ecdsa/ecs_ossl.c
+++ b/src/lib/libcrypto/ecdsa/ecs_ossl.c
@@ -167,6 +167,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
 				goto err;
 			}
 		}
+#ifndef OPENSSL_NO_EC2M
 		else /* NID_X9_62_characteristic_two_field */
 		{
 			if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -176,6 +177,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp,
 				goto err;
 			}
 		}
+#endif
 		if (!BN_nnmod(r, X, order, ctx))
 		{
 			ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
@@ -454,6 +456,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
 			goto err;
 		}
 	}
+#ifndef OPENSSL_NO_EC2M
 	else /* NID_X9_62_characteristic_two_field */
 	{
 		if (!EC_POINT_get_affine_coordinates_GF2m(group,
@@ -463,7 +466,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
 			goto err;
 		}
 	}
-	
+#endif	
 	if (!BN_nnmod(u1, X, order, ctx))
 	{
 		ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB);
diff --git a/src/lib/libcrypto/engine/eng_all.c b/src/lib/libcrypto/engine/eng_all.c
index 22c120454f..6093376df4 100644
--- a/src/lib/libcrypto/engine/eng_all.c
+++ b/src/lib/libcrypto/engine/eng_all.c
@@ -61,6 +61,8 @@
 
 void ENGINE_load_builtin_engines(void)
 	{
+	/* Some ENGINEs need this */
+	OPENSSL_cpuid_setup();
 #if 0
 	/* There's no longer any need for an "openssl" ENGINE unless, one day,
 	 * it is the *only* way for standard builtin implementations to be be
@@ -70,6 +72,12 @@ void ENGINE_load_builtin_engines(void)
 #endif
 #if !defined(OPENSSL_NO_HW) && (defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV))
 	ENGINE_load_cryptodev();
+#endif
+#ifndef OPENSSL_NO_RSAX
+	ENGINE_load_rsax();
+#endif
+#ifndef OPENSSL_NO_RDRAND
+	ENGINE_load_rdrand();
 #endif
 	ENGINE_load_dynamic();
 #ifndef OPENSSL_NO_STATIC_ENGINE
@@ -112,6 +120,7 @@ void ENGINE_load_builtin_engines(void)
 	ENGINE_load_capi();
 #endif
 #endif
+	ENGINE_register_all_complete();
 	}
 
 #if defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV)
diff --git a/src/lib/libcrypto/engine/eng_fat.c b/src/lib/libcrypto/engine/eng_fat.c
index db66e62350..789b8d57e5 100644
--- a/src/lib/libcrypto/engine/eng_fat.c
+++ b/src/lib/libcrypto/engine/eng_fat.c
@@ -176,6 +176,7 @@ int ENGINE_register_all_complete(void)
 	ENGINE *e;
 
 	for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e))
-		ENGINE_register_complete(e);
+		if (!(e->flags & ENGINE_FLAGS_NO_REGISTER_ALL))
+			ENGINE_register_complete(e);
 	return 1;
 	}
diff --git a/src/lib/libcrypto/engine/engine.h b/src/lib/libcrypto/engine/engine.h
index 943aeae215..f8be497724 100644
--- a/src/lib/libcrypto/engine/engine.h
+++ b/src/lib/libcrypto/engine/engine.h
@@ -141,6 +141,13 @@ extern "C" {
  * the existing ENGINE's structural reference count. */
 #define ENGINE_FLAGS_BY_ID_COPY		(int)0x0004
 
+/* This flag if for an ENGINE that does not want its methods registered as 
+ * part of ENGINE_register_all_complete() for example if the methods are
+ * not usable as default methods.
+ */
+
+#define ENGINE_FLAGS_NO_REGISTER_ALL	(int)0x0008
+
 /* ENGINEs can support their own command types, and these flags are used in
  * ENGINE_CTRL_GET_CMD_FLAGS to indicate to the caller what kind of input each
  * command expects. Currently only numeric and string input is supported. If a
@@ -344,6 +351,8 @@ void ENGINE_load_gost(void);
 #endif
 #endif
 void ENGINE_load_cryptodev(void);
+void ENGINE_load_rsax(void);
+void ENGINE_load_rdrand(void);
 void ENGINE_load_builtin_engines(void);
 
 /* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation
diff --git a/src/lib/libcrypto/err/err.c b/src/lib/libcrypto/err/err.c
index 69713a6e2f..fcdb244008 100644
--- a/src/lib/libcrypto/err/err.c
+++ b/src/lib/libcrypto/err/err.c
@@ -1066,6 +1066,13 @@ void ERR_set_error_data(char *data, int flags)
 void ERR_add_error_data(int num, ...)
 	{
 	va_list args;
+	va_start(args, num);
+	ERR_add_error_vdata(num, args);
+	va_end(args);
+	}
+
+void ERR_add_error_vdata(int num, va_list args)
+	{
 	int i,n,s;
 	char *str,*p,*a;
 
@@ -1074,7 +1081,6 @@ void ERR_add_error_data(int num, ...)
 	if (str == NULL) return;
 	str[0]='\0';
 
-	va_start(args, num);
 	n=0;
 	for (i=0; i<num; i++)
 		{
@@ -1090,7 +1096,7 @@ void ERR_add_error_data(int num, ...)
 				if (p == NULL)
 					{
 					OPENSSL_free(str);
-					goto err;
+					return;
 					}
 				else
 					str=p;
@@ -1099,9 +1105,6 @@ void ERR_add_error_data(int num, ...)
 			}
 		}
 	ERR_set_error_data(str,ERR_TXT_MALLOCED|ERR_TXT_STRING);
-
-err:
-	va_end(args);
 	}
 
 int ERR_set_mark(void)
diff --git a/src/lib/libcrypto/err/err.h b/src/lib/libcrypto/err/err.h
index b9f8c16d47..974cc9cc6f 100644
--- a/src/lib/libcrypto/err/err.h
+++ b/src/lib/libcrypto/err/err.h
@@ -344,8 +344,9 @@ void ERR_print_errors_fp(FILE *fp);
 #endif
 #ifndef OPENSSL_NO_BIO
 void ERR_print_errors(BIO *bp);
-void ERR_add_error_data(int num, ...);
 #endif
+void ERR_add_error_data(int num, ...);
+void ERR_add_error_vdata(int num, va_list args);
 void ERR_load_strings(int lib,ERR_STRING_DATA str[]);
 void ERR_unload_strings(int lib,ERR_STRING_DATA str[]);
 void ERR_load_ERR_strings(void);
diff --git a/src/lib/libcrypto/err/err_all.c b/src/lib/libcrypto/err/err_all.c
index fc049e8e88..bd8946d8ba 100644
--- a/src/lib/libcrypto/err/err_all.c
+++ b/src/lib/libcrypto/err/err_all.c
@@ -104,6 +104,10 @@
 #endif
 #include <openssl/comp.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 void ERR_load_crypto_strings(void)
 	{
 #ifndef OPENSSL_NO_ERR
@@ -156,5 +160,8 @@ void ERR_load_crypto_strings(void)
 	ERR_load_JPAKE_strings();
 #endif
 	ERR_load_COMP_strings();
+#endif
+#ifdef OPENSSL_FIPS
+	ERR_load_FIPS_strings();
 #endif
 	}
diff --git a/src/lib/libcrypto/evp/bio_md.c b/src/lib/libcrypto/evp/bio_md.c
index 9841e32e1a..144fdfd56a 100644
--- a/src/lib/libcrypto/evp/bio_md.c
+++ b/src/lib/libcrypto/evp/bio_md.c
@@ -153,8 +153,12 @@ static int md_write(BIO *b, const char *in, int inl)
 		{
 		if (ret > 0)
 			{
-			EVP_DigestUpdate(ctx,(const unsigned char *)in,
-				(unsigned int)ret);
+			if (!EVP_DigestUpdate(ctx,(const unsigned char *)in,
+				(unsigned int)ret))
+				{
+				BIO_clear_retry_flags(b);
+				return 0;
+				}
 			}
 		}
 	if(b->next_bio != NULL)
@@ -220,7 +224,8 @@ static long md_ctrl(BIO *b, int cmd, long num, void *ptr)
 	case BIO_CTRL_DUP:
 		dbio=ptr;
 		dctx=dbio->ptr;
-		EVP_MD_CTX_copy_ex(dctx,ctx);
+		if (!EVP_MD_CTX_copy_ex(dctx,ctx))
+			return 0;
 		b->init=1;
 		break;
 	default:
diff --git a/src/lib/libcrypto/evp/digest.c b/src/lib/libcrypto/evp/digest.c
index 982ba2b136..467e6b5ae9 100644
--- a/src/lib/libcrypto/evp/digest.c
+++ b/src/lib/libcrypto/evp/digest.c
@@ -117,6 +117,10 @@
 #include <openssl/engine.h>
 #endif
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 void EVP_MD_CTX_init(EVP_MD_CTX *ctx)
 	{
 	memset(ctx,'\0',sizeof *ctx);
@@ -225,12 +229,26 @@ skip_to_init:
 		}
 	if (ctx->flags & EVP_MD_CTX_FLAG_NO_INIT)
 		return 1;
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		{
+		if (FIPS_digestinit(ctx, type))
+			return 1;
+		OPENSSL_free(ctx->md_data);
+		ctx->md_data = NULL;
+		return 0;
+		}
+#endif
 	return ctx->digest->init(ctx);
 	}
 
 int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *data, size_t count)
 	{
+#ifdef OPENSSL_FIPS
+	return FIPS_digestupdate(ctx, data, count);
+#else
 	return ctx->update(ctx,data,count);
+#endif
 	}
 
 /* The caller can assume that this removes any secret data from the context */
@@ -245,8 +263,10 @@ int EVP_DigestFinal(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
 /* The caller can assume that this removes any secret data from the context */
 int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
 	{
+#ifdef OPENSSL_FIPS
+	return FIPS_digestfinal(ctx, md, size);
+#else
 	int ret;
-
 	OPENSSL_assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE);
 	ret=ctx->digest->final(ctx,md);
 	if (size != NULL)
@@ -258,6 +278,7 @@ int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size)
 		}
 	memset(ctx->md_data,0,ctx->digest->ctx_size);
 	return ret;
+#endif
 	}
 
 int EVP_MD_CTX_copy(EVP_MD_CTX *out, const EVP_MD_CTX *in)
@@ -351,6 +372,7 @@ void EVP_MD_CTX_destroy(EVP_MD_CTX *ctx)
 /* This call frees resources associated with the context */
 int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
 	{
+#ifndef OPENSSL_FIPS
 	/* Don't assume ctx->md_data was cleaned in EVP_Digest_Final,
 	 * because sometimes only copies of the context are ever finalised.
 	 */
@@ -363,6 +385,7 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
 		OPENSSL_cleanse(ctx->md_data,ctx->digest->ctx_size);
 		OPENSSL_free(ctx->md_data);
 		}
+#endif
 	if (ctx->pctx)
 		EVP_PKEY_CTX_free(ctx->pctx);
 #ifndef OPENSSL_NO_ENGINE
@@ -370,6 +393,9 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx)
 		/* The EVP_MD we used belongs to an ENGINE, release the
 		 * functional reference we held for this reason. */
 		ENGINE_finish(ctx->engine);
+#endif
+#ifdef OPENSSL_FIPS
+	FIPS_md_ctx_cleanup(ctx);
 #endif
 	memset(ctx,'\0',sizeof *ctx);
 
diff --git a/src/lib/libcrypto/evp/e_aes.c b/src/lib/libcrypto/evp/e_aes.c
index bd6c0a3a62..1e4af0cb75 100644
--- a/src/lib/libcrypto/evp/e_aes.c
+++ b/src/lib/libcrypto/evp/e_aes.c
@@ -1,5 +1,5 @@
 /* ====================================================================
- * Copyright (c) 2001 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 2001-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -56,57 +56,511 @@
 #include <assert.h>
 #include <openssl/aes.h>
 #include "evp_locl.h"
-
-static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
-					const unsigned char *iv, int enc);
+#ifndef OPENSSL_FIPS
+#include "modes_lcl.h"
+#include <openssl/rand.h>
 
 typedef struct
 	{
 	AES_KEY ks;
+	block128_f block;
+	union {
+		cbc128_f cbc;
+		ctr128_f ctr;
+	} stream;
 	} EVP_AES_KEY;
 
-#define data(ctx)	EVP_C_DATA(EVP_AES_KEY,ctx)
-
-IMPLEMENT_BLOCK_CIPHER(aes_128, ks, AES, EVP_AES_KEY,
-		       NID_aes_128, 16, 16, 16, 128,
-		       0, aes_init_key, NULL, 
-		       EVP_CIPHER_set_asn1_iv,
-		       EVP_CIPHER_get_asn1_iv,
-		       NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_192, ks, AES, EVP_AES_KEY,
-		       NID_aes_192, 16, 24, 16, 128,
-		       0, aes_init_key, NULL, 
-		       EVP_CIPHER_set_asn1_iv,
-		       EVP_CIPHER_get_asn1_iv,
-		       NULL)
-IMPLEMENT_BLOCK_CIPHER(aes_256, ks, AES, EVP_AES_KEY,
-		       NID_aes_256, 16, 32, 16, 128,
-		       0, aes_init_key, NULL, 
-		       EVP_CIPHER_set_asn1_iv,
-		       EVP_CIPHER_get_asn1_iv,
-		       NULL)
-
-#define IMPLEMENT_AES_CFBR(ksize,cbits)	IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16)
-
-IMPLEMENT_AES_CFBR(128,1)
-IMPLEMENT_AES_CFBR(192,1)
-IMPLEMENT_AES_CFBR(256,1)
-
-IMPLEMENT_AES_CFBR(128,8)
-IMPLEMENT_AES_CFBR(192,8)
-IMPLEMENT_AES_CFBR(256,8)
+typedef struct
+	{
+	AES_KEY ks;		/* AES key schedule to use */
+	int key_set;		/* Set if key initialised */
+	int iv_set;		/* Set if an iv is set */
+	GCM128_CONTEXT gcm;
+	unsigned char *iv;	/* Temporary IV store */
+	int ivlen;		/* IV length */
+	int taglen;
+	int iv_gen;		/* It is OK to generate IVs */
+	int tls_aad_len;	/* TLS AAD length */
+	ctr128_f ctr;
+	} EVP_AES_GCM_CTX;
+
+typedef struct
+	{
+	AES_KEY ks1, ks2;	/* AES key schedules to use */
+	XTS128_CONTEXT xts;
+	void     (*stream)(const unsigned char *in,
+			unsigned char *out, size_t length,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
+	} EVP_AES_XTS_CTX;
+
+typedef struct
+	{
+	AES_KEY ks;		/* AES key schedule to use */
+	int key_set;		/* Set if key initialised */
+	int iv_set;		/* Set if an iv is set */
+	int tag_set;		/* Set if tag is valid */
+	int len_set;		/* Set if message length set */
+	int L, M;		/* L and M parameters from RFC3610 */
+	CCM128_CONTEXT ccm;
+	ccm128_f str;
+	} EVP_AES_CCM_CTX;
+
+#define MAXBITCHUNK	((size_t)1<<(sizeof(size_t)*8-4))
+
+#ifdef VPAES_ASM
+int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
+			AES_KEY *key);
+int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
+			AES_KEY *key);
+
+void vpaes_encrypt(const unsigned char *in, unsigned char *out,
+			const AES_KEY *key);
+void vpaes_decrypt(const unsigned char *in, unsigned char *out,
+			const AES_KEY *key);
+
+void vpaes_cbc_encrypt(const unsigned char *in,
+			unsigned char *out,
+			size_t length,
+			const AES_KEY *key,
+			unsigned char *ivec, int enc);
+#endif
+#ifdef BSAES_ASM
+void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
+			size_t length, const AES_KEY *key,
+			unsigned char ivec[16], int enc);
+void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+			size_t len, const AES_KEY *key,
+			const unsigned char ivec[16]);
+void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out,
+			size_t len, const AES_KEY *key1,
+			const AES_KEY *key2, const unsigned char iv[16]);
+void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out,
+			size_t len, const AES_KEY *key1,
+			const AES_KEY *key2, const unsigned char iv[16]);
+#endif
+#ifdef AES_CTR_ASM
+void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
+			size_t blocks, const AES_KEY *key,
+			const unsigned char ivec[AES_BLOCK_SIZE]);
+#endif
+#ifdef AES_XTS_ASM
+void AES_xts_encrypt(const char *inp,char *out,size_t len,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
+void AES_xts_decrypt(const char *inp,char *out,size_t len,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
+#endif
+
+#if	defined(AES_ASM) && !defined(I386_ONLY) &&	(  \
+	((defined(__i386)	|| defined(__i386__)	|| \
+	  defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \
+	defined(__x86_64)	|| defined(__x86_64__)	|| \
+	defined(_M_AMD64)	|| defined(_M_X64)	|| \
+	defined(__INTEL__)				)
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+
+#ifdef VPAES_ASM
+#define VPAES_CAPABLE	(OPENSSL_ia32cap_P[1]&(1<<(41-32)))
+#endif
+#ifdef BSAES_ASM
+#define BSAES_CAPABLE	VPAES_CAPABLE
+#endif
+/*
+ * AES-NI section
+ */
+#define	AESNI_CAPABLE	(OPENSSL_ia32cap_P[1]&(1<<(57-32)))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+			AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+			AES_KEY *key);
+
+void aesni_encrypt(const unsigned char *in, unsigned char *out,
+			const AES_KEY *key);
+void aesni_decrypt(const unsigned char *in, unsigned char *out,
+			const AES_KEY *key);
+
+void aesni_ecb_encrypt(const unsigned char *in,
+			unsigned char *out,
+			size_t length,
+			const AES_KEY *key,
+			int enc);
+void aesni_cbc_encrypt(const unsigned char *in,
+			unsigned char *out,
+			size_t length,
+			const AES_KEY *key,
+			unsigned char *ivec, int enc);
+
+void aesni_ctr32_encrypt_blocks(const unsigned char *in,
+			unsigned char *out,
+			size_t blocks,
+			const void *key,
+			const unsigned char *ivec);
+
+void aesni_xts_encrypt(const unsigned char *in,
+			unsigned char *out,
+			size_t length,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
+
+void aesni_xts_decrypt(const unsigned char *in,
+			unsigned char *out,
+			size_t length,
+			const AES_KEY *key1, const AES_KEY *key2,
+			const unsigned char iv[16]);
+
+void aesni_ccm64_encrypt_blocks (const unsigned char *in,
+			unsigned char *out,
+			size_t blocks,
+			const void *key,
+			const unsigned char ivec[16],
+			unsigned char cmac[16]);
+
+void aesni_ccm64_decrypt_blocks (const unsigned char *in,
+			unsigned char *out,
+			size_t blocks,
+			const void *key,
+			const unsigned char ivec[16],
+			unsigned char cmac[16]);
+
+static int aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+		   const unsigned char *iv, int enc)
+	{
+	int ret, mode;
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	mode = ctx->cipher->flags & EVP_CIPH_MODE;
+	if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+	    && !enc)
+		{ 
+		ret = aesni_set_decrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+		dat->block	= (block128_f)aesni_decrypt;
+		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
+					(cbc128_f)aesni_cbc_encrypt :
+					NULL;
+		}
+	else	{
+		ret = aesni_set_encrypt_key(key, ctx->key_len*8, ctx->cipher_data);
+		dat->block	= (block128_f)aesni_encrypt;
+		if (mode==EVP_CIPH_CBC_MODE)
+			dat->stream.cbc	= (cbc128_f)aesni_cbc_encrypt;
+		else if (mode==EVP_CIPH_CTR_MODE)
+			dat->stream.ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+		else
+			dat->stream.cbc = NULL;
+		}
+
+	if(ret < 0)
+		{
+		EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
+		return 0;
+		}
+
+	return 1;
+	}
+
+static int aesni_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in, size_t len)
+{
+	aesni_cbc_encrypt(in,out,len,ctx->cipher_data,ctx->iv,ctx->encrypt);
+
+	return 1;
+}
+
+static int aesni_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in, size_t len)
+{
+	size_t	bl = ctx->cipher->block_size;
+
+	if (len<bl)	return 1;
+
+	aesni_ecb_encrypt(in,out,len,ctx->cipher_data,ctx->encrypt);
+
+	return 1;
+}
+
+#define aesni_ofb_cipher aes_ofb_cipher
+static int aesni_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len);
+
+#define aesni_cfb_cipher aes_cfb_cipher
+static int aesni_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len);
+
+#define aesni_cfb8_cipher aes_cfb8_cipher
+static int aesni_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len);
+
+#define aesni_cfb1_cipher aes_cfb1_cipher
+static int aesni_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len);
+
+#define aesni_ctr_cipher aes_ctr_cipher
+static int aesni_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len);
+
+static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+	if (key)
+		{
+		aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+		CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks,
+				(block128_f)aesni_encrypt);
+		gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks;
+		/* If we have an iv can set it directly, otherwise use
+		 * saved IV.
+		 */
+		if (iv == NULL && gctx->iv_set)
+			iv = gctx->iv;
+		if (iv)
+			{
+			CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+			gctx->iv_set = 1;
+			}
+		gctx->key_set = 1;
+		}
+	else
+		{
+		/* If key set use IV, otherwise copy */
+		if (gctx->key_set)
+			CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+		else
+			memcpy(gctx->iv, iv, gctx->ivlen);
+		gctx->iv_set = 1;
+		gctx->iv_gen = 0;
+		}
+	return 1;
+	}
+
+#define aesni_gcm_cipher aes_gcm_cipher
+static int aesni_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len);
+
+static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+
+	if (key)
+		{
+		/* key_len is two AES keys */
+		if (enc)
+			{
+			aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)aesni_encrypt;
+			xctx->stream = aesni_xts_encrypt;
+			}
+		else
+			{
+			aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)aesni_decrypt;
+			xctx->stream = aesni_xts_decrypt;
+			}
+
+		aesni_set_encrypt_key(key + ctx->key_len/2,
+						ctx->key_len * 4, &xctx->ks2);
+		xctx->xts.block2 = (block128_f)aesni_encrypt;
+
+		xctx->xts.key1 = &xctx->ks1;
+		}
+
+	if (iv)
+		{
+		xctx->xts.key2 = &xctx->ks2;
+		memcpy(ctx->iv, iv, 16);
+		}
+
+	return 1;
+	}
+
+#define aesni_xts_cipher aes_xts_cipher
+static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len);
+
+static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+	if (key)
+		{
+		aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+		CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+					&cctx->ks, (block128_f)aesni_encrypt);
+		cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks :
+				(ccm128_f)aesni_ccm64_decrypt_blocks;
+		cctx->key_set = 1;
+		}
+	if (iv)
+		{
+		memcpy(ctx->iv, iv, 15 - cctx->L);
+		cctx->iv_set = 1;
+		}
+	return 1;
+	}
+
+#define aesni_ccm_cipher aes_ccm_cipher
+static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len);
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+	nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aesni_init_key,			\
+	aesni_##mode##_cipher,		\
+	NULL,				\
+	sizeof(EVP_AES_KEY),		\
+	NULL,NULL,NULL,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+	nid##_##keylen##_##nmode,blocksize,	\
+	keylen/8,ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aes_init_key,			\
+	aes_##mode##_cipher,		\
+	NULL,				\
+	sizeof(EVP_AES_KEY),		\
+	NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aesni_##keylen##_##mode = { \
+	nid##_##keylen##_##mode,blocksize, \
+	(EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aesni_##mode##_init_key,	\
+	aesni_##mode##_cipher,		\
+	aes_##mode##_cleanup,		\
+	sizeof(EVP_AES_##MODE##_CTX),	\
+	NULL,NULL,aes_##mode##_ctrl,NULL }; \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+	nid##_##keylen##_##mode,blocksize, \
+	(EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aes_##mode##_init_key,		\
+	aes_##mode##_cipher,		\
+	aes_##mode##_cleanup,		\
+	sizeof(EVP_AES_##MODE##_CTX),	\
+	NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; }
+
+#else
+
+#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+	nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aes_init_key,			\
+	aes_##mode##_cipher,		\
+	NULL,				\
+	sizeof(EVP_AES_KEY),		\
+	NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+
+#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \
+static const EVP_CIPHER aes_##keylen##_##mode = { \
+	nid##_##keylen##_##mode,blocksize, \
+	(EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \
+	flags|EVP_CIPH_##MODE##_MODE,	\
+	aes_##mode##_init_key,		\
+	aes_##mode##_cipher,		\
+	aes_##mode##_cleanup,		\
+	sizeof(EVP_AES_##MODE##_CTX),	\
+	NULL,NULL,aes_##mode##_ctrl,NULL }; \
+const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \
+{ return &aes_##keylen##_##mode; }
+#endif
+
+#define BLOCK_CIPHER_generic_pack(nid,keylen,flags)		\
+	BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
+	BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
+	BLOCK_CIPHER_generic(nid,keylen,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
+	BLOCK_CIPHER_generic(nid,keylen,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)	\
+	BLOCK_CIPHER_generic(nid,keylen,1,16,cfb1,cfb1,CFB,flags)	\
+	BLOCK_CIPHER_generic(nid,keylen,1,16,cfb8,cfb8,CFB,flags)	\
+	BLOCK_CIPHER_generic(nid,keylen,1,16,ctr,ctr,CTR,flags)
 
 static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 		   const unsigned char *iv, int enc)
 	{
-	int ret;
+	int ret, mode;
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
 
-	if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE
-	    || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE
-	    || enc) 
-		ret=AES_set_encrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+	mode = ctx->cipher->flags & EVP_CIPH_MODE;
+	if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+	    && !enc)
+#ifdef BSAES_CAPABLE
+	    if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE)
+		{
+		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)AES_decrypt;
+		dat->stream.cbc	= (cbc128_f)bsaes_cbc_encrypt;
+		}
+	    else
+#endif
+#ifdef VPAES_CAPABLE
+	    if (VPAES_CAPABLE)
+		{
+		ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)vpaes_decrypt;
+		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
+					(cbc128_f)vpaes_cbc_encrypt :
+					NULL;
+		}
+	    else
+#endif
+		{
+		ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)AES_decrypt;
+		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
+					(cbc128_f)AES_cbc_encrypt :
+					NULL;
+		}
 	else
-		ret=AES_set_decrypt_key(key, ctx->key_len * 8, ctx->cipher_data);
+#ifdef BSAES_CAPABLE
+	    if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE)
+		{
+		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)AES_encrypt;
+		dat->stream.ctr	= (ctr128_f)bsaes_ctr32_encrypt_blocks;
+		}
+	    else
+#endif
+#ifdef VPAES_CAPABLE
+	    if (VPAES_CAPABLE)
+		{
+		ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)vpaes_encrypt;
+		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
+					(cbc128_f)vpaes_cbc_encrypt :
+					NULL;
+		}
+	    else
+#endif
+		{
+		ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks);
+		dat->block	= (block128_f)AES_encrypt;
+		dat->stream.cbc	= mode==EVP_CIPH_CBC_MODE ?
+					(cbc128_f)AES_cbc_encrypt :
+					NULL;
+#ifdef AES_CTR_ASM
+		if (mode==EVP_CIPH_CTR_MODE)
+			dat->stream.ctr = (ctr128_f)AES_ctr32_encrypt;
+#endif
+		}
 
 	if(ret < 0)
 		{
@@ -117,4 +571,743 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 	return 1;
 	}
 
+static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in, size_t len)
+{
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	if (dat->stream.cbc)
+		(*dat->stream.cbc)(in,out,len,&dat->ks,ctx->iv,ctx->encrypt);
+	else if (ctx->encrypt)
+		CRYPTO_cbc128_encrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+	else
+		CRYPTO_cbc128_encrypt(in,out,len,&dat->ks,ctx->iv,dat->block);
+
+	return 1;
+}
+
+static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in, size_t len)
+{
+	size_t	bl = ctx->cipher->block_size;
+	size_t	i;
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	if (len<bl)	return 1;
+
+	for (i=0,len-=bl;i<=len;i+=bl)
+		(*dat->block)(in+i,out+i,&dat->ks);
+
+	return 1;
+}
+
+static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len)
+{
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	CRYPTO_ofb128_encrypt(in,out,len,&dat->ks,
+			ctx->iv,&ctx->num,dat->block);
+	return 1;
+}
+
+static int aes_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len)
+{
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	CRYPTO_cfb128_encrypt(in,out,len,&dat->ks,
+			ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+	return 1;
+}
+
+static int aes_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len)
+{
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	CRYPTO_cfb128_8_encrypt(in,out,len,&dat->ks,
+			ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+	return 1;
+}
+
+static int aes_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out,
+	const unsigned char *in,size_t len)
+{
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	if (ctx->flags&EVP_CIPH_FLAG_LENGTH_BITS) {
+		CRYPTO_cfb128_1_encrypt(in,out,len,&dat->ks,
+			ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+		return 1;
+	}
+
+	while (len>=MAXBITCHUNK) {
+		CRYPTO_cfb128_1_encrypt(in,out,MAXBITCHUNK*8,&dat->ks,
+			ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+		len-=MAXBITCHUNK;
+	}
+	if (len)
+		CRYPTO_cfb128_1_encrypt(in,out,len*8,&dat->ks,
+			ctx->iv,&ctx->num,ctx->encrypt,dat->block);
+	
+	return 1;
+}
+
+static int aes_ctr_cipher (EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len)
+{
+	unsigned int num = ctx->num;
+	EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data;
+
+	if (dat->stream.ctr)
+		CRYPTO_ctr128_encrypt_ctr32(in,out,len,&dat->ks,
+			ctx->iv,ctx->buf,&num,dat->stream.ctr);
+	else
+		CRYPTO_ctr128_encrypt(in,out,len,&dat->ks,
+			ctx->iv,ctx->buf,&num,dat->block);
+	ctx->num = (size_t)num;
+	return 1;
+}
+
+BLOCK_CIPHER_generic_pack(NID_aes,128,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,192,EVP_CIPH_FLAG_FIPS)
+BLOCK_CIPHER_generic_pack(NID_aes,256,EVP_CIPH_FLAG_FIPS)
+
+static int aes_gcm_cleanup(EVP_CIPHER_CTX *c)
+	{
+	EVP_AES_GCM_CTX *gctx = c->cipher_data;
+	OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm));
+	if (gctx->iv != c->iv)
+		OPENSSL_free(gctx->iv);
+	return 1;
+	}
+
+/* increment counter (64-bit int) by 1 */
+static void ctr64_inc(unsigned char *counter) {
+	int n=8;
+	unsigned char  c;
+
+	do {
+		--n;
+		c = counter[n];
+		++c;
+		counter[n] = c;
+		if (c) return;
+	} while (n);
+}
+
+static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+	{
+	EVP_AES_GCM_CTX *gctx = c->cipher_data;
+	switch (type)
+		{
+	case EVP_CTRL_INIT:
+		gctx->key_set = 0;
+		gctx->iv_set = 0;
+		gctx->ivlen = c->cipher->iv_len;
+		gctx->iv = c->iv;
+		gctx->taglen = -1;
+		gctx->iv_gen = 0;
+		gctx->tls_aad_len = -1;
+		return 1;
+
+	case EVP_CTRL_GCM_SET_IVLEN:
+		if (arg <= 0)
+			return 0;
+#ifdef OPENSSL_FIPS
+		if (FIPS_module_mode() && !(c->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW)
+						 && arg < 12)
+			return 0;
+#endif
+		/* Allocate memory for IV if needed */
+		if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen))
+			{
+			if (gctx->iv != c->iv)
+				OPENSSL_free(gctx->iv);
+			gctx->iv = OPENSSL_malloc(arg);
+			if (!gctx->iv)
+				return 0;
+			}
+		gctx->ivlen = arg;
+		return 1;
+
+	case EVP_CTRL_GCM_SET_TAG:
+		if (arg <= 0 || arg > 16 || c->encrypt)
+			return 0;
+		memcpy(c->buf, ptr, arg);
+		gctx->taglen = arg;
+		return 1;
+
+	case EVP_CTRL_GCM_GET_TAG:
+		if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0)
+			return 0;
+		memcpy(ptr, c->buf, arg);
+		return 1;
+
+	case EVP_CTRL_GCM_SET_IV_FIXED:
+		/* Special case: -1 length restores whole IV */
+		if (arg == -1)
+			{
+			memcpy(gctx->iv, ptr, gctx->ivlen);
+			gctx->iv_gen = 1;
+			return 1;
+			}
+		/* Fixed field must be at least 4 bytes and invocation field
+		 * at least 8.
+		 */
+		if ((arg < 4) || (gctx->ivlen - arg) < 8)
+			return 0;
+		if (arg)
+			memcpy(gctx->iv, ptr, arg);
+		if (c->encrypt &&
+			RAND_bytes(gctx->iv + arg, gctx->ivlen - arg) <= 0)
+			return 0;
+		gctx->iv_gen = 1;
+		return 1;
+
+	case EVP_CTRL_GCM_IV_GEN:
+		if (gctx->iv_gen == 0 || gctx->key_set == 0)
+			return 0;
+		CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+		if (arg <= 0 || arg > gctx->ivlen)
+			arg = gctx->ivlen;
+		memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg);
+		/* Invocation field will be at least 8 bytes in size and
+		 * so no need to check wrap around or increment more than
+		 * last 8 bytes.
+		 */
+		ctr64_inc(gctx->iv + gctx->ivlen - 8);
+		gctx->iv_set = 1;
+		return 1;
+
+	case EVP_CTRL_GCM_SET_IV_INV:
+		if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt)
+			return 0;
+		memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg);
+		CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen);
+		gctx->iv_set = 1;
+		return 1;
+
+	case EVP_CTRL_AEAD_TLS1_AAD:
+		/* Save the AAD for later use */
+		if (arg != 13)
+			return 0;
+		memcpy(c->buf, ptr, arg);
+		gctx->tls_aad_len = arg;
+			{
+			unsigned int len=c->buf[arg-2]<<8|c->buf[arg-1];
+			/* Correct length for explicit IV */
+			len -= EVP_GCM_TLS_EXPLICIT_IV_LEN;
+			/* If decrypting correct for tag too */
+			if (!c->encrypt)
+				len -= EVP_GCM_TLS_TAG_LEN;
+                        c->buf[arg-2] = len>>8;
+                        c->buf[arg-1] = len & 0xff;
+			}
+		/* Extra padding: tag appended to record */
+		return EVP_GCM_TLS_TAG_LEN;
+
+	default:
+		return -1;
+
+		}
+	}
+
+static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+	if (key)
+		{ do {
+#ifdef BSAES_CAPABLE
+		if (BSAES_CAPABLE)
+			{
+			AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+					(block128_f)AES_encrypt);
+			gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks;
+			break;
+			}
+		else
+#endif
+#ifdef VPAES_CAPABLE
+		if (VPAES_CAPABLE)
+			{
+			vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks);
+			CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks,
+					(block128_f)vpaes_encrypt);
+			gctx->ctr = NULL;
+			break;
+			}
+#endif
+		AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks);
+		CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt);
+#ifdef AES_CTR_ASM
+		gctx->ctr = (ctr128_f)AES_ctr32_encrypt;
+#else
+		gctx->ctr = NULL;
+#endif
+		} while (0);
+
+		/* If we have an iv can set it directly, otherwise use
+		 * saved IV.
+		 */
+		if (iv == NULL && gctx->iv_set)
+			iv = gctx->iv;
+		if (iv)
+			{
+			CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+			gctx->iv_set = 1;
+			}
+		gctx->key_set = 1;
+		}
+	else
+		{
+		/* If key set use IV, otherwise copy */
+		if (gctx->key_set)
+			CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen);
+		else
+			memcpy(gctx->iv, iv, gctx->ivlen);
+		gctx->iv_set = 1;
+		gctx->iv_gen = 0;
+		}
+	return 1;
+	}
+
+/* Handle TLS GCM packet format. This consists of the last portion of the IV
+ * followed by the payload and finally the tag. On encrypt generate IV,
+ * encrypt payload and write the tag. On verify retrieve IV, decrypt payload
+ * and verify tag.
+ */
+
+static int aes_gcm_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len)
+	{
+	EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+	int rv = -1;
+	/* Encrypt/decrypt must be performed in place */
+	if (out != in || len < (EVP_GCM_TLS_EXPLICIT_IV_LEN+EVP_GCM_TLS_TAG_LEN))
+		return -1;
+	/* Set IV from start of buffer or generate IV and write to start
+	 * of buffer.
+	 */
+	if (EVP_CIPHER_CTX_ctrl(ctx, ctx->encrypt ?
+				EVP_CTRL_GCM_IV_GEN : EVP_CTRL_GCM_SET_IV_INV,
+				EVP_GCM_TLS_EXPLICIT_IV_LEN, out) <= 0)
+		goto err;
+	/* Use saved AAD */
+	if (CRYPTO_gcm128_aad(&gctx->gcm, ctx->buf, gctx->tls_aad_len))
+		goto err;
+	/* Fix buffer and length to point to payload */
+	in += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+	out += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+	len -= EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+	if (ctx->encrypt)
+		{
+		/* Encrypt payload */
+		if (gctx->ctr)
+			{
+			if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+							in, out, len,
+							gctx->ctr))
+				goto err;
+			}
+		else	{
+			if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+				goto err;
+			}
+		out += len;
+		/* Finally write tag */
+		CRYPTO_gcm128_tag(&gctx->gcm, out, EVP_GCM_TLS_TAG_LEN);
+		rv = len + EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN;
+		}
+	else
+		{
+		/* Decrypt */
+		if (gctx->ctr)
+			{
+			if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+							in, out, len,
+							gctx->ctr))
+				goto err;
+			}
+		else	{
+			if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+				goto err;
+			}
+		/* Retrieve tag */
+		CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf,
+					EVP_GCM_TLS_TAG_LEN);
+		/* If tag mismatch wipe buffer */
+		if (memcmp(ctx->buf, in + len, EVP_GCM_TLS_TAG_LEN))
+			{
+			OPENSSL_cleanse(out, len);
+			goto err;
+			}
+		rv = len;
+		}
+
+	err:
+	gctx->iv_set = 0;
+	gctx->tls_aad_len = -1;
+	return rv;
+	}
+
+static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len)
+	{
+	EVP_AES_GCM_CTX *gctx = ctx->cipher_data;
+	/* If not set up, return error */
+	if (!gctx->key_set)
+		return -1;
+
+	if (gctx->tls_aad_len >= 0)
+		return aes_gcm_tls_cipher(ctx, out, in, len);
+
+	if (!gctx->iv_set)
+		return -1;
+	if (!ctx->encrypt && gctx->taglen < 0)
+		return -1;
+	if (in)
+		{
+		if (out == NULL)
+			{
+			if (CRYPTO_gcm128_aad(&gctx->gcm, in, len))
+				return -1;
+			}
+		else if (ctx->encrypt)
+			{
+			if (gctx->ctr)
+				{
+				if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm,
+							in, out, len,
+							gctx->ctr))
+					return -1;
+				}
+			else	{
+				if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len))
+					return -1;
+				}
+			}
+		else
+			{
+			if (gctx->ctr)
+				{
+				if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm,
+							in, out, len,
+							gctx->ctr))
+					return -1;
+				}
+			else	{
+				if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len))
+					return -1;
+				}
+			}
+		return len;
+		}
+	else
+		{
+		if (!ctx->encrypt)
+			{
+			if (CRYPTO_gcm128_finish(&gctx->gcm,
+					ctx->buf, gctx->taglen) != 0)
+				return -1;
+			gctx->iv_set = 0;
+			return 0;
+			}
+		CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16);
+		gctx->taglen = 16;
+		/* Don't reuse the IV */
+		gctx->iv_set = 0;
+		return 0;
+		}
+
+	}
+
+#define CUSTOM_FLAGS	(EVP_CIPH_FLAG_DEFAULT_ASN1 \
+		| EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \
+		| EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,gcm,GCM,
+		EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,gcm,GCM,
+		EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,gcm,GCM,
+		EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS)
+
+static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+	{
+	EVP_AES_XTS_CTX *xctx = c->cipher_data;
+	if (type != EVP_CTRL_INIT)
+		return -1;
+	/* key1 and key2 are used as an indicator both key and IV are set */
+	xctx->xts.key1 = NULL;
+	xctx->xts.key2 = NULL;
+	return 1;
+	}
+
+static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+
+	if (key) do
+		{
+#ifdef AES_XTS_ASM
+		xctx->stream = enc ? AES_xts_encrypt : AES_xts_decrypt;
+#else
+		xctx->stream = NULL;
+#endif
+		/* key_len is two AES keys */
+#ifdef BSAES_CAPABLE
+		if (BSAES_CAPABLE)
+			xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt;
+		else
+#endif
+#ifdef VPAES_CAPABLE
+		if (VPAES_CAPABLE)
+		    {
+		    if (enc)
+			{
+			vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)vpaes_encrypt;
+			}
+		    else
+			{
+			vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)vpaes_decrypt;
+			}
+
+		vpaes_set_encrypt_key(key + ctx->key_len/2,
+						ctx->key_len * 4, &xctx->ks2);
+		xctx->xts.block2 = (block128_f)vpaes_encrypt;
+
+		xctx->xts.key1 = &xctx->ks1;
+		break;
+		}
+#endif
+		if (enc)
+			{
+			AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)AES_encrypt;
+			}
+		else
+			{
+			AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1);
+			xctx->xts.block1 = (block128_f)AES_decrypt;
+			}
+
+		AES_set_encrypt_key(key + ctx->key_len/2,
+						ctx->key_len * 4, &xctx->ks2);
+		xctx->xts.block2 = (block128_f)AES_encrypt;
+
+		xctx->xts.key1 = &xctx->ks1;
+		} while (0);
+
+	if (iv)
+		{
+		xctx->xts.key2 = &xctx->ks2;
+		memcpy(ctx->iv, iv, 16);
+		}
+
+	return 1;
+	}
+
+static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len)
+	{
+	EVP_AES_XTS_CTX *xctx = ctx->cipher_data;
+	if (!xctx->xts.key1 || !xctx->xts.key2)
+		return 0;
+	if (!out || !in || len<AES_BLOCK_SIZE)
+		return 0;
+#ifdef OPENSSL_FIPS
+	/* Requirement of SP800-38E */
+	if (FIPS_module_mode() && !(ctx->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) &&
+			(len > (1UL<<20)*16))
+		{
+		EVPerr(EVP_F_AES_XTS_CIPHER, EVP_R_TOO_LARGE);
+		return 0;
+		}
+#endif
+	if (xctx->stream)
+		(*xctx->stream)(in, out, len,
+				xctx->xts.key1, xctx->xts.key2, ctx->iv);
+	else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len,
+								ctx->encrypt))
+		return 0;
+	return 1;
+	}
+
+#define aes_xts_cleanup NULL
+
+#define XTS_FLAGS	(EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \
+			 | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT)
+
+BLOCK_CIPHER_custom(NID_aes,128,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS)
+
+static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr)
+	{
+	EVP_AES_CCM_CTX *cctx = c->cipher_data;
+	switch (type)
+		{
+	case EVP_CTRL_INIT:
+		cctx->key_set = 0;
+		cctx->iv_set = 0;
+		cctx->L = 8;
+		cctx->M = 12;
+		cctx->tag_set = 0;
+		cctx->len_set = 0;
+		return 1;
+
+	case EVP_CTRL_CCM_SET_IVLEN:
+		arg = 15 - arg;
+	case EVP_CTRL_CCM_SET_L:
+		if (arg < 2 || arg > 8)
+			return 0;
+		cctx->L = arg;
+		return 1;
+
+	case EVP_CTRL_CCM_SET_TAG:
+		if ((arg & 1) || arg < 4 || arg > 16)
+			return 0;
+		if ((c->encrypt && ptr) || (!c->encrypt && !ptr))
+			return 0;
+		if (ptr)
+			{
+			cctx->tag_set = 1;
+			memcpy(c->buf, ptr, arg);
+			}
+		cctx->M = arg;
+		return 1;
+
+	case EVP_CTRL_CCM_GET_TAG:
+		if (!c->encrypt || !cctx->tag_set)
+			return 0;
+		if(!CRYPTO_ccm128_tag(&cctx->ccm, ptr, (size_t)arg))
+			return 0;
+		cctx->tag_set = 0;
+		cctx->iv_set = 0;
+		cctx->len_set = 0;
+		return 1;
+
+	default:
+		return -1;
+
+		}
+	}
+
+static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+                        const unsigned char *iv, int enc)
+	{
+	EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+	if (!iv && !key)
+		return 1;
+	if (key) do
+		{
+#ifdef VPAES_CAPABLE
+		if (VPAES_CAPABLE)
+			{
+			vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks);
+			CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+					&cctx->ks, (block128_f)vpaes_encrypt);
+			cctx->key_set = 1;
+			break;
+			}
+#endif
+		AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks);
+		CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L,
+					&cctx->ks, (block128_f)AES_encrypt);
+		cctx->str = NULL;
+		cctx->key_set = 1;
+		} while (0);
+	if (iv)
+		{
+		memcpy(ctx->iv, iv, 15 - cctx->L);
+		cctx->iv_set = 1;
+		}
+	return 1;
+	}
+
+static int aes_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len)
+	{
+	EVP_AES_CCM_CTX *cctx = ctx->cipher_data;
+	CCM128_CONTEXT *ccm = &cctx->ccm;
+	/* If not set up, return error */
+	if (!cctx->iv_set && !cctx->key_set)
+		return -1;
+	if (!ctx->encrypt && !cctx->tag_set)
+		return -1;
+	if (!out)
+		{
+		if (!in)
+			{
+			if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L,len))
+				return -1;
+			cctx->len_set = 1;
+			return len;
+			}
+		/* If have AAD need message length */
+		if (!cctx->len_set && len)
+			return -1;
+		CRYPTO_ccm128_aad(ccm, in, len);
+		return len;
+		}
+	/* EVP_*Final() doesn't return any data */
+	if (!in)
+		return 0;
+	/* If not set length yet do it */
+	if (!cctx->len_set)
+		{
+		if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L, len))
+			return -1;
+		cctx->len_set = 1;
+		}
+	if (ctx->encrypt)
+		{
+		if (cctx->str ? CRYPTO_ccm128_encrypt_ccm64(ccm, in, out, len,
+						cctx->str) :
+				CRYPTO_ccm128_encrypt(ccm, in, out, len))
+			return -1;
+		cctx->tag_set = 1;
+		return len;
+		}
+	else
+		{
+		int rv = -1;
+		if (cctx->str ? !CRYPTO_ccm128_decrypt_ccm64(ccm, in, out, len,
+						cctx->str) :
+				!CRYPTO_ccm128_decrypt(ccm, in, out, len))
+			{
+			unsigned char tag[16];
+			if (CRYPTO_ccm128_tag(ccm, tag, cctx->M))
+				{
+				if (!memcmp(tag, ctx->buf, cctx->M))
+					rv = len;
+				}
+			}
+		if (rv == -1)
+			OPENSSL_cleanse(out, len);
+		cctx->iv_set = 0;
+		cctx->tag_set = 0;
+		cctx->len_set = 0;
+		return rv;
+		}
+
+	}
+
+#define aes_ccm_cleanup NULL
+
+BLOCK_CIPHER_custom(NID_aes,128,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,192,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+BLOCK_CIPHER_custom(NID_aes,256,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS)
+
+#endif
 #endif
diff --git a/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c b/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c
new file mode 100644
index 0000000000..710fb79baf
--- /dev/null
+++ b/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c
@@ -0,0 +1,406 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/aes.h>
+#include <openssl/sha.h>
+#include "evp_locl.h"
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER	0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD		0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY	0x17
+#endif
+
+#if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1)
+#define EVP_CIPH_FLAG_DEFAULT_ASN1 0
+#endif
+
+#define TLS1_1_VERSION 0x0302
+
+typedef struct
+    {
+    AES_KEY		ks;
+    SHA_CTX		head,tail,md;
+    size_t		payload_length;	/* AAD length in decrypt case */
+    union {
+	unsigned int	tls_ver;
+    	unsigned char	tls_aad[16];	/* 13 used */
+    } aux;
+    } EVP_AES_HMAC_SHA1;
+
+#define NO_PAYLOAD_LENGTH	((size_t)-1)
+
+#if	defined(AES_ASM) &&	( \
+	defined(__x86_64)	|| defined(__x86_64__)	|| \
+	defined(_M_AMD64)	|| defined(_M_X64)	|| \
+	defined(__INTEL__)	)
+
+extern unsigned int OPENSSL_ia32cap_P[2];
+#define AESNI_CAPABLE   (1<<(57-32))
+
+int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
+			      AES_KEY *key);
+int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
+			      AES_KEY *key);
+
+void aesni_cbc_encrypt(const unsigned char *in,
+			   unsigned char *out,
+			   size_t length,
+			   const AES_KEY *key,
+			   unsigned char *ivec, int enc);
+
+void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks,
+		const AES_KEY *key, unsigned char iv[16],
+		SHA_CTX *ctx,const void *in0);
+
+#define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data)
+
+static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx,
+			const unsigned char *inkey,
+			const unsigned char *iv, int enc)
+	{
+	EVP_AES_HMAC_SHA1 *key = data(ctx);
+	int ret;
+
+	if (enc)
+		ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks);
+	else
+		ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks);
+
+	SHA1_Init(&key->head);	/* handy when benchmarking */
+	key->tail = key->head;
+	key->md   = key->head;
+
+	key->payload_length = NO_PAYLOAD_LENGTH;
+
+	return ret<0?0:1;
+	}
+
+#define	STITCHED_CALL
+
+#if !defined(STITCHED_CALL)
+#define	aes_off 0
+#endif
+
+void sha1_block_data_order (void *c,const void *p,size_t len);
+
+static void sha1_update(SHA_CTX *c,const void *data,size_t len)
+{	const unsigned char *ptr = data;
+	size_t res;
+
+	if ((res = c->num)) {
+		res = SHA_CBLOCK-res;
+		if (len<res) res=len;
+		SHA1_Update (c,ptr,res);
+		ptr += res;
+		len -= res;
+	}
+
+	res = len % SHA_CBLOCK;
+	len -= res;
+
+	if (len) {
+		sha1_block_data_order(c,ptr,len/SHA_CBLOCK);
+
+		ptr += len;
+		c->Nh += len>>29;
+		c->Nl += len<<=3;
+		if (c->Nl<(unsigned int)len) c->Nh++;
+	}
+
+	if (res)
+		SHA1_Update(c,ptr,res);
+}
+
+#define SHA1_Update sha1_update
+
+static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		      const unsigned char *in, size_t len)
+	{
+	EVP_AES_HMAC_SHA1 *key = data(ctx);
+	unsigned int l;
+	size_t	plen = key->payload_length,
+		iv = 0,		/* explicit IV in TLS 1.1 and later */
+		sha_off = 0;
+#if defined(STITCHED_CALL)
+	size_t	aes_off = 0,
+		blocks;
+
+	sha_off = SHA_CBLOCK-key->md.num;
+#endif
+
+	if (len%AES_BLOCK_SIZE) return 0;
+
+	if (ctx->encrypt) {
+		if (plen==NO_PAYLOAD_LENGTH)
+			plen = len;
+		else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE))
+			return 0;
+		else if (key->aux.tls_ver >= TLS1_1_VERSION)
+			iv = AES_BLOCK_SIZE;
+
+#if defined(STITCHED_CALL)
+		if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) {
+			SHA1_Update(&key->md,in+iv,sha_off);
+
+			aesni_cbc_sha1_enc(in,out,blocks,&key->ks,
+				ctx->iv,&key->md,in+iv+sha_off);
+			blocks *= SHA_CBLOCK;
+			aes_off += blocks;
+			sha_off += blocks;
+			key->md.Nh += blocks>>29;
+			key->md.Nl += blocks<<=3;
+			if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+		} else {
+			sha_off = 0;
+		}
+#endif
+		sha_off += iv;
+		SHA1_Update(&key->md,in+sha_off,plen-sha_off);
+
+		if (plen!=len)	{	/* "TLS" mode of operation */
+			if (in!=out)
+				memcpy(out+aes_off,in+aes_off,plen-aes_off);
+
+			/* calculate HMAC and append it to payload */
+			SHA1_Final(out+plen,&key->md);
+			key->md = key->tail;
+			SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH);
+			SHA1_Final(out+plen,&key->md);
+
+			/* pad the payload|hmac */
+			plen += SHA_DIGEST_LENGTH;
+			for (l=len-plen-1;plen<len;plen++) out[plen]=l;
+			/* encrypt HMAC|padding at once */
+			aesni_cbc_encrypt(out+aes_off,out+aes_off,len-aes_off,
+					&key->ks,ctx->iv,1);
+		} else {
+			aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off,
+					&key->ks,ctx->iv,1);
+		}
+	} else {
+		unsigned char mac[SHA_DIGEST_LENGTH];
+
+		/* decrypt HMAC|padding at once */
+		aesni_cbc_encrypt(in,out,len,
+				&key->ks,ctx->iv,0);
+
+		if (plen) {	/* "TLS" mode of operation */
+			/* figure out payload length */
+			if (len<(size_t)(out[len-1]+1+SHA_DIGEST_LENGTH))
+				return 0;
+
+			len -= (out[len-1]+1+SHA_DIGEST_LENGTH);
+
+			if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3])
+			    >= TLS1_1_VERSION) {
+				len -= AES_BLOCK_SIZE;
+				iv = AES_BLOCK_SIZE;
+			}
+
+			key->aux.tls_aad[plen-2] = len>>8;
+			key->aux.tls_aad[plen-1] = len;
+
+			/* calculate HMAC and verify it */
+			key->md = key->head;
+			SHA1_Update(&key->md,key->aux.tls_aad,plen);
+			SHA1_Update(&key->md,out+iv,len);
+			SHA1_Final(mac,&key->md);
+
+			key->md = key->tail;
+			SHA1_Update(&key->md,mac,SHA_DIGEST_LENGTH);
+			SHA1_Final(mac,&key->md);
+
+			if (memcmp(out+iv+len,mac,SHA_DIGEST_LENGTH))
+				return 0;
+		} else {
+			SHA1_Update(&key->md,out,len);
+		}
+	}
+
+	key->payload_length = NO_PAYLOAD_LENGTH;
+
+	return 1;
+	}
+
+static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+	{
+	EVP_AES_HMAC_SHA1 *key = data(ctx);
+
+	switch (type)
+		{
+	case EVP_CTRL_AEAD_SET_MAC_KEY:
+		{
+		unsigned int  i;
+		unsigned char hmac_key[64];
+
+		memset (hmac_key,0,sizeof(hmac_key));
+
+		if (arg > (int)sizeof(hmac_key)) {
+			SHA1_Init(&key->head);
+			SHA1_Update(&key->head,ptr,arg);
+			SHA1_Final(hmac_key,&key->head);
+		} else {
+			memcpy(hmac_key,ptr,arg);
+		}
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36;		/* ipad */
+		SHA1_Init(&key->head);
+		SHA1_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36^0x5c;	/* opad */
+		SHA1_Init(&key->tail);
+		SHA1_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+		return 1;
+		}
+	case EVP_CTRL_AEAD_TLS1_AAD:
+		{
+		unsigned char *p=ptr;
+		unsigned int   len=p[arg-2]<<8|p[arg-1];
+
+		if (ctx->encrypt)
+			{
+			key->payload_length = len;
+			if ((key->aux.tls_ver=p[arg-4]<<8|p[arg-3]) >= TLS1_1_VERSION) {
+				len -= AES_BLOCK_SIZE;
+				p[arg-2] = len>>8;
+				p[arg-1] = len;
+			}
+			key->md = key->head;
+			SHA1_Update(&key->md,p,arg);
+
+			return (int)(((len+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE)
+				- len);
+			}
+		else
+			{
+			if (arg>13) arg = 13;
+			memcpy(key->aux.tls_aad,ptr,arg);
+			key->payload_length = arg;
+
+			return SHA_DIGEST_LENGTH;
+			}
+		}
+	default:
+		return -1;
+		}
+	}
+
+static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher =
+	{
+#ifdef NID_aes_128_cbc_hmac_sha1
+	NID_aes_128_cbc_hmac_sha1,
+#else
+	NID_undef,
+#endif
+	16,16,16,
+	EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+	aesni_cbc_hmac_sha1_init_key,
+	aesni_cbc_hmac_sha1_cipher,
+	NULL,
+	sizeof(EVP_AES_HMAC_SHA1),
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+	aesni_cbc_hmac_sha1_ctrl,
+	NULL
+	};
+
+static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher =
+	{
+#ifdef NID_aes_256_cbc_hmac_sha1
+	NID_aes_256_cbc_hmac_sha1,
+#else
+	NID_undef,
+#endif
+	16,32,16,
+	EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER,
+	aesni_cbc_hmac_sha1_init_key,
+	aesni_cbc_hmac_sha1_cipher,
+	NULL,
+	sizeof(EVP_AES_HMAC_SHA1),
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv,
+	EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv,
+	aesni_cbc_hmac_sha1_ctrl,
+	NULL
+	};
+
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+	{
+	return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE?
+		&aesni_128_cbc_hmac_sha1_cipher:NULL);
+	}
+
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+	{
+	return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE?
+		&aesni_256_cbc_hmac_sha1_cipher:NULL);
+	}
+#else
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void)
+	{
+	return NULL;
+	}
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void)
+	{
+	return NULL;
+	}
+#endif
+#endif
diff --git a/src/lib/libcrypto/evp/e_des3.c b/src/lib/libcrypto/evp/e_des3.c
index 3232cfe024..1e69972662 100644
--- a/src/lib/libcrypto/evp/e_des3.c
+++ b/src/lib/libcrypto/evp/e_des3.c
@@ -65,6 +65,8 @@
 #include <openssl/des.h>
 #include <openssl/rand.h>
 
+#ifndef OPENSSL_FIPS
+
 static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 			    const unsigned char *iv,int enc);
 
@@ -311,3 +313,4 @@ const EVP_CIPHER *EVP_des_ede3(void)
 	return &des_ede3_ecb;
 }
 #endif
+#endif
diff --git a/src/lib/libcrypto/evp/e_null.c b/src/lib/libcrypto/evp/e_null.c
index 7cf50e1416..f0c1f78b5f 100644
--- a/src/lib/libcrypto/evp/e_null.c
+++ b/src/lib/libcrypto/evp/e_null.c
@@ -61,6 +61,8 @@
 #include <openssl/evp.h>
 #include <openssl/objects.h>
 
+#ifndef OPENSSL_FIPS
+
 static int null_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
 	const unsigned char *iv,int enc);
 static int null_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
@@ -99,4 +101,4 @@ static int null_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
 		memcpy((char *)out,(const char *)in,inl);
 	return 1;
 	}
-
+#endif
diff --git a/src/lib/libcrypto/evp/e_rc2.c b/src/lib/libcrypto/evp/e_rc2.c
index f78d781129..d4c33b58d4 100644
--- a/src/lib/libcrypto/evp/e_rc2.c
+++ b/src/lib/libcrypto/evp/e_rc2.c
@@ -183,7 +183,8 @@ static int rc2_get_asn1_type_and_iv(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
 		key_bits =rc2_magic_to_meth((int)num);
 		if (!key_bits)
 			return(-1);
-		if(i > 0) EVP_CipherInit_ex(c, NULL, NULL, NULL, iv, -1);
+		if(i > 0 && !EVP_CipherInit_ex(c, NULL, NULL, NULL, iv, -1))
+			return -1;
 		EVP_CIPHER_CTX_ctrl(c, EVP_CTRL_SET_RC2_KEY_BITS, key_bits, NULL);
 		EVP_CIPHER_CTX_set_key_length(c, key_bits / 8);
 		}
diff --git a/src/lib/libcrypto/evp/e_rc4.c b/src/lib/libcrypto/evp/e_rc4.c
index 8b5175e0fd..b4f6bda82d 100644
--- a/src/lib/libcrypto/evp/e_rc4.c
+++ b/src/lib/libcrypto/evp/e_rc4.c
@@ -62,6 +62,7 @@
 #ifndef OPENSSL_NO_RC4
 
 #include <openssl/evp.h>
+#include "evp_locl.h"
 #include <openssl/objects.h>
 #include <openssl/rc4.h>
 
diff --git a/src/lib/libcrypto/evp/e_rc4_hmac_md5.c b/src/lib/libcrypto/evp/e_rc4_hmac_md5.c
new file mode 100644
index 0000000000..56563191ba
--- /dev/null
+++ b/src/lib/libcrypto/evp/e_rc4_hmac_md5.c
@@ -0,0 +1,298 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/opensslconf.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#if !defined(OPENSSL_NO_RC4) && !defined(OPENSSL_NO_MD5)
+
+#include <openssl/evp.h>
+#include <openssl/objects.h>
+#include <openssl/rc4.h>
+#include <openssl/md5.h>
+
+#ifndef EVP_CIPH_FLAG_AEAD_CIPHER
+#define EVP_CIPH_FLAG_AEAD_CIPHER	0x200000
+#define EVP_CTRL_AEAD_TLS1_AAD		0x16
+#define EVP_CTRL_AEAD_SET_MAC_KEY	0x17
+#endif
+
+/* FIXME: surely this is available elsewhere? */
+#define EVP_RC4_KEY_SIZE		16
+
+typedef struct
+    {
+    RC4_KEY		ks;
+    MD5_CTX		head,tail,md;
+    size_t		payload_length;
+    } EVP_RC4_HMAC_MD5;
+
+#define NO_PAYLOAD_LENGTH	((size_t)-1)
+
+void rc4_md5_enc (RC4_KEY *key, const void *in0, void *out,
+		MD5_CTX *ctx,const void *inp,size_t blocks);
+
+#define data(ctx) ((EVP_RC4_HMAC_MD5 *)(ctx)->cipher_data)
+
+static int rc4_hmac_md5_init_key(EVP_CIPHER_CTX *ctx,
+			const unsigned char *inkey,
+			const unsigned char *iv, int enc)
+	{
+	EVP_RC4_HMAC_MD5 *key = data(ctx);
+
+	RC4_set_key(&key->ks,EVP_CIPHER_CTX_key_length(ctx),
+		    inkey);
+
+	MD5_Init(&key->head);	/* handy when benchmarking */
+	key->tail = key->head;
+	key->md   = key->head;
+
+	key->payload_length = NO_PAYLOAD_LENGTH;
+
+	return 1;
+	}
+
+#if	!defined(OPENSSL_NO_ASM) &&	( \
+	defined(__x86_64)	|| defined(__x86_64__)	|| \
+	defined(_M_AMD64)	|| defined(_M_X64)	|| \
+	defined(__INTEL__)		) && \
+	!(defined(__APPLE__) && defined(__MACH__))
+#define	STITCHED_CALL
+#endif
+
+#if !defined(STITCHED_CALL)
+#define	rc4_off 0
+#define	md5_off 0
+#endif
+
+static int rc4_hmac_md5_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		      const unsigned char *in, size_t len)
+	{
+	EVP_RC4_HMAC_MD5 *key = data(ctx);
+#if defined(STITCHED_CALL)
+	size_t	rc4_off = 32-1-(key->ks.x&(32-1)),	/* 32 is $MOD from rc4_md5-x86_64.pl */
+		md5_off = MD5_CBLOCK-key->md.num,
+		blocks;
+	unsigned int l;
+	extern unsigned int OPENSSL_ia32cap_P[];
+#endif
+	size_t	plen = key->payload_length;
+
+	if (plen!=NO_PAYLOAD_LENGTH && len!=(plen+MD5_DIGEST_LENGTH)) return 0;
+
+	if (ctx->encrypt) {
+		if (plen==NO_PAYLOAD_LENGTH) plen = len;
+#if defined(STITCHED_CALL)
+		/* cipher has to "fall behind" */
+		if (rc4_off>md5_off) md5_off+=MD5_CBLOCK;
+
+		if (plen>md5_off && (blocks=(plen-md5_off)/MD5_CBLOCK) &&
+		    (OPENSSL_ia32cap_P[0]&(1<<20))==0) {
+			MD5_Update(&key->md,in,md5_off);
+			RC4(&key->ks,rc4_off,in,out);
+
+			rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+				&key->md,in+md5_off,blocks);
+			blocks *= MD5_CBLOCK;
+			rc4_off += blocks;
+			md5_off += blocks;
+			key->md.Nh += blocks>>29;
+			key->md.Nl += blocks<<=3;
+			if (key->md.Nl<(unsigned int)blocks) key->md.Nh++;
+		} else {
+			rc4_off = 0;
+			md5_off = 0;
+		}
+#endif
+		MD5_Update(&key->md,in+md5_off,plen-md5_off);
+
+		if (plen!=len) {	/* "TLS" mode of operation */
+			if (in!=out)
+				memcpy(out+rc4_off,in+rc4_off,plen-rc4_off);
+
+			/* calculate HMAC and append it to payload */
+			MD5_Final(out+plen,&key->md);
+			key->md = key->tail;
+			MD5_Update(&key->md,out+plen,MD5_DIGEST_LENGTH);
+			MD5_Final(out+plen,&key->md);
+			/* encrypt HMAC at once */
+			RC4(&key->ks,len-rc4_off,out+rc4_off,out+rc4_off);
+		} else {
+			RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+		}
+	} else {
+		unsigned char mac[MD5_DIGEST_LENGTH];
+#if defined(STITCHED_CALL)
+		/* digest has to "fall behind" */
+		if (md5_off>rc4_off)	rc4_off += 2*MD5_CBLOCK;
+		else			rc4_off += MD5_CBLOCK;
+
+		if (len>rc4_off && (blocks=(len-rc4_off)/MD5_CBLOCK) &&
+		    (OPENSSL_ia32cap_P[0]&(1<<20))==0) {
+			RC4(&key->ks,rc4_off,in,out);
+			MD5_Update(&key->md,out,md5_off);
+
+			rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off,
+				&key->md,out+md5_off,blocks);
+			blocks *= MD5_CBLOCK;
+			rc4_off += blocks;
+			md5_off += blocks;
+			l = (key->md.Nl+(blocks<<3))&0xffffffffU;
+			if (l<key->md.Nl) key->md.Nh++;
+			key->md.Nl  = l;
+			key->md.Nh += blocks>>29;
+		} else {
+			md5_off=0;
+			rc4_off=0;
+		}
+#endif
+		/* decrypt HMAC at once */
+		RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off);
+		if (plen!=NO_PAYLOAD_LENGTH) {	/* "TLS" mode of operation */
+			MD5_Update(&key->md,out+md5_off,plen-md5_off);
+
+			/* calculate HMAC and verify it */
+			MD5_Final(mac,&key->md);
+			key->md = key->tail;
+			MD5_Update(&key->md,mac,MD5_DIGEST_LENGTH);
+			MD5_Final(mac,&key->md);
+
+			if (memcmp(out+plen,mac,MD5_DIGEST_LENGTH))
+				return 0;
+		} else {
+			MD5_Update(&key->md,out+md5_off,len-md5_off);
+		}
+	}
+
+	key->payload_length = NO_PAYLOAD_LENGTH;
+
+	return 1;
+	}
+
+static int rc4_hmac_md5_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr)
+	{
+	EVP_RC4_HMAC_MD5 *key = data(ctx);
+
+	switch (type)
+		{
+	case EVP_CTRL_AEAD_SET_MAC_KEY:
+		{
+		unsigned int  i;
+		unsigned char hmac_key[64];
+
+		memset (hmac_key,0,sizeof(hmac_key));
+
+		if (arg > (int)sizeof(hmac_key)) {
+			MD5_Init(&key->head);
+			MD5_Update(&key->head,ptr,arg);
+			MD5_Final(hmac_key,&key->head);
+		} else {
+			memcpy(hmac_key,ptr,arg);
+		}
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36;		/* ipad */
+		MD5_Init(&key->head);
+		MD5_Update(&key->head,hmac_key,sizeof(hmac_key));
+
+		for (i=0;i<sizeof(hmac_key);i++)
+			hmac_key[i] ^= 0x36^0x5c;	/* opad */
+		MD5_Init(&key->tail);
+		MD5_Update(&key->tail,hmac_key,sizeof(hmac_key));
+
+		return 1;
+		}
+	case EVP_CTRL_AEAD_TLS1_AAD:
+		{
+		unsigned char *p=ptr;
+		unsigned int   len=p[arg-2]<<8|p[arg-1];
+
+		if (!ctx->encrypt)
+			{
+			len -= MD5_DIGEST_LENGTH;
+			p[arg-2] = len>>8;
+			p[arg-1] = len;
+			}
+		key->payload_length=len;
+		key->md = key->head;
+		MD5_Update(&key->md,p,arg);
+
+		return MD5_DIGEST_LENGTH;
+		}
+	default:
+		return -1;
+		}
+	}
+
+static EVP_CIPHER r4_hmac_md5_cipher=
+	{
+#ifdef NID_rc4_hmac_md5
+	NID_rc4_hmac_md5,
+#else
+	NID_undef,
+#endif
+	1,EVP_RC4_KEY_SIZE,0,
+	EVP_CIPH_STREAM_CIPHER|EVP_CIPH_VARIABLE_LENGTH|EVP_CIPH_FLAG_AEAD_CIPHER,
+	rc4_hmac_md5_init_key,
+	rc4_hmac_md5_cipher,
+	NULL,
+	sizeof(EVP_RC4_HMAC_MD5),
+	NULL,
+	NULL,
+	rc4_hmac_md5_ctrl,
+	NULL
+	};
+
+const EVP_CIPHER *EVP_rc4_hmac_md5(void)
+	{
+	return(&r4_hmac_md5_cipher);
+	}
+#endif
diff --git a/src/lib/libcrypto/evp/evp.h b/src/lib/libcrypto/evp/evp.h
index 9f9795e2d9..0d1b20a7d3 100644
--- a/src/lib/libcrypto/evp/evp.h
+++ b/src/lib/libcrypto/evp/evp.h
@@ -83,7 +83,7 @@
 #define EVP_RC5_32_12_16_KEY_SIZE	16
 */
 #define EVP_MAX_MD_SIZE			64	/* longest known is SHA512 */
-#define EVP_MAX_KEY_LENGTH		32
+#define EVP_MAX_KEY_LENGTH		64
 #define EVP_MAX_IV_LENGTH		16
 #define EVP_MAX_BLOCK_LENGTH		32
 
@@ -116,6 +116,7 @@
 #define EVP_PKEY_DH	NID_dhKeyAgreement
 #define EVP_PKEY_EC	NID_X9_62_id_ecPublicKey
 #define EVP_PKEY_HMAC	NID_hmac
+#define EVP_PKEY_CMAC	NID_cmac
 
 #ifdef	__cplusplus
 extern "C" {
@@ -216,6 +217,8 @@ typedef int evp_verify_method(int type,const unsigned char *m,
 
 #define EVP_MD_FLAG_DIGALGID_CUSTOM		0x0018
 
+#define EVP_MD_FLAG_FIPS	0x0400 /* Note if suitable for use in FIPS mode */
+
 /* Digest ctrls */
 
 #define	EVP_MD_CTRL_DIGALGID			0x1
@@ -325,6 +328,10 @@ struct evp_cipher_st
 #define		EVP_CIPH_CBC_MODE		0x2
 #define		EVP_CIPH_CFB_MODE		0x3
 #define		EVP_CIPH_OFB_MODE		0x4
+#define		EVP_CIPH_CTR_MODE		0x5
+#define		EVP_CIPH_GCM_MODE		0x6
+#define		EVP_CIPH_CCM_MODE		0x7
+#define		EVP_CIPH_XTS_MODE		0x10001
 #define 	EVP_CIPH_MODE			0xF0007
 /* Set if variable length cipher */
 #define 	EVP_CIPH_VARIABLE_LENGTH	0x8
@@ -346,6 +353,15 @@ struct evp_cipher_st
 #define		EVP_CIPH_FLAG_DEFAULT_ASN1	0x1000
 /* Buffer length in bits not bytes: CFB1 mode only */
 #define		EVP_CIPH_FLAG_LENGTH_BITS	0x2000
+/* Note if suitable for use in FIPS mode */
+#define		EVP_CIPH_FLAG_FIPS		0x4000
+/* Allow non FIPS cipher in FIPS mode */
+#define		EVP_CIPH_FLAG_NON_FIPS_ALLOW	0x8000
+/* Cipher handles any and all padding logic as well
+ * as finalisation.
+ */
+#define 	EVP_CIPH_FLAG_CUSTOM_CIPHER	0x100000
+#define		EVP_CIPH_FLAG_AEAD_CIPHER	0x200000
 
 /* ctrl() values */
 
@@ -358,6 +374,34 @@ struct evp_cipher_st
 #define 	EVP_CTRL_RAND_KEY		0x6
 #define 	EVP_CTRL_PBE_PRF_NID		0x7
 #define 	EVP_CTRL_COPY			0x8
+#define 	EVP_CTRL_GCM_SET_IVLEN		0x9
+#define 	EVP_CTRL_GCM_GET_TAG		0x10
+#define 	EVP_CTRL_GCM_SET_TAG		0x11
+#define		EVP_CTRL_GCM_SET_IV_FIXED	0x12
+#define		EVP_CTRL_GCM_IV_GEN		0x13
+#define		EVP_CTRL_CCM_SET_IVLEN		EVP_CTRL_GCM_SET_IVLEN
+#define		EVP_CTRL_CCM_GET_TAG		EVP_CTRL_GCM_GET_TAG
+#define		EVP_CTRL_CCM_SET_TAG		EVP_CTRL_GCM_SET_TAG
+#define		EVP_CTRL_CCM_SET_L		0x14
+#define		EVP_CTRL_CCM_SET_MSGLEN		0x15
+/* AEAD cipher deduces payload length and returns number of bytes
+ * required to store MAC and eventual padding. Subsequent call to
+ * EVP_Cipher even appends/verifies MAC.
+ */
+#define		EVP_CTRL_AEAD_TLS1_AAD		0x16
+/* Used by composite AEAD ciphers, no-op in GCM, CCM... */
+#define		EVP_CTRL_AEAD_SET_MAC_KEY	0x17
+/* Set the GCM invocation field, decrypt only */
+#define		EVP_CTRL_GCM_SET_IV_INV		0x18
+
+/* GCM TLS constants */
+/* Length of fixed part of IV derived from PRF */
+#define EVP_GCM_TLS_FIXED_IV_LEN			4
+/* Length of explicit part of IV part of TLS records */
+#define EVP_GCM_TLS_EXPLICIT_IV_LEN			8
+/* Length of tag for TLS */
+#define EVP_GCM_TLS_TAG_LEN				16
+
 
 typedef struct evp_cipher_info_st
 	{
@@ -375,7 +419,7 @@ struct evp_cipher_ctx_st
 	unsigned char  oiv[EVP_MAX_IV_LENGTH];	/* original iv */
 	unsigned char  iv[EVP_MAX_IV_LENGTH];	/* working iv */
 	unsigned char buf[EVP_MAX_BLOCK_LENGTH];/* saved partial block */
-	int num;				/* used by cfb/ofb mode */
+	int num;				/* used by cfb/ofb/ctr mode */
 
 	void *app_data;		/* application stuff */
 	int key_len;		/* May change for variable length cipher */
@@ -695,6 +739,9 @@ const EVP_MD *EVP_dev_crypto_md5(void);
 #ifndef OPENSSL_NO_RC4
 const EVP_CIPHER *EVP_rc4(void);
 const EVP_CIPHER *EVP_rc4_40(void);
+#ifndef OPENSSL_NO_MD5
+const EVP_CIPHER *EVP_rc4_hmac_md5(void);
+#endif
 #endif
 #ifndef OPENSSL_NO_IDEA
 const EVP_CIPHER *EVP_idea_ecb(void);
@@ -741,9 +788,10 @@ const EVP_CIPHER *EVP_aes_128_cfb8(void);
 const EVP_CIPHER *EVP_aes_128_cfb128(void);
 # define EVP_aes_128_cfb EVP_aes_128_cfb128
 const EVP_CIPHER *EVP_aes_128_ofb(void);
-#if 0
 const EVP_CIPHER *EVP_aes_128_ctr(void);
-#endif
+const EVP_CIPHER *EVP_aes_128_gcm(void);
+const EVP_CIPHER *EVP_aes_128_ccm(void);
+const EVP_CIPHER *EVP_aes_128_xts(void);
 const EVP_CIPHER *EVP_aes_192_ecb(void);
 const EVP_CIPHER *EVP_aes_192_cbc(void);
 const EVP_CIPHER *EVP_aes_192_cfb1(void);
@@ -751,9 +799,9 @@ const EVP_CIPHER *EVP_aes_192_cfb8(void);
 const EVP_CIPHER *EVP_aes_192_cfb128(void);
 # define EVP_aes_192_cfb EVP_aes_192_cfb128
 const EVP_CIPHER *EVP_aes_192_ofb(void);
-#if 0
 const EVP_CIPHER *EVP_aes_192_ctr(void);
-#endif
+const EVP_CIPHER *EVP_aes_192_gcm(void);
+const EVP_CIPHER *EVP_aes_192_ccm(void);
 const EVP_CIPHER *EVP_aes_256_ecb(void);
 const EVP_CIPHER *EVP_aes_256_cbc(void);
 const EVP_CIPHER *EVP_aes_256_cfb1(void);
@@ -761,8 +809,13 @@ const EVP_CIPHER *EVP_aes_256_cfb8(void);
 const EVP_CIPHER *EVP_aes_256_cfb128(void);
 # define EVP_aes_256_cfb EVP_aes_256_cfb128
 const EVP_CIPHER *EVP_aes_256_ofb(void);
-#if 0
 const EVP_CIPHER *EVP_aes_256_ctr(void);
+const EVP_CIPHER *EVP_aes_256_gcm(void);
+const EVP_CIPHER *EVP_aes_256_ccm(void);
+const EVP_CIPHER *EVP_aes_256_xts(void);
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void);
+const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void);
 #endif
 #endif
 #ifndef OPENSSL_NO_CAMELLIA
@@ -1047,13 +1100,22 @@ void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth,
 #define EVP_PKEY_CTRL_CMS_DECRYPT	10
 #define EVP_PKEY_CTRL_CMS_SIGN		11
 
+#define EVP_PKEY_CTRL_CIPHER		12
+
 #define EVP_PKEY_ALG_CTRL		0x1000
 
 
 #define EVP_PKEY_FLAG_AUTOARGLEN	2
+/* Method handles all operations: don't assume any digest related
+ * defaults.
+ */
+#define EVP_PKEY_FLAG_SIGCTX_CUSTOM	4
 
 const EVP_PKEY_METHOD *EVP_PKEY_meth_find(int type);
 EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags);
+void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags,
+				const EVP_PKEY_METHOD *meth);
+void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src);
 void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth);
 int EVP_PKEY_meth_add0(const EVP_PKEY_METHOD *pmeth);
 
@@ -1071,7 +1133,7 @@ int EVP_PKEY_CTX_get_operation(EVP_PKEY_CTX *ctx);
 void EVP_PKEY_CTX_set0_keygen_info(EVP_PKEY_CTX *ctx, int *dat, int datlen);
 
 EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
-				unsigned char *key, int keylen);
+				const unsigned char *key, int keylen);
 
 void EVP_PKEY_CTX_set_data(EVP_PKEY_CTX *ctx, void *data);
 void *EVP_PKEY_CTX_get_data(EVP_PKEY_CTX *ctx);
@@ -1190,8 +1252,13 @@ void ERR_load_EVP_strings(void);
 /* Error codes for the EVP functions. */
 
 /* Function codes. */
+#define EVP_F_AESNI_INIT_KEY				 165
+#define EVP_F_AESNI_XTS_CIPHER				 176
 #define EVP_F_AES_INIT_KEY				 133
+#define EVP_F_AES_XTS					 172
+#define EVP_F_AES_XTS_CIPHER				 175
 #define EVP_F_CAMELLIA_INIT_KEY				 159
+#define EVP_F_CMAC_INIT					 173
 #define EVP_F_D2I_PKEY					 100
 #define EVP_F_DO_SIGVER_INIT				 161
 #define EVP_F_DSAPKEY2PKCS8				 134
@@ -1246,15 +1313,24 @@ void ERR_load_EVP_strings(void);
 #define EVP_F_EVP_RIJNDAEL				 126
 #define EVP_F_EVP_SIGNFINAL				 107
 #define EVP_F_EVP_VERIFYFINAL				 108
+#define EVP_F_FIPS_CIPHERINIT				 166
+#define EVP_F_FIPS_CIPHER_CTX_COPY			 170
+#define EVP_F_FIPS_CIPHER_CTX_CTRL			 167
+#define EVP_F_FIPS_CIPHER_CTX_SET_KEY_LENGTH		 171
+#define EVP_F_FIPS_DIGESTINIT				 168
+#define EVP_F_FIPS_MD_CTX_COPY				 169
+#define EVP_F_HMAC_INIT_EX				 174
 #define EVP_F_INT_CTX_NEW				 157
 #define EVP_F_PKCS5_PBE_KEYIVGEN			 117
 #define EVP_F_PKCS5_V2_PBE_KEYIVGEN			 118
+#define EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN			 164
 #define EVP_F_PKCS8_SET_BROKEN				 112
 #define EVP_F_PKEY_SET_TYPE				 158
 #define EVP_F_RC2_MAGIC_TO_METH				 109
 #define EVP_F_RC5_CTRL					 125
 
 /* Reason codes. */
+#define EVP_R_AES_IV_SETUP_FAILED			 162
 #define EVP_R_AES_KEY_SETUP_FAILED			 143
 #define EVP_R_ASN1_LIB					 140
 #define EVP_R_BAD_BLOCK_LENGTH				 136
@@ -1272,6 +1348,7 @@ void ERR_load_EVP_strings(void);
 #define EVP_R_DECODE_ERROR				 114
 #define EVP_R_DIFFERENT_KEY_TYPES			 101
 #define EVP_R_DIFFERENT_PARAMETERS			 153
+#define EVP_R_DISABLED_FOR_FIPS				 163
 #define EVP_R_ENCODE_ERROR				 115
 #define EVP_R_EVP_PBE_CIPHERINIT_ERROR			 119
 #define EVP_R_EXPECTING_AN_RSA_KEY			 127
@@ -1303,6 +1380,7 @@ void ERR_load_EVP_strings(void);
 #define EVP_R_PRIVATE_KEY_DECODE_ERROR			 145
 #define EVP_R_PRIVATE_KEY_ENCODE_ERROR			 146
 #define EVP_R_PUBLIC_KEY_NOT_RSA			 106
+#define EVP_R_TOO_LARGE					 164
 #define EVP_R_UNKNOWN_CIPHER				 160
 #define EVP_R_UNKNOWN_DIGEST				 161
 #define EVP_R_UNKNOWN_PBE_ALGORITHM			 121
diff --git a/src/lib/libcrypto/evp/evp_enc.c b/src/lib/libcrypto/evp/evp_enc.c
index c268d25cb4..0c54f05e6e 100644
--- a/src/lib/libcrypto/evp/evp_enc.c
+++ b/src/lib/libcrypto/evp/evp_enc.c
@@ -64,8 +64,18 @@
 #ifndef OPENSSL_NO_ENGINE
 #include <openssl/engine.h>
 #endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 #include "evp_locl.h"
 
+#ifdef OPENSSL_FIPS
+#define M_do_cipher(ctx, out, in, inl) FIPS_cipher(ctx, out, in, inl)
+#else
+#define M_do_cipher(ctx, out, in, inl) ctx->cipher->do_cipher(ctx, out, in, inl)
+#endif
+
+
 const char EVP_version[]="EVP" OPENSSL_VERSION_PTEXT;
 
 void EVP_CIPHER_CTX_init(EVP_CIPHER_CTX *ctx)
@@ -115,10 +125,14 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp
 		/* Ensure a context left lying around from last time is cleared
 		 * (the previous check attempted to avoid this if the same
 		 * ENGINE and EVP_CIPHER could be used). */
-		EVP_CIPHER_CTX_cleanup(ctx);
-
-		/* Restore encrypt field: it is zeroed by cleanup */
-		ctx->encrypt = enc;
+		if (ctx->cipher)
+			{
+			unsigned long flags = ctx->flags;
+			EVP_CIPHER_CTX_cleanup(ctx);
+			/* Restore encrypt and flags */
+			ctx->encrypt = enc;
+			ctx->flags = flags;
+			}
 #ifndef OPENSSL_NO_ENGINE
 		if(impl)
 			{
@@ -155,6 +169,10 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp
 			ctx->engine = NULL;
 #endif
 
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return FIPS_cipherinit(ctx, cipher, key, iv, enc);
+#endif
 		ctx->cipher=cipher;
 		if (ctx->cipher->ctx_size)
 			{
@@ -187,6 +205,10 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp
 		}
 #ifndef OPENSSL_NO_ENGINE
 skip_to_init:
+#endif
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		return FIPS_cipherinit(ctx, cipher, key, iv, enc);
 #endif
 	/* we assume block size is a power of 2 in *cryptUpdate */
 	OPENSSL_assert(ctx->cipher->block_size == 1
@@ -214,6 +236,13 @@ skip_to_init:
 			memcpy(ctx->iv, ctx->oiv, EVP_CIPHER_CTX_iv_length(ctx));
 			break;
 
+			case EVP_CIPH_CTR_MODE:
+			ctx->num = 0;
+			/* Don't reuse IV for CTR mode */
+			if(iv)
+				memcpy(ctx->iv, iv, EVP_CIPHER_CTX_iv_length(ctx));
+			break;
+
 			default:
 			return 0;
 			break;
@@ -280,6 +309,16 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
 	{
 	int i,j,bl;
 
+	if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+		{
+		i = M_do_cipher(ctx, out, in, inl);
+		if (i < 0)
+			return 0;
+		else
+			*outl = i;
+		return 1;
+		}
+
 	if (inl <= 0)
 		{
 		*outl = 0;
@@ -288,7 +327,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
 
 	if(ctx->buf_len == 0 && (inl&(ctx->block_mask)) == 0)
 		{
-		if(ctx->cipher->do_cipher(ctx,out,in,inl))
+		if(M_do_cipher(ctx,out,in,inl))
 			{
 			*outl=inl;
 			return 1;
@@ -315,7 +354,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
 			{
 			j=bl-i;
 			memcpy(&(ctx->buf[i]),in,j);
-			if(!ctx->cipher->do_cipher(ctx,out,ctx->buf,bl)) return 0;
+			if(!M_do_cipher(ctx,out,ctx->buf,bl)) return 0;
 			inl-=j;
 			in+=j;
 			out+=bl;
@@ -328,7 +367,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
 	inl-=i;
 	if (inl > 0)
 		{
-		if(!ctx->cipher->do_cipher(ctx,out,in,inl)) return 0;
+		if(!M_do_cipher(ctx,out,in,inl)) return 0;
 		*outl+=inl;
 		}
 
@@ -350,6 +389,16 @@ int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
 	int n,ret;
 	unsigned int i, b, bl;
 
+	if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+		{
+		ret = M_do_cipher(ctx, out, NULL, 0);
+		if (ret < 0)
+			return 0;
+		else 
+			*outl = ret;
+		return 1;
+		}
+
 	b=ctx->cipher->block_size;
 	OPENSSL_assert(b <= sizeof ctx->buf);
 	if (b == 1)
@@ -372,7 +421,7 @@ int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
 	n=b-bl;
 	for (i=bl; i<b; i++)
 		ctx->buf[i]=n;
-	ret=ctx->cipher->do_cipher(ctx,out,ctx->buf,b);
+	ret=M_do_cipher(ctx,out,ctx->buf,b);
 
 
 	if(ret)
@@ -387,6 +436,19 @@ int EVP_DecryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl,
 	int fix_len;
 	unsigned int b;
 
+	if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+		{
+		fix_len = M_do_cipher(ctx, out, in, inl);
+		if (fix_len < 0)
+			{
+			*outl = 0;
+			return 0;
+			}
+		else
+			*outl = fix_len;
+		return 1;
+		}
+
 	if (inl <= 0)
 		{
 		*outl = 0;
@@ -440,8 +502,18 @@ int EVP_DecryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
 	{
 	int i,n;
 	unsigned int b;
-
 	*outl=0;
+
+	if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)
+		{
+		i = M_do_cipher(ctx, out, NULL, 0);
+		if (i < 0)
+			return 0;
+		else
+			*outl = i;
+		return 1;
+		}
+
 	b=ctx->cipher->block_size;
 	if (ctx->flags & EVP_CIPH_NO_PADDING)
 		{
@@ -496,6 +568,7 @@ void EVP_CIPHER_CTX_free(EVP_CIPHER_CTX *ctx)
 
 int EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *c)
 	{
+#ifndef OPENSSL_FIPS
 	if (c->cipher != NULL)
 		{
 		if(c->cipher->cleanup && !c->cipher->cleanup(c))
@@ -506,11 +579,15 @@ int EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *c)
 		}
 	if (c->cipher_data)
 		OPENSSL_free(c->cipher_data);
+#endif
 #ifndef OPENSSL_NO_ENGINE
 	if (c->engine)
 		/* The EVP_CIPHER we used belongs to an ENGINE, release the
 		 * functional reference we held for this reason. */
 		ENGINE_finish(c->engine);
+#endif
+#ifdef OPENSSL_FIPS
+	FIPS_cipher_ctx_cleanup(c);
 #endif
 	memset(c,0,sizeof(EVP_CIPHER_CTX));
 	return 1;
diff --git a/src/lib/libcrypto/evp/evp_err.c b/src/lib/libcrypto/evp/evp_err.c
index d8bfec0959..db0f76d59b 100644
--- a/src/lib/libcrypto/evp/evp_err.c
+++ b/src/lib/libcrypto/evp/evp_err.c
@@ -1,6 +1,6 @@
 /* crypto/evp/evp_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2008 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -70,8 +70,13 @@
 
 static ERR_STRING_DATA EVP_str_functs[]=
 	{
+{ERR_FUNC(EVP_F_AESNI_INIT_KEY),	"AESNI_INIT_KEY"},
+{ERR_FUNC(EVP_F_AESNI_XTS_CIPHER),	"AESNI_XTS_CIPHER"},
 {ERR_FUNC(EVP_F_AES_INIT_KEY),	"AES_INIT_KEY"},
+{ERR_FUNC(EVP_F_AES_XTS),	"AES_XTS"},
+{ERR_FUNC(EVP_F_AES_XTS_CIPHER),	"AES_XTS_CIPHER"},
 {ERR_FUNC(EVP_F_CAMELLIA_INIT_KEY),	"CAMELLIA_INIT_KEY"},
+{ERR_FUNC(EVP_F_CMAC_INIT),	"CMAC_INIT"},
 {ERR_FUNC(EVP_F_D2I_PKEY),	"D2I_PKEY"},
 {ERR_FUNC(EVP_F_DO_SIGVER_INIT),	"DO_SIGVER_INIT"},
 {ERR_FUNC(EVP_F_DSAPKEY2PKCS8),	"DSAPKEY2PKCS8"},
@@ -86,7 +91,7 @@ static ERR_STRING_DATA EVP_str_functs[]=
 {ERR_FUNC(EVP_F_EVP_DIGESTINIT_EX),	"EVP_DigestInit_ex"},
 {ERR_FUNC(EVP_F_EVP_ENCRYPTFINAL_EX),	"EVP_EncryptFinal_ex"},
 {ERR_FUNC(EVP_F_EVP_MD_CTX_COPY_EX),	"EVP_MD_CTX_copy_ex"},
-{ERR_FUNC(EVP_F_EVP_MD_SIZE),	"EVP_MD_SIZE"},
+{ERR_FUNC(EVP_F_EVP_MD_SIZE),	"EVP_MD_size"},
 {ERR_FUNC(EVP_F_EVP_OPENINIT),	"EVP_OpenInit"},
 {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD),	"EVP_PBE_alg_add"},
 {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD_TYPE),	"EVP_PBE_alg_add_type"},
@@ -126,9 +131,17 @@ static ERR_STRING_DATA EVP_str_functs[]=
 {ERR_FUNC(EVP_F_EVP_RIJNDAEL),	"EVP_RIJNDAEL"},
 {ERR_FUNC(EVP_F_EVP_SIGNFINAL),	"EVP_SignFinal"},
 {ERR_FUNC(EVP_F_EVP_VERIFYFINAL),	"EVP_VerifyFinal"},
+{ERR_FUNC(EVP_F_FIPS_CIPHERINIT),	"FIPS_CIPHERINIT"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_COPY),	"FIPS_CIPHER_CTX_COPY"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_CTRL),	"FIPS_CIPHER_CTX_CTRL"},
+{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_SET_KEY_LENGTH),	"FIPS_CIPHER_CTX_SET_KEY_LENGTH"},
+{ERR_FUNC(EVP_F_FIPS_DIGESTINIT),	"FIPS_DIGESTINIT"},
+{ERR_FUNC(EVP_F_FIPS_MD_CTX_COPY),	"FIPS_MD_CTX_COPY"},
+{ERR_FUNC(EVP_F_HMAC_INIT_EX),	"HMAC_Init_ex"},
 {ERR_FUNC(EVP_F_INT_CTX_NEW),	"INT_CTX_NEW"},
 {ERR_FUNC(EVP_F_PKCS5_PBE_KEYIVGEN),	"PKCS5_PBE_keyivgen"},
 {ERR_FUNC(EVP_F_PKCS5_V2_PBE_KEYIVGEN),	"PKCS5_v2_PBE_keyivgen"},
+{ERR_FUNC(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN),	"PKCS5_V2_PBKDF2_KEYIVGEN"},
 {ERR_FUNC(EVP_F_PKCS8_SET_BROKEN),	"PKCS8_set_broken"},
 {ERR_FUNC(EVP_F_PKEY_SET_TYPE),	"PKEY_SET_TYPE"},
 {ERR_FUNC(EVP_F_RC2_MAGIC_TO_METH),	"RC2_MAGIC_TO_METH"},
@@ -138,6 +151,7 @@ static ERR_STRING_DATA EVP_str_functs[]=
 
 static ERR_STRING_DATA EVP_str_reasons[]=
 	{
+{ERR_REASON(EVP_R_AES_IV_SETUP_FAILED)   ,"aes iv setup failed"},
 {ERR_REASON(EVP_R_AES_KEY_SETUP_FAILED)  ,"aes key setup failed"},
 {ERR_REASON(EVP_R_ASN1_LIB)              ,"asn1 lib"},
 {ERR_REASON(EVP_R_BAD_BLOCK_LENGTH)      ,"bad block length"},
@@ -155,6 +169,7 @@ static ERR_STRING_DATA EVP_str_reasons[]=
 {ERR_REASON(EVP_R_DECODE_ERROR)          ,"decode error"},
 {ERR_REASON(EVP_R_DIFFERENT_KEY_TYPES)   ,"different key types"},
 {ERR_REASON(EVP_R_DIFFERENT_PARAMETERS)  ,"different parameters"},
+{ERR_REASON(EVP_R_DISABLED_FOR_FIPS)     ,"disabled for fips"},
 {ERR_REASON(EVP_R_ENCODE_ERROR)          ,"encode error"},
 {ERR_REASON(EVP_R_EVP_PBE_CIPHERINIT_ERROR),"evp pbe cipherinit error"},
 {ERR_REASON(EVP_R_EXPECTING_AN_RSA_KEY)  ,"expecting an rsa key"},
@@ -186,6 +201,7 @@ static ERR_STRING_DATA EVP_str_reasons[]=
 {ERR_REASON(EVP_R_PRIVATE_KEY_DECODE_ERROR),"private key decode error"},
 {ERR_REASON(EVP_R_PRIVATE_KEY_ENCODE_ERROR),"private key encode error"},
 {ERR_REASON(EVP_R_PUBLIC_KEY_NOT_RSA)    ,"public key not rsa"},
+{ERR_REASON(EVP_R_TOO_LARGE)             ,"too large"},
 {ERR_REASON(EVP_R_UNKNOWN_CIPHER)        ,"unknown cipher"},
 {ERR_REASON(EVP_R_UNKNOWN_DIGEST)        ,"unknown digest"},
 {ERR_REASON(EVP_R_UNKNOWN_PBE_ALGORITHM) ,"unknown pbe algorithm"},
diff --git a/src/lib/libcrypto/evp/evp_key.c b/src/lib/libcrypto/evp/evp_key.c
index 839d6a3a16..7961fbebf2 100644
--- a/src/lib/libcrypto/evp/evp_key.c
+++ b/src/lib/libcrypto/evp/evp_key.c
@@ -120,7 +120,7 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md,
 	unsigned char md_buf[EVP_MAX_MD_SIZE];
 	int niv,nkey,addmd=0;
 	unsigned int mds=0,i;
-
+	int rv = 0;
 	nkey=type->key_len;
 	niv=type->iv_len;
 	OPENSSL_assert(nkey <= EVP_MAX_KEY_LENGTH);
@@ -134,17 +134,24 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md,
 		if (!EVP_DigestInit_ex(&c,md, NULL))
 			return 0;
 		if (addmd++)
-			EVP_DigestUpdate(&c,&(md_buf[0]),mds);
-		EVP_DigestUpdate(&c,data,datal);
+			if (!EVP_DigestUpdate(&c,&(md_buf[0]),mds))
+				goto err;
+		if (!EVP_DigestUpdate(&c,data,datal))
+			goto err;
 		if (salt != NULL)
-			EVP_DigestUpdate(&c,salt,PKCS5_SALT_LEN);
-		EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds);
+			if (!EVP_DigestUpdate(&c,salt,PKCS5_SALT_LEN))
+				goto err;
+		if (!EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds))
+			goto err;
 
 		for (i=1; i<(unsigned int)count; i++)
 			{
-			EVP_DigestInit_ex(&c,md, NULL);
-			EVP_DigestUpdate(&c,&(md_buf[0]),mds);
-			EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds);
+			if (!EVP_DigestInit_ex(&c,md, NULL))
+				goto err;
+			if (!EVP_DigestUpdate(&c,&(md_buf[0]),mds))
+				goto err;
+			if (!EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds))
+				goto err;
 			}
 		i=0;
 		if (nkey)
@@ -173,8 +180,10 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md,
 			}
 		if ((nkey == 0) && (niv == 0)) break;
 		}
+	rv = type->key_len;
+	err:
 	EVP_MD_CTX_cleanup(&c);
 	OPENSSL_cleanse(&(md_buf[0]),EVP_MAX_MD_SIZE);
-	return(type->key_len);
+	return rv;
 	}
 
diff --git a/src/lib/libcrypto/evp/evp_lib.c b/src/lib/libcrypto/evp/evp_lib.c
index 40951a04f0..b180e4828a 100644
--- a/src/lib/libcrypto/evp/evp_lib.c
+++ b/src/lib/libcrypto/evp/evp_lib.c
@@ -67,6 +67,8 @@ int EVP_CIPHER_param_to_asn1(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
 
 	if (c->cipher->set_asn1_parameters != NULL)
 		ret=c->cipher->set_asn1_parameters(c,type);
+	else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1)
+		ret=EVP_CIPHER_set_asn1_iv(c, type);
 	else
 		ret=-1;
 	return(ret);
@@ -78,6 +80,8 @@ int EVP_CIPHER_asn1_to_param(EVP_CIPHER_CTX *c, ASN1_TYPE *type)
 
 	if (c->cipher->get_asn1_parameters != NULL)
 		ret=c->cipher->get_asn1_parameters(c,type);
+	else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1)
+		ret=EVP_CIPHER_get_asn1_iv(c, type);
 	else
 		ret=-1;
 	return(ret);
diff --git a/src/lib/libcrypto/evp/evp_locl.h b/src/lib/libcrypto/evp/evp_locl.h
index 292d74c188..08c0a66d39 100644
--- a/src/lib/libcrypto/evp/evp_locl.h
+++ b/src/lib/libcrypto/evp/evp_locl.h
@@ -343,3 +343,43 @@ struct evp_pkey_method_st
 	} /* EVP_PKEY_METHOD */;
 
 void evp_pkey_set_cb_translate(BN_GENCB *cb, EVP_PKEY_CTX *ctx);
+
+int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
+			     ASN1_TYPE *param,
+			     const EVP_CIPHER *c, const EVP_MD *md, int en_de);
+
+#ifdef OPENSSL_FIPS
+
+#ifdef OPENSSL_DOING_MAKEDEPEND
+#undef SHA1_Init
+#undef SHA1_Update
+#undef SHA224_Init
+#undef SHA256_Init
+#undef SHA384_Init
+#undef SHA512_Init
+#undef DES_set_key_unchecked
+#endif
+
+#define RIPEMD160_Init	private_RIPEMD160_Init
+#define WHIRLPOOL_Init	private_WHIRLPOOL_Init
+#define MD5_Init	private_MD5_Init
+#define MD4_Init	private_MD4_Init
+#define MD2_Init	private_MD2_Init
+#define MDC2_Init	private_MDC2_Init
+#define SHA_Init	private_SHA_Init
+#define SHA1_Init	private_SHA1_Init
+#define SHA224_Init	private_SHA224_Init
+#define SHA256_Init	private_SHA256_Init
+#define SHA384_Init	private_SHA384_Init
+#define SHA512_Init	private_SHA512_Init
+
+#define BF_set_key	private_BF_set_key
+#define CAST_set_key	private_CAST_set_key
+#define idea_set_encrypt_key	private_idea_set_encrypt_key
+#define SEED_set_key	private_SEED_set_key
+#define RC2_set_key	private_RC2_set_key
+#define RC4_set_key	private_RC4_set_key
+#define DES_set_key_unchecked	private_DES_set_key_unchecked
+#define Camellia_set_key	private_Camellia_set_key
+
+#endif
diff --git a/src/lib/libcrypto/evp/evp_pbe.c b/src/lib/libcrypto/evp/evp_pbe.c
index c9d932d205..f8c32d825e 100644
--- a/src/lib/libcrypto/evp/evp_pbe.c
+++ b/src/lib/libcrypto/evp/evp_pbe.c
@@ -61,6 +61,7 @@
 #include <openssl/evp.h>
 #include <openssl/pkcs12.h>
 #include <openssl/x509.h>
+#include "evp_locl.h"
 
 /* Password based encryption (PBE) functions */
 
@@ -87,6 +88,10 @@ static const EVP_PBE_CTL builtin_pbe[] =
 	{EVP_PBE_TYPE_OUTER, NID_pbeWithSHA1AndRC2_CBC,
 			NID_rc2_64_cbc, NID_sha1, PKCS5_PBE_keyivgen},
 
+#ifndef OPENSSL_NO_HMAC
+	{EVP_PBE_TYPE_OUTER, NID_id_pbkdf2, -1, -1, PKCS5_v2_PBKDF2_keyivgen},
+#endif
+
 	{EVP_PBE_TYPE_OUTER, NID_pbe_WithSHA1And128BitRC4,
 			NID_rc4, NID_sha1, PKCS12_PBE_keyivgen},
 	{EVP_PBE_TYPE_OUTER, NID_pbe_WithSHA1And40BitRC4,
diff --git a/src/lib/libcrypto/evp/m_dss.c b/src/lib/libcrypto/evp/m_dss.c
index 48c2689504..4ad63ada6f 100644
--- a/src/lib/libcrypto/evp/m_dss.c
+++ b/src/lib/libcrypto/evp/m_dss.c
@@ -66,6 +66,7 @@
 #endif
 
 #ifndef OPENSSL_NO_SHA
+#ifndef OPENSSL_FIPS
 
 static int init(EVP_MD_CTX *ctx)
 	{ return SHA1_Init(ctx->md_data); }
@@ -97,3 +98,4 @@ const EVP_MD *EVP_dss(void)
 	return(&dsa_md);
 	}
 #endif
+#endif
diff --git a/src/lib/libcrypto/evp/m_dss1.c b/src/lib/libcrypto/evp/m_dss1.c
index 4f03fb70e0..f80170efeb 100644
--- a/src/lib/libcrypto/evp/m_dss1.c
+++ b/src/lib/libcrypto/evp/m_dss1.c
@@ -68,6 +68,8 @@
 #include <openssl/dsa.h>
 #endif
 
+#ifndef OPENSSL_FIPS 
+
 static int init(EVP_MD_CTX *ctx)
 	{ return SHA1_Init(ctx->md_data); }
 
@@ -98,3 +100,4 @@ const EVP_MD *EVP_dss1(void)
 	return(&dss1_md);
 	}
 #endif
+#endif
diff --git a/src/lib/libcrypto/evp/m_ecdsa.c b/src/lib/libcrypto/evp/m_ecdsa.c
index 8d87a49ebe..4b15fb0f6c 100644
--- a/src/lib/libcrypto/evp/m_ecdsa.c
+++ b/src/lib/libcrypto/evp/m_ecdsa.c
@@ -116,6 +116,8 @@
 #include <openssl/x509.h>
 
 #ifndef OPENSSL_NO_SHA
+#ifndef OPENSSL_FIPS
+
 static int init(EVP_MD_CTX *ctx)
 	{ return SHA1_Init(ctx->md_data); }
 
@@ -146,3 +148,4 @@ const EVP_MD *EVP_ecdsa(void)
 	return(&ecdsa_md);
 	}
 #endif
+#endif
diff --git a/src/lib/libcrypto/evp/m_md4.c b/src/lib/libcrypto/evp/m_md4.c
index 1e0b7c5b42..6d47f61b27 100644
--- a/src/lib/libcrypto/evp/m_md4.c
+++ b/src/lib/libcrypto/evp/m_md4.c
@@ -69,6 +69,8 @@
 #include <openssl/rsa.h>
 #endif
 
+#include "evp_locl.h"
+
 static int init(EVP_MD_CTX *ctx)
 	{ return MD4_Init(ctx->md_data); }
 
diff --git a/src/lib/libcrypto/evp/m_md5.c b/src/lib/libcrypto/evp/m_md5.c
index 63c142119e..9a8bae0258 100644
--- a/src/lib/libcrypto/evp/m_md5.c
+++ b/src/lib/libcrypto/evp/m_md5.c
@@ -68,6 +68,7 @@
 #ifndef OPENSSL_NO_RSA
 #include <openssl/rsa.h>
 #endif
+#include "evp_locl.h"
 
 static int init(EVP_MD_CTX *ctx)
 	{ return MD5_Init(ctx->md_data); }
diff --git a/src/lib/libcrypto/evp/m_ripemd.c b/src/lib/libcrypto/evp/m_ripemd.c
index a1d60ee78d..7bf4804cf8 100644
--- a/src/lib/libcrypto/evp/m_ripemd.c
+++ b/src/lib/libcrypto/evp/m_ripemd.c
@@ -68,6 +68,7 @@
 #ifndef OPENSSL_NO_RSA
 #include <openssl/rsa.h>
 #endif
+#include "evp_locl.h"
 
 static int init(EVP_MD_CTX *ctx)
 	{ return RIPEMD160_Init(ctx->md_data); }
diff --git a/src/lib/libcrypto/evp/m_sha1.c b/src/lib/libcrypto/evp/m_sha1.c
index 9a2790fdea..3cb11f1ebb 100644
--- a/src/lib/libcrypto/evp/m_sha1.c
+++ b/src/lib/libcrypto/evp/m_sha1.c
@@ -59,6 +59,8 @@
 #include <stdio.h>
 #include "cryptlib.h"
 
+#ifndef OPENSSL_FIPS
+
 #ifndef OPENSSL_NO_SHA
 
 #include <openssl/evp.h>
@@ -68,6 +70,7 @@
 #include <openssl/rsa.h>
 #endif
 
+
 static int init(EVP_MD_CTX *ctx)
 	{ return SHA1_Init(ctx->md_data); }
 
@@ -202,3 +205,5 @@ static const EVP_MD sha512_md=
 const EVP_MD *EVP_sha512(void)
 	{ return(&sha512_md); }
 #endif	/* ifndef OPENSSL_NO_SHA512 */
+
+#endif
diff --git a/src/lib/libcrypto/evp/m_wp.c b/src/lib/libcrypto/evp/m_wp.c
index 1ce47c040b..c51bc2d5d1 100644
--- a/src/lib/libcrypto/evp/m_wp.c
+++ b/src/lib/libcrypto/evp/m_wp.c
@@ -9,6 +9,7 @@
 #include <openssl/objects.h>
 #include <openssl/x509.h>
 #include <openssl/whrlpool.h>
+#include "evp_locl.h"
 
 static int init(EVP_MD_CTX *ctx)
 	{ return WHIRLPOOL_Init(ctx->md_data); }
diff --git a/src/lib/libcrypto/evp/names.c b/src/lib/libcrypto/evp/names.c
index f2869f5c78..6311ad7cfb 100644
--- a/src/lib/libcrypto/evp/names.c
+++ b/src/lib/libcrypto/evp/names.c
@@ -66,6 +66,10 @@ int EVP_add_cipher(const EVP_CIPHER *c)
 	{
 	int r;
 
+	if (c == NULL) return 0;
+
+	OPENSSL_init();
+
 	r=OBJ_NAME_add(OBJ_nid2sn(c->nid),OBJ_NAME_TYPE_CIPHER_METH,(const char *)c);
 	if (r == 0) return(0);
 	check_defer(c->nid);
@@ -78,6 +82,7 @@ int EVP_add_digest(const EVP_MD *md)
 	{
 	int r;
 	const char *name;
+	OPENSSL_init();
 
 	name=OBJ_nid2sn(md->type);
 	r=OBJ_NAME_add(name,OBJ_NAME_TYPE_MD_METH,(const char *)md);
diff --git a/src/lib/libcrypto/evp/p5_crpt.c b/src/lib/libcrypto/evp/p5_crpt.c
index 7ecfa8dad9..294cc90d87 100644
--- a/src/lib/libcrypto/evp/p5_crpt.c
+++ b/src/lib/libcrypto/evp/p5_crpt.c
@@ -82,6 +82,8 @@ int PKCS5_PBE_keyivgen(EVP_CIPHER_CTX *cctx, const char *pass, int passlen,
 	unsigned char *salt;
 	const unsigned char *pbuf;
 	int mdsize;
+	int rv = 0;
+	EVP_MD_CTX_init(&ctx);
 
 	/* Extract useful info from parameter */
 	if (param == NULL || param->type != V_ASN1_SEQUENCE ||
@@ -104,29 +106,38 @@ int PKCS5_PBE_keyivgen(EVP_CIPHER_CTX *cctx, const char *pass, int passlen,
 	if(!pass) passlen = 0;
 	else if(passlen == -1) passlen = strlen(pass);
 
-	EVP_MD_CTX_init(&ctx);
-	EVP_DigestInit_ex(&ctx, md, NULL);
-	EVP_DigestUpdate(&ctx, pass, passlen);
-	EVP_DigestUpdate(&ctx, salt, saltlen);
+	if (!EVP_DigestInit_ex(&ctx, md, NULL))
+		goto err;
+	if (!EVP_DigestUpdate(&ctx, pass, passlen))
+		goto err;
+	if (!EVP_DigestUpdate(&ctx, salt, saltlen))
+		goto err;
 	PBEPARAM_free(pbe);
-	EVP_DigestFinal_ex(&ctx, md_tmp, NULL);
+	if (!EVP_DigestFinal_ex(&ctx, md_tmp, NULL))
+		goto err;
 	mdsize = EVP_MD_size(md);
 	if (mdsize < 0)
 	    return 0;
 	for (i = 1; i < iter; i++) {
-		EVP_DigestInit_ex(&ctx, md, NULL);
-		EVP_DigestUpdate(&ctx, md_tmp, mdsize);
-		EVP_DigestFinal_ex (&ctx, md_tmp, NULL);
+		if (!EVP_DigestInit_ex(&ctx, md, NULL))
+			goto err;
+		if (!EVP_DigestUpdate(&ctx, md_tmp, mdsize))
+			goto err;
+		if (!EVP_DigestFinal_ex (&ctx, md_tmp, NULL))
+			goto err;
 	}
-	EVP_MD_CTX_cleanup(&ctx);
 	OPENSSL_assert(EVP_CIPHER_key_length(cipher) <= (int)sizeof(md_tmp));
 	memcpy(key, md_tmp, EVP_CIPHER_key_length(cipher));
 	OPENSSL_assert(EVP_CIPHER_iv_length(cipher) <= 16);
 	memcpy(iv, md_tmp + (16 - EVP_CIPHER_iv_length(cipher)),
 						 EVP_CIPHER_iv_length(cipher));
-	EVP_CipherInit_ex(cctx, cipher, NULL, key, iv, en_de);
+	if (!EVP_CipherInit_ex(cctx, cipher, NULL, key, iv, en_de))
+		goto err;
 	OPENSSL_cleanse(md_tmp, EVP_MAX_MD_SIZE);
 	OPENSSL_cleanse(key, EVP_MAX_KEY_LENGTH);
 	OPENSSL_cleanse(iv, EVP_MAX_IV_LENGTH);
-	return 1;
+	rv = 1;
+	err:
+	EVP_MD_CTX_cleanup(&ctx);
+	return rv;
 }
diff --git a/src/lib/libcrypto/evp/p5_crpt2.c b/src/lib/libcrypto/evp/p5_crpt2.c
index 334379f310..975d004df4 100644
--- a/src/lib/libcrypto/evp/p5_crpt2.c
+++ b/src/lib/libcrypto/evp/p5_crpt2.c
@@ -62,6 +62,7 @@
 #include <openssl/x509.h>
 #include <openssl/evp.h>
 #include <openssl/hmac.h>
+#include "evp_locl.h"
 
 /* set this to print out info about the keygen algorithm */
 /* #define DEBUG_PKCS5V2 */
@@ -110,10 +111,14 @@ int PKCS5_PBKDF2_HMAC(const char *pass, int passlen,
 		itmp[1] = (unsigned char)((i >> 16) & 0xff);
 		itmp[2] = (unsigned char)((i >> 8) & 0xff);
 		itmp[3] = (unsigned char)(i & 0xff);
-		HMAC_Init_ex(&hctx, pass, passlen, digest, NULL);
-		HMAC_Update(&hctx, salt, saltlen);
-		HMAC_Update(&hctx, itmp, 4);
-		HMAC_Final(&hctx, digtmp, NULL);
+		if (!HMAC_Init_ex(&hctx, pass, passlen, digest, NULL)
+			|| !HMAC_Update(&hctx, salt, saltlen)
+			|| !HMAC_Update(&hctx, itmp, 4)
+			|| !HMAC_Final(&hctx, digtmp, NULL))
+			{
+			HMAC_CTX_cleanup(&hctx);
+			return 0;
+			}
 		memcpy(p, digtmp, cplen);
 		for(j = 1; j < iter; j++)
 			{
@@ -168,27 +173,24 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
                          ASN1_TYPE *param, const EVP_CIPHER *c, const EVP_MD *md,
                          int en_de)
 {
-	unsigned char *salt, key[EVP_MAX_KEY_LENGTH];
 	const unsigned char *pbuf;
-	int saltlen, iter, plen;
-	unsigned int keylen;
+	int plen;
 	PBE2PARAM *pbe2 = NULL;
 	const EVP_CIPHER *cipher;
-	PBKDF2PARAM *kdf = NULL;
-	const EVP_MD *prfmd;
-	int prf_nid, hmac_md_nid;
+
+	int rv = 0;
 
 	if (param == NULL || param->type != V_ASN1_SEQUENCE ||
 	    param->value.sequence == NULL) {
 		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
-		return 0;
+		goto err;
 	}
 
 	pbuf = param->value.sequence->data;
 	plen = param->value.sequence->length;
 	if(!(pbe2 = d2i_PBE2PARAM(NULL, &pbuf, plen))) {
 		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
-		return 0;
+		goto err;
 	}
 
 	/* See if we recognise the key derivation function */
@@ -211,38 +213,63 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
 	}
 
 	/* Fixup cipher based on AlgorithmIdentifier */
-	EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, en_de);
+	if (!EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, en_de))
+		goto err;
 	if(EVP_CIPHER_asn1_to_param(ctx, pbe2->encryption->parameter) < 0) {
 		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
 					EVP_R_CIPHER_PARAMETER_ERROR);
 		goto err;
 	}
+	rv = PKCS5_v2_PBKDF2_keyivgen(ctx, pass, passlen,
+					pbe2->keyfunc->parameter, c, md, en_de);
+	err:
+	PBE2PARAM_free(pbe2);
+	return rv;
+}
+
+int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
+                         ASN1_TYPE *param,
+			 const EVP_CIPHER *c, const EVP_MD *md, int en_de)
+{
+	unsigned char *salt, key[EVP_MAX_KEY_LENGTH];
+	const unsigned char *pbuf;
+	int saltlen, iter, plen;
+	int rv = 0;
+	unsigned int keylen = 0;
+	int prf_nid, hmac_md_nid;
+	PBKDF2PARAM *kdf = NULL;
+	const EVP_MD *prfmd;
+
+	if (EVP_CIPHER_CTX_cipher(ctx) == NULL)
+		{
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_NO_CIPHER_SET);
+		goto err;
+		}
 	keylen = EVP_CIPHER_CTX_key_length(ctx);
 	OPENSSL_assert(keylen <= sizeof key);
 
-	/* Now decode key derivation function */
+	/* Decode parameter */
 
-	if(!pbe2->keyfunc->parameter ||
-		 (pbe2->keyfunc->parameter->type != V_ASN1_SEQUENCE))
+	if(!param || (param->type != V_ASN1_SEQUENCE))
 		{
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_DECODE_ERROR);
 		goto err;
 		}
 
-	pbuf = pbe2->keyfunc->parameter->value.sequence->data;
-	plen = pbe2->keyfunc->parameter->value.sequence->length;
+	pbuf = param->value.sequence->data;
+	plen = param->value.sequence->length;
+
 	if(!(kdf = d2i_PBKDF2PARAM(NULL, &pbuf, plen)) ) {
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR);
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_DECODE_ERROR);
 		goto err;
 	}
 
-	PBE2PARAM_free(pbe2);
-	pbe2 = NULL;
+	keylen = EVP_CIPHER_CTX_key_length(ctx);
 
 	/* Now check the parameters of the kdf */
 
 	if(kdf->keylength && (ASN1_INTEGER_get(kdf->keylength) != (int)keylen)){
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,
 						EVP_R_UNSUPPORTED_KEYLENGTH);
 		goto err;
 	}
@@ -254,19 +281,19 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
 
 	if (!EVP_PBE_find(EVP_PBE_TYPE_PRF, prf_nid, NULL, &hmac_md_nid, 0))
 		{
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
 		goto err;
 		}
 
 	prfmd = EVP_get_digestbynid(hmac_md_nid);
 	if (prfmd == NULL)
 		{
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_PRF);
 		goto err;
 		}
 
 	if(kdf->salt->type != V_ASN1_OCTET_STRING) {
-		EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,
+		EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,
 						EVP_R_UNSUPPORTED_SALT_TYPE);
 		goto err;
 	}
@@ -278,15 +305,11 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen,
 	if(!PKCS5_PBKDF2_HMAC(pass, passlen, salt, saltlen, iter, prfmd,
 						   keylen, key))
 		goto err;
-	EVP_CipherInit_ex(ctx, NULL, NULL, key, NULL, en_de);
-	OPENSSL_cleanse(key, keylen);
-	PBKDF2PARAM_free(kdf);
-	return 1;
-
+	rv = EVP_CipherInit_ex(ctx, NULL, NULL, key, NULL, en_de);
 	err:
-	PBE2PARAM_free(pbe2);
+	OPENSSL_cleanse(key, keylen);
 	PBKDF2PARAM_free(kdf);
-	return 0;
+	return rv;
 }
 
 #ifdef DEBUG_PKCS5V2
diff --git a/src/lib/libcrypto/evp/p_open.c b/src/lib/libcrypto/evp/p_open.c
index 53a59a295c..c748fbea87 100644
--- a/src/lib/libcrypto/evp/p_open.c
+++ b/src/lib/libcrypto/evp/p_open.c
@@ -115,7 +115,8 @@ int EVP_OpenFinal(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
 	int i;
 
 	i=EVP_DecryptFinal_ex(ctx,out,outl);
-	EVP_DecryptInit_ex(ctx,NULL,NULL,NULL,NULL);
+	if (i)
+		i = EVP_DecryptInit_ex(ctx,NULL,NULL,NULL,NULL);
 	return(i);
 	}
 #else /* !OPENSSL_NO_RSA */
diff --git a/src/lib/libcrypto/evp/p_seal.c b/src/lib/libcrypto/evp/p_seal.c
index d8324526e7..e5919b0fbf 100644
--- a/src/lib/libcrypto/evp/p_seal.c
+++ b/src/lib/libcrypto/evp/p_seal.c
@@ -110,6 +110,7 @@ int EVP_SealFinal(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl)
 	{
 	int i;
 	i = EVP_EncryptFinal_ex(ctx,out,outl);
-	EVP_EncryptInit_ex(ctx,NULL,NULL,NULL,NULL);
+	if (i) 
+		i = EVP_EncryptInit_ex(ctx,NULL,NULL,NULL,NULL);
 	return i;
 	}
diff --git a/src/lib/libcrypto/evp/p_sign.c b/src/lib/libcrypto/evp/p_sign.c
index bb893f5bde..dfa48c157c 100644
--- a/src/lib/libcrypto/evp/p_sign.c
+++ b/src/lib/libcrypto/evp/p_sign.c
@@ -80,18 +80,20 @@ int EVP_SignFinal(EVP_MD_CTX *ctx, unsigned char *sigret, unsigned int *siglen,
 	{
 	unsigned char m[EVP_MAX_MD_SIZE];
 	unsigned int m_len;
-	int i,ok=0,v;
+	int i=0,ok=0,v;
 	EVP_MD_CTX tmp_ctx;
+	EVP_PKEY_CTX *pkctx = NULL;
 
 	*siglen=0;
 	EVP_MD_CTX_init(&tmp_ctx);
-	EVP_MD_CTX_copy_ex(&tmp_ctx,ctx);   
-	EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len);
+	if (!EVP_MD_CTX_copy_ex(&tmp_ctx,ctx))
+		goto err;  
+	if (!EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len))
+		goto err;
 	EVP_MD_CTX_cleanup(&tmp_ctx);
 
 	if (ctx->digest->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
 		{
-		EVP_PKEY_CTX *pkctx = NULL;
 		size_t sltmp = (size_t)EVP_PKEY_size(pkey);
 		i = 0;
 		pkctx = EVP_PKEY_CTX_new(pkey, NULL);
diff --git a/src/lib/libcrypto/evp/p_verify.c b/src/lib/libcrypto/evp/p_verify.c
index 41d4b67130..5f5c409f45 100644
--- a/src/lib/libcrypto/evp/p_verify.c
+++ b/src/lib/libcrypto/evp/p_verify.c
@@ -67,17 +67,19 @@ int EVP_VerifyFinal(EVP_MD_CTX *ctx, const unsigned char *sigbuf,
 	{
 	unsigned char m[EVP_MAX_MD_SIZE];
 	unsigned int m_len;
-	int i,ok=0,v;
+	int i=-1,ok=0,v;
 	EVP_MD_CTX tmp_ctx;
+	EVP_PKEY_CTX *pkctx = NULL;
 
 	EVP_MD_CTX_init(&tmp_ctx);
-	EVP_MD_CTX_copy_ex(&tmp_ctx,ctx);     
-	EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len);
+	if (!EVP_MD_CTX_copy_ex(&tmp_ctx,ctx))
+		goto err;    
+	if (!EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len))
+		goto err;
 	EVP_MD_CTX_cleanup(&tmp_ctx);
 
 	if (ctx->digest->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE)
 		{
-		EVP_PKEY_CTX *pkctx = NULL;
 		i = -1;
 		pkctx = EVP_PKEY_CTX_new(pkey, NULL);
 		if (!pkctx)
diff --git a/src/lib/libcrypto/evp/pmeth_gn.c b/src/lib/libcrypto/evp/pmeth_gn.c
index 5d74161a09..4651c81370 100644
--- a/src/lib/libcrypto/evp/pmeth_gn.c
+++ b/src/lib/libcrypto/evp/pmeth_gn.c
@@ -199,7 +199,7 @@ int EVP_PKEY_CTX_get_keygen_info(EVP_PKEY_CTX *ctx, int idx)
 	}
 
 EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
-				unsigned char *key, int keylen)
+				const unsigned char *key, int keylen)
 	{
 	EVP_PKEY_CTX *mac_ctx = NULL;
 	EVP_PKEY *mac_key = NULL;
@@ -209,7 +209,8 @@ EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e,
 	if (EVP_PKEY_keygen_init(mac_ctx) <= 0)
 		goto merr;
 	if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN,
-				EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key) <= 0)
+				EVP_PKEY_CTRL_SET_MAC_KEY,
+				keylen, (void *)key) <= 0)
 		goto merr;
 	if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0)
 		goto merr;
diff --git a/src/lib/libcrypto/evp/pmeth_lib.c b/src/lib/libcrypto/evp/pmeth_lib.c
index 5481d4b8a5..acfa7b6f87 100644
--- a/src/lib/libcrypto/evp/pmeth_lib.c
+++ b/src/lib/libcrypto/evp/pmeth_lib.c
@@ -73,7 +73,7 @@ DECLARE_STACK_OF(EVP_PKEY_METHOD)
 STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL;
 
 extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth;
-extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth;
+extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth, cmac_pkey_meth;
 
 static const EVP_PKEY_METHOD *standard_methods[] =
 	{
@@ -90,6 +90,7 @@ static const EVP_PKEY_METHOD *standard_methods[] =
 	&ec_pkey_meth,
 #endif
 	&hmac_pkey_meth,
+	&cmac_pkey_meth
 	};
 
 DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *,
@@ -203,6 +204,8 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags)
 	if (!pmeth)
 		return NULL;
 
+	memset(pmeth, 0, sizeof(EVP_PKEY_METHOD));
+
 	pmeth->pkey_id = id;
 	pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC;
 
@@ -235,6 +238,56 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags)
 	return pmeth;
 	}
 
+void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags,
+				const EVP_PKEY_METHOD *meth)
+	{
+	if (ppkey_id)
+		*ppkey_id = meth->pkey_id;
+	if (pflags)
+		*pflags = meth->flags;
+	}
+
+void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src)
+	{
+
+	dst->init = src->init;
+	dst->copy = src->copy;
+	dst->cleanup = src->cleanup;
+
+	dst->paramgen_init = src->paramgen_init;
+	dst->paramgen = src->paramgen;
+
+	dst->keygen_init = src->keygen_init;
+	dst->keygen = src->keygen;
+
+	dst->sign_init = src->sign_init;
+	dst->sign = src->sign;
+
+	dst->verify_init = src->verify_init;
+	dst->verify = src->verify;
+
+	dst->verify_recover_init = src->verify_recover_init;
+	dst->verify_recover = src->verify_recover;
+
+	dst->signctx_init = src->signctx_init;
+	dst->signctx = src->signctx;
+
+	dst->verifyctx_init = src->verifyctx_init;
+	dst->verifyctx = src->verifyctx;
+
+	dst->encrypt_init = src->encrypt_init;
+	dst->encrypt = src->encrypt;
+
+	dst->decrypt_init = src->decrypt_init;
+	dst->decrypt = src->decrypt;
+
+	dst->derive_init = src->derive_init;
+	dst->derive = src->derive;
+
+	dst->ctrl = src->ctrl;
+	dst->ctrl_str = src->ctrl_str;
+	}
+
 void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth)
 	{
 	if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC))
diff --git a/src/lib/libcrypto/hmac/hm_ameth.c b/src/lib/libcrypto/hmac/hm_ameth.c
index 6d8a89149e..e03f24aeda 100644
--- a/src/lib/libcrypto/hmac/hm_ameth.c
+++ b/src/lib/libcrypto/hmac/hm_ameth.c
@@ -153,7 +153,7 @@ const EVP_PKEY_ASN1_METHOD hmac_asn1_meth =
 
 	hmac_size,
 	0,
-	0,0,0,0,0,0,
+	0,0,0,0,0,0,0,
 
 	hmac_key_free,
 	hmac_pkey_ctrl,
diff --git a/src/lib/libcrypto/hmac/hm_pmeth.c b/src/lib/libcrypto/hmac/hm_pmeth.c
index 71e8567a14..0daa44511d 100644
--- a/src/lib/libcrypto/hmac/hm_pmeth.c
+++ b/src/lib/libcrypto/hmac/hm_pmeth.c
@@ -100,7 +100,8 @@ static int pkey_hmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src)
 	dctx = dst->data;
 	dctx->md = sctx->md;
 	HMAC_CTX_init(&dctx->ctx);
-	HMAC_CTX_copy(&dctx->ctx, &sctx->ctx);
+	if (!HMAC_CTX_copy(&dctx->ctx, &sctx->ctx))
+		return 0;
 	if (sctx->ktmp.data)
 		{
 		if (!ASN1_OCTET_STRING_set(&dctx->ktmp,
@@ -141,7 +142,8 @@ static int pkey_hmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey)
 static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count)
 	{
 	HMAC_PKEY_CTX *hctx = ctx->pctx->data;
-	HMAC_Update(&hctx->ctx, data, count);
+	if (!HMAC_Update(&hctx->ctx, data, count))
+		return 0;
 	return 1;
 	}
 
@@ -167,7 +169,8 @@ static int hmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
 	if (!sig)
 		return 1;
 
-	HMAC_Final(&hctx->ctx, sig, &hlen);
+	if (!HMAC_Final(&hctx->ctx, sig, &hlen))
+		return 0;
 	*siglen = (size_t)hlen;
 	return 1;
 	}
@@ -192,8 +195,9 @@ static int pkey_hmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
 
 		case EVP_PKEY_CTRL_DIGESTINIT:
 		key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr;
-		HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md,
-				ctx->engine);
+		if (!HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md,
+				ctx->engine))
+			return 0;
 		break;
 
 		default:
diff --git a/src/lib/libcrypto/hmac/hmac.c b/src/lib/libcrypto/hmac/hmac.c
index 6c98fc43a3..ba27cbf56f 100644
--- a/src/lib/libcrypto/hmac/hmac.c
+++ b/src/lib/libcrypto/hmac/hmac.c
@@ -61,12 +61,34 @@
 #include "cryptlib.h"
 #include <openssl/hmac.h>
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 int HMAC_Init_ex(HMAC_CTX *ctx, const void *key, int len,
 		  const EVP_MD *md, ENGINE *impl)
 	{
 	int i,j,reset=0;
 	unsigned char pad[HMAC_MAX_MD_CBLOCK];
 
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		{
+		/* If we have an ENGINE need to allow non FIPS */
+		if ((impl || ctx->i_ctx.engine)
+			&&  !(ctx->i_ctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW))
+			{
+			EVPerr(EVP_F_HMAC_INIT_EX, EVP_R_DISABLED_FOR_FIPS);
+			return 0;
+			}
+		/* Other algorithm blocking will be done in FIPS_cmac_init,
+		 * via FIPS_hmac_init_ex().
+		 */
+		if (!impl && !ctx->i_ctx.engine)
+			return FIPS_hmac_init_ex(ctx, key, len, md, NULL);
+		}
+#endif
+
 	if (md != NULL)
 		{
 		reset=1;
@@ -133,6 +155,10 @@ int HMAC_Init(HMAC_CTX *ctx, const void *key, int len, const EVP_MD *md)
 
 int HMAC_Update(HMAC_CTX *ctx, const unsigned char *data, size_t len)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->i_ctx.engine)
+		return FIPS_hmac_update(ctx, data, len);
+#endif
 	return EVP_DigestUpdate(&ctx->md_ctx,data,len);
 	}
 
@@ -140,6 +166,10 @@ int HMAC_Final(HMAC_CTX *ctx, unsigned char *md, unsigned int *len)
 	{
 	unsigned int i;
 	unsigned char buf[EVP_MAX_MD_SIZE];
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->i_ctx.engine)
+		return FIPS_hmac_final(ctx, md, len);
+#endif
 
 	if (!EVP_DigestFinal_ex(&ctx->md_ctx,buf,&i))
 		goto err;
@@ -179,6 +209,13 @@ int HMAC_CTX_copy(HMAC_CTX *dctx, HMAC_CTX *sctx)
 
 void HMAC_CTX_cleanup(HMAC_CTX *ctx)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !ctx->i_ctx.engine)
+		{
+		FIPS_hmac_ctx_cleanup(ctx);
+		return;
+		}
+#endif
 	EVP_MD_CTX_cleanup(&ctx->i_ctx);
 	EVP_MD_CTX_cleanup(&ctx->o_ctx);
 	EVP_MD_CTX_cleanup(&ctx->md_ctx);
diff --git a/src/lib/libcrypto/ia64cpuid.S b/src/lib/libcrypto/ia64cpuid.S
index d705fff7ee..7832b9b640 100644
--- a/src/lib/libcrypto/ia64cpuid.S
+++ b/src/lib/libcrypto/ia64cpuid.S
@@ -26,7 +26,7 @@ OPENSSL_atomic_add:
 { .mii;	mov		ar.ccv=r2
 	add		r8=r2,r33
 	mov		r3=r2		};;
-{ .mmi;	mf
+{ .mmi;	mf;;
 	cmpxchg4.acq	r2=[r32],r8,ar.ccv
 	nop.i		0		};;
 { .mib;	cmp.ne		p6,p0=r2,r3
diff --git a/src/lib/libcrypto/idea/i_cbc.c b/src/lib/libcrypto/idea/i_cbc.c
new file mode 100644
index 0000000000..ecb9cb8b83
--- /dev/null
+++ b/src/lib/libcrypto/idea/i_cbc.c
@@ -0,0 +1,168 @@
+/* crypto/idea/i_cbc.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <openssl/idea.h>
+#include "idea_lcl.h"
+
+void idea_cbc_encrypt(const unsigned char *in, unsigned char *out, long length,
+	     IDEA_KEY_SCHEDULE *ks, unsigned char *iv, int encrypt)
+	{
+	register unsigned long tin0,tin1;
+	register unsigned long tout0,tout1,xor0,xor1;
+	register long l=length;
+	unsigned long tin[2];
+
+	if (encrypt)
+		{
+		n2l(iv,tout0);
+		n2l(iv,tout1);
+		iv-=8;
+		for (l-=8; l>=0; l-=8)
+			{
+			n2l(in,tin0);
+			n2l(in,tin1);
+			tin0^=tout0;
+			tin1^=tout1;
+			tin[0]=tin0;
+			tin[1]=tin1;
+			idea_encrypt(tin,ks);
+			tout0=tin[0]; l2n(tout0,out);
+			tout1=tin[1]; l2n(tout1,out);
+			}
+		if (l != -8)
+			{
+			n2ln(in,tin0,tin1,l+8);
+			tin0^=tout0;
+			tin1^=tout1;
+			tin[0]=tin0;
+			tin[1]=tin1;
+			idea_encrypt(tin,ks);
+			tout0=tin[0]; l2n(tout0,out);
+			tout1=tin[1]; l2n(tout1,out);
+			}
+		l2n(tout0,iv);
+		l2n(tout1,iv);
+		}
+	else
+		{
+		n2l(iv,xor0);
+		n2l(iv,xor1);
+		iv-=8;
+		for (l-=8; l>=0; l-=8)
+			{
+			n2l(in,tin0); tin[0]=tin0;
+			n2l(in,tin1); tin[1]=tin1;
+			idea_encrypt(tin,ks);
+			tout0=tin[0]^xor0;
+			tout1=tin[1]^xor1;
+			l2n(tout0,out);
+			l2n(tout1,out);
+			xor0=tin0;
+			xor1=tin1;
+			}
+		if (l != -8)
+			{
+			n2l(in,tin0); tin[0]=tin0;
+			n2l(in,tin1); tin[1]=tin1;
+			idea_encrypt(tin,ks);
+			tout0=tin[0]^xor0;
+			tout1=tin[1]^xor1;
+			l2nn(tout0,tout1,out,l+8);
+			xor0=tin0;
+			xor1=tin1;
+			}
+		l2n(xor0,iv);
+		l2n(xor1,iv);
+		}
+	tin0=tin1=tout0=tout1=xor0=xor1=0;
+	tin[0]=tin[1]=0;
+	}
+
+void idea_encrypt(unsigned long *d, IDEA_KEY_SCHEDULE *key)
+	{
+	register IDEA_INT *p;
+	register unsigned long x1,x2,x3,x4,t0,t1,ul;
+
+	x2=d[0];
+	x1=(x2>>16);
+	x4=d[1];
+	x3=(x4>>16);
+
+	p= &(key->data[0][0]);
+
+	E_IDEA(0);
+	E_IDEA(1);
+	E_IDEA(2);
+	E_IDEA(3);
+	E_IDEA(4);
+	E_IDEA(5);
+	E_IDEA(6);
+	E_IDEA(7);
+
+	x1&=0xffff;
+	idea_mul(x1,x1,*p,ul); p++;
+
+	t0= x3+ *(p++);
+	t1= x2+ *(p++);
+
+	x4&=0xffff;
+	idea_mul(x4,x4,*p,ul);
+
+	d[0]=(t0&0xffff)|((x1&0xffff)<<16);
+	d[1]=(x4&0xffff)|((t1&0xffff)<<16);
+	}
diff --git a/src/lib/libcrypto/idea/i_cfb64.c b/src/lib/libcrypto/idea/i_cfb64.c
new file mode 100644
index 0000000000..66d49d520e
--- /dev/null
+++ b/src/lib/libcrypto/idea/i_cfb64.c
@@ -0,0 +1,122 @@
+/* crypto/idea/i_cfb64.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <openssl/idea.h>
+#include "idea_lcl.h"
+
+/* The input and output encrypted as though 64bit cfb mode is being
+ * used.  The extra state information to record how much of the
+ * 64bit block we have used is contained in *num;
+ */
+
+void idea_cfb64_encrypt(const unsigned char *in, unsigned char *out,
+			long length, IDEA_KEY_SCHEDULE *schedule,
+			unsigned char *ivec, int *num, int encrypt)
+	{
+	register unsigned long v0,v1,t;
+	register int n= *num;
+	register long l=length;
+	unsigned long ti[2];
+	unsigned char *iv,c,cc;
+
+	iv=(unsigned char *)ivec;
+	if (encrypt)
+		{
+		while (l--)
+			{
+			if (n == 0)
+				{
+				n2l(iv,v0); ti[0]=v0;
+				n2l(iv,v1); ti[1]=v1;
+				idea_encrypt((unsigned long *)ti,schedule);
+				iv=(unsigned char *)ivec;
+				t=ti[0]; l2n(t,iv);
+				t=ti[1]; l2n(t,iv);
+				iv=(unsigned char *)ivec;
+				}
+			c= *(in++)^iv[n];
+			*(out++)=c;
+			iv[n]=c;
+			n=(n+1)&0x07;
+			}
+		}
+	else
+		{
+		while (l--)
+			{
+			if (n == 0)
+				{
+				n2l(iv,v0); ti[0]=v0;
+				n2l(iv,v1); ti[1]=v1;
+				idea_encrypt((unsigned long *)ti,schedule);
+				iv=(unsigned char *)ivec;
+				t=ti[0]; l2n(t,iv);
+				t=ti[1]; l2n(t,iv);
+				iv=(unsigned char *)ivec;
+				}
+			cc= *(in++);
+			c=iv[n];
+			iv[n]=cc;
+			*(out++)=c^cc;
+			n=(n+1)&0x07;
+			}
+		}
+	v0=v1=ti[0]=ti[1]=t=c=cc=0;
+	*num=n;
+	}
+
diff --git a/src/lib/libcrypto/idea/i_ecb.c b/src/lib/libcrypto/idea/i_ecb.c
new file mode 100644
index 0000000000..fef38230a7
--- /dev/null
+++ b/src/lib/libcrypto/idea/i_ecb.c
@@ -0,0 +1,85 @@
+/* crypto/idea/i_ecb.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <openssl/idea.h>
+#include "idea_lcl.h"
+#include <openssl/opensslv.h>
+
+const char IDEA_version[]="IDEA" OPENSSL_VERSION_PTEXT;
+
+const char *idea_options(void)
+	{
+	if (sizeof(short) != sizeof(IDEA_INT))
+		return("idea(int)");
+	else
+		return("idea(short)");
+	}
+
+void idea_ecb_encrypt(const unsigned char *in, unsigned char *out,
+	     IDEA_KEY_SCHEDULE *ks)
+	{
+	unsigned long l0,l1,d[2];
+
+	n2l(in,l0); d[0]=l0;
+	n2l(in,l1); d[1]=l1;
+	idea_encrypt(d,ks);
+	l0=d[0]; l2n(l0,out);
+	l1=d[1]; l2n(l1,out);
+	l0=l1=d[0]=d[1]=0;
+	}
+
diff --git a/src/lib/libcrypto/idea/i_ofb64.c b/src/lib/libcrypto/idea/i_ofb64.c
new file mode 100644
index 0000000000..e749e88e34
--- /dev/null
+++ b/src/lib/libcrypto/idea/i_ofb64.c
@@ -0,0 +1,111 @@
+/* crypto/idea/i_ofb64.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <openssl/idea.h>
+#include "idea_lcl.h"
+
+/* The input and output encrypted as though 64bit ofb mode is being
+ * used.  The extra state information to record how much of the
+ * 64bit block we have used is contained in *num;
+ */
+void idea_ofb64_encrypt(const unsigned char *in, unsigned char *out,
+			long length, IDEA_KEY_SCHEDULE *schedule,
+			unsigned char *ivec, int *num)
+	{
+	register unsigned long v0,v1,t;
+	register int n= *num;
+	register long l=length;
+	unsigned char d[8];
+	register char *dp;
+	unsigned long ti[2];
+	unsigned char *iv;
+	int save=0;
+
+	iv=(unsigned char *)ivec;
+	n2l(iv,v0);
+	n2l(iv,v1);
+	ti[0]=v0;
+	ti[1]=v1;
+	dp=(char *)d;
+	l2n(v0,dp);
+	l2n(v1,dp);
+	while (l--)
+		{
+		if (n == 0)
+			{
+			idea_encrypt((unsigned long *)ti,schedule);
+			dp=(char *)d;
+			t=ti[0]; l2n(t,dp);
+			t=ti[1]; l2n(t,dp);
+			save++;
+			}
+		*(out++)= *(in++)^d[n];
+		n=(n+1)&0x07;
+		}
+	if (save)
+		{
+		v0=ti[0];
+		v1=ti[1];
+		iv=(unsigned char *)ivec;
+		l2n(v0,iv);
+		l2n(v1,iv);
+		}
+	t=v0=v1=ti[0]=ti[1]=0;
+	*num=n;
+	}
+
diff --git a/src/lib/libcrypto/idea/i_skey.c b/src/lib/libcrypto/idea/i_skey.c
new file mode 100644
index 0000000000..afb830964d
--- /dev/null
+++ b/src/lib/libcrypto/idea/i_skey.c
@@ -0,0 +1,164 @@
+/* crypto/idea/i_skey.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <openssl/crypto.h>
+#include <openssl/idea.h>
+#include "idea_lcl.h"
+
+static IDEA_INT inverse(unsigned int xin);
+void idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks)
+#ifdef OPENSSL_FIPS
+	{
+	fips_cipher_abort(IDEA);
+	private_idea_set_encrypt_key(key, ks);
+	}
+void private_idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks)
+#endif
+	{
+	int i;
+	register IDEA_INT *kt,*kf,r0,r1,r2;
+
+	kt= &(ks->data[0][0]);
+	n2s(key,kt[0]); n2s(key,kt[1]); n2s(key,kt[2]); n2s(key,kt[3]);
+	n2s(key,kt[4]); n2s(key,kt[5]); n2s(key,kt[6]); n2s(key,kt[7]);
+
+	kf=kt;
+	kt+=8;
+	for (i=0; i<6; i++)
+		{
+		r2= kf[1];
+		r1= kf[2];
+		*(kt++)= ((r2<<9) | (r1>>7))&0xffff;
+		r0= kf[3];
+		*(kt++)= ((r1<<9) | (r0>>7))&0xffff;
+		r1= kf[4];
+		*(kt++)= ((r0<<9) | (r1>>7))&0xffff;
+		r0= kf[5];
+		*(kt++)= ((r1<<9) | (r0>>7))&0xffff;
+		r1= kf[6];
+		*(kt++)= ((r0<<9) | (r1>>7))&0xffff;
+		r0= kf[7];
+		*(kt++)= ((r1<<9) | (r0>>7))&0xffff;
+		r1= kf[0];
+		if (i >= 5) break;
+		*(kt++)= ((r0<<9) | (r1>>7))&0xffff;
+		*(kt++)= ((r1<<9) | (r2>>7))&0xffff;
+		kf+=8;
+		}
+	}
+
+void idea_set_decrypt_key(IDEA_KEY_SCHEDULE *ek, IDEA_KEY_SCHEDULE *dk)
+	{
+	int r;
+	register IDEA_INT *fp,*tp,t;
+
+	tp= &(dk->data[0][0]);
+	fp= &(ek->data[8][0]);
+	for (r=0; r<9; r++)
+		{
+		*(tp++)=inverse(fp[0]);
+		*(tp++)=((int)(0x10000L-fp[2])&0xffff);
+		*(tp++)=((int)(0x10000L-fp[1])&0xffff);
+		*(tp++)=inverse(fp[3]);
+		if (r == 8) break;
+		fp-=6;
+		*(tp++)=fp[4];
+		*(tp++)=fp[5];
+		}
+
+	tp= &(dk->data[0][0]);
+	t=tp[1];
+	tp[1]=tp[2];
+	tp[2]=t;
+
+	t=tp[49];
+	tp[49]=tp[50];
+	tp[50]=t;
+	}
+
+/* taken directly from the 'paper' I'll have a look at it later */
+static IDEA_INT inverse(unsigned int xin)
+	{
+	long n1,n2,q,r,b1,b2,t;
+
+	if (xin == 0)
+		b2=0;
+	else
+		{
+		n1=0x10001;
+		n2=xin;
+		b2=1;
+		b1=0;
+
+		do	{
+			r=(n1%n2);
+			q=(n1-r)/n2;
+			if (r == 0)
+				{ if (b2 < 0) b2=0x10001+b2; }
+			else
+				{
+				n1=n2;
+				n2=r;
+				t=b2;
+				b2=b1-q*b2;
+				b1=t;
+				}
+			} while (r != 0);
+		}
+	return((IDEA_INT)b2);
+	}
diff --git a/src/lib/libcrypto/idea/idea.h b/src/lib/libcrypto/idea/idea.h
index 5782e54b0f..e9a1e7f1a5 100644
--- a/src/lib/libcrypto/idea/idea.h
+++ b/src/lib/libcrypto/idea/idea.h
@@ -83,6 +83,9 @@ typedef struct idea_key_st
 const char *idea_options(void);
 void idea_ecb_encrypt(const unsigned char *in, unsigned char *out,
 	IDEA_KEY_SCHEDULE *ks);
+#ifdef OPENSSL_FIPS
+void private_idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks);
+#endif
 void idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks);
 void idea_set_decrypt_key(IDEA_KEY_SCHEDULE *ek, IDEA_KEY_SCHEDULE *dk);
 void idea_cbc_encrypt(const unsigned char *in, unsigned char *out,
diff --git a/src/lib/libcrypto/idea/idea_lcl.h b/src/lib/libcrypto/idea/idea_lcl.h
new file mode 100644
index 0000000000..f3dbfa67e9
--- /dev/null
+++ b/src/lib/libcrypto/idea/idea_lcl.h
@@ -0,0 +1,215 @@
+/* crypto/idea/idea_lcl.h */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+/* The new form of this macro (check if the a*b == 0) was suggested by 
+ * Colin Plumb <colin@nyx10.cs.du.edu> */
+/* Removal of the inner if from from Wei Dai 24/4/96 */
+#define idea_mul(r,a,b,ul) \
+ul=(unsigned long)a*b; \
+if (ul != 0) \
+	{ \
+	r=(ul&0xffff)-(ul>>16); \
+	r-=((r)>>16); \
+	} \
+else \
+	r=(-(int)a-b+1); /* assuming a or b is 0 and in range */ 
+
+#ifdef undef
+#define idea_mul(r,a,b,ul,sl) \
+if (a == 0) r=(0x10001-b)&0xffff; \
+else if (b == 0) r=(0x10001-a)&0xffff; \
+else	{ \
+	ul=(unsigned long)a*b; \
+	sl=(ul&0xffff)-(ul>>16); \
+	if (sl <= 0) sl+=0x10001; \
+	r=sl; \
+	} 
+#endif
+
+/*  7/12/95 - Many thanks to Rhys Weatherley <rweather@us.oracle.com>
+ * for pointing out that I was assuming little endian
+ * byte order for all quantities what idea
+ * actually used bigendian.  No where in the spec does it mention
+ * this, it is all in terms of 16 bit numbers and even the example
+ * does not use byte streams for the input example :-(.
+ * If you byte swap each pair of input, keys and iv, the functions
+ * would produce the output as the old version :-(.
+ */
+
+/* NOTE - c is not incremented as per n2l */
+#define n2ln(c,l1,l2,n)	{ \
+			c+=n; \
+			l1=l2=0; \
+			switch (n) { \
+			case 8: l2 =((unsigned long)(*(--(c))))    ; \
+			case 7: l2|=((unsigned long)(*(--(c))))<< 8; \
+			case 6: l2|=((unsigned long)(*(--(c))))<<16; \
+			case 5: l2|=((unsigned long)(*(--(c))))<<24; \
+			case 4: l1 =((unsigned long)(*(--(c))))    ; \
+			case 3: l1|=((unsigned long)(*(--(c))))<< 8; \
+			case 2: l1|=((unsigned long)(*(--(c))))<<16; \
+			case 1: l1|=((unsigned long)(*(--(c))))<<24; \
+				} \
+			}
+
+/* NOTE - c is not incremented as per l2n */
+#define l2nn(l1,l2,c,n)	{ \
+			c+=n; \
+			switch (n) { \
+			case 8: *(--(c))=(unsigned char)(((l2)    )&0xff); \
+			case 7: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \
+			case 6: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \
+			case 5: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \
+			case 4: *(--(c))=(unsigned char)(((l1)    )&0xff); \
+			case 3: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \
+			case 2: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \
+			case 1: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \
+				} \
+			}
+
+#undef n2l
+#define n2l(c,l)        (l =((unsigned long)(*((c)++)))<<24L, \
+                         l|=((unsigned long)(*((c)++)))<<16L, \
+                         l|=((unsigned long)(*((c)++)))<< 8L, \
+                         l|=((unsigned long)(*((c)++))))
+
+#undef l2n
+#define l2n(l,c)        (*((c)++)=(unsigned char)(((l)>>24L)&0xff), \
+                         *((c)++)=(unsigned char)(((l)>>16L)&0xff), \
+                         *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \
+                         *((c)++)=(unsigned char)(((l)     )&0xff))
+
+#undef s2n
+#define s2n(l,c)	(*((c)++)=(unsigned char)(((l)     )&0xff), \
+			 *((c)++)=(unsigned char)(((l)>> 8L)&0xff))
+
+#undef n2s
+#define n2s(c,l)	(l =((IDEA_INT)(*((c)++)))<< 8L, \
+			 l|=((IDEA_INT)(*((c)++)))      )
+
+#ifdef undef
+/* NOTE - c is not incremented as per c2l */
+#define c2ln(c,l1,l2,n)	{ \
+			c+=n; \
+			l1=l2=0; \
+			switch (n) { \
+			case 8: l2 =((unsigned long)(*(--(c))))<<24; \
+			case 7: l2|=((unsigned long)(*(--(c))))<<16; \
+			case 6: l2|=((unsigned long)(*(--(c))))<< 8; \
+			case 5: l2|=((unsigned long)(*(--(c))));     \
+			case 4: l1 =((unsigned long)(*(--(c))))<<24; \
+			case 3: l1|=((unsigned long)(*(--(c))))<<16; \
+			case 2: l1|=((unsigned long)(*(--(c))))<< 8; \
+			case 1: l1|=((unsigned long)(*(--(c))));     \
+				} \
+			}
+
+/* NOTE - c is not incremented as per l2c */
+#define l2cn(l1,l2,c,n)	{ \
+			c+=n; \
+			switch (n) { \
+			case 8: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \
+			case 7: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \
+			case 6: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \
+			case 5: *(--(c))=(unsigned char)(((l2)    )&0xff); \
+			case 4: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \
+			case 3: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \
+			case 2: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \
+			case 1: *(--(c))=(unsigned char)(((l1)    )&0xff); \
+				} \
+			}
+
+#undef c2s
+#define c2s(c,l)	(l =((unsigned long)(*((c)++)))    , \
+			 l|=((unsigned long)(*((c)++)))<< 8L)
+
+#undef s2c
+#define s2c(l,c)	(*((c)++)=(unsigned char)(((l)     )&0xff), \
+			 *((c)++)=(unsigned char)(((l)>> 8L)&0xff))
+
+#undef c2l
+#define c2l(c,l)	(l =((unsigned long)(*((c)++)))     , \
+			 l|=((unsigned long)(*((c)++)))<< 8L, \
+			 l|=((unsigned long)(*((c)++)))<<16L, \
+			 l|=((unsigned long)(*((c)++)))<<24L)
+
+#undef l2c
+#define l2c(l,c)	(*((c)++)=(unsigned char)(((l)     )&0xff), \
+			 *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \
+			 *((c)++)=(unsigned char)(((l)>>16L)&0xff), \
+			 *((c)++)=(unsigned char)(((l)>>24L)&0xff))
+#endif
+
+#define E_IDEA(num) \
+	x1&=0xffff; \
+	idea_mul(x1,x1,*p,ul); p++; \
+	x2+= *(p++); \
+	x3+= *(p++); \
+	x4&=0xffff; \
+	idea_mul(x4,x4,*p,ul); p++; \
+	t0=(x1^x3)&0xffff; \
+	idea_mul(t0,t0,*p,ul); p++; \
+	t1=(t0+(x2^x4))&0xffff; \
+	idea_mul(t1,t1,*p,ul); p++; \
+	t0+=t1; \
+	x1^=t1; \
+	x4^=t0; \
+	ul=x2^t0; /* do the swap to x3 */ \
+	x2=x3^t1; \
+	x3=ul;
+
diff --git a/src/lib/libcrypto/md4/md4.h b/src/lib/libcrypto/md4/md4.h
index c3ed9b3f75..a55368a790 100644
--- a/src/lib/libcrypto/md4/md4.h
+++ b/src/lib/libcrypto/md4/md4.h
@@ -105,6 +105,9 @@ typedef struct MD4state_st
 	unsigned int num;
 	} MD4_CTX;
 
+#ifdef OPENSSL_FIPS
+int private_MD4_Init(MD4_CTX *c);
+#endif
 int MD4_Init(MD4_CTX *c);
 int MD4_Update(MD4_CTX *c, const void *data, size_t len);
 int MD4_Final(unsigned char *md, MD4_CTX *c);
diff --git a/src/lib/libcrypto/md4/md4_dgst.c b/src/lib/libcrypto/md4/md4_dgst.c
index e0c42e8596..82c2cb2d98 100644
--- a/src/lib/libcrypto/md4/md4_dgst.c
+++ b/src/lib/libcrypto/md4/md4_dgst.c
@@ -57,8 +57,9 @@
  */
 
 #include <stdio.h>
-#include "md4_locl.h"
 #include <openssl/opensslv.h>
+#include <openssl/crypto.h>
+#include "md4_locl.h"
 
 const char MD4_version[]="MD4" OPENSSL_VERSION_PTEXT;
 
@@ -70,7 +71,7 @@ const char MD4_version[]="MD4" OPENSSL_VERSION_PTEXT;
 #define INIT_DATA_C (unsigned long)0x98badcfeL
 #define INIT_DATA_D (unsigned long)0x10325476L
 
-int MD4_Init(MD4_CTX *c)
+fips_md_init(MD4)
 	{
 	memset (c,0,sizeof(*c));
 	c->A=INIT_DATA_A;
diff --git a/src/lib/libcrypto/md5/md5.h b/src/lib/libcrypto/md5/md5.h
index 4cbf84386b..541cc925fe 100644
--- a/src/lib/libcrypto/md5/md5.h
+++ b/src/lib/libcrypto/md5/md5.h
@@ -105,6 +105,9 @@ typedef struct MD5state_st
 	unsigned int num;
 	} MD5_CTX;
 
+#ifdef OPENSSL_FIPS
+int private_MD5_Init(MD5_CTX *c);
+#endif
 int MD5_Init(MD5_CTX *c);
 int MD5_Update(MD5_CTX *c, const void *data, size_t len);
 int MD5_Final(unsigned char *md, MD5_CTX *c);
diff --git a/src/lib/libcrypto/md5/md5_dgst.c b/src/lib/libcrypto/md5/md5_dgst.c
index beace632e3..265890de52 100644
--- a/src/lib/libcrypto/md5/md5_dgst.c
+++ b/src/lib/libcrypto/md5/md5_dgst.c
@@ -59,6 +59,7 @@
 #include <stdio.h>
 #include "md5_locl.h"
 #include <openssl/opensslv.h>
+#include <openssl/crypto.h>
 
 const char MD5_version[]="MD5" OPENSSL_VERSION_PTEXT;
 
@@ -70,7 +71,7 @@ const char MD5_version[]="MD5" OPENSSL_VERSION_PTEXT;
 #define INIT_DATA_C (unsigned long)0x98badcfeL
 #define INIT_DATA_D (unsigned long)0x10325476L
 
-int MD5_Init(MD5_CTX *c)
+fips_md_init(MD5)
 	{
 	memset (c,0,sizeof(*c));
 	c->A=INIT_DATA_A;
diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl
new file mode 100644
index 0000000000..6358b2750f
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-alpha.pl
@@ -0,0 +1,451 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Even though
+# loops are aggressively modulo-scheduled in respect to references to
+# Htbl and Z.hi updates for 8 cycles per byte, measured performance is
+# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic
+# scheduling "glitch," because uprofile(1) indicates uniform sample
+# distribution, as if all instruction bundles execute in 1.5 cycles.
+# Meaning that it could have been even faster, yet 12 cycles is ~60%
+# better than gcc-generated code and ~80% than code generated by vendor
+# compiler.
+
+$cnt="v0";	# $0
+$t0="t0";
+$t1="t1";
+$t2="t2";
+$Thi0="t3";	# $4
+$Tlo0="t4";
+$Thi1="t5";
+$Tlo1="t6";
+$rem="t7";	# $8
+#################
+$Xi="a0";	# $16, input argument block
+$Htbl="a1";
+$inp="a2";
+$len="a3";
+$nlo="a4";	# $20
+$nhi="a5";
+$Zhi="t8";
+$Zlo="t9";
+$Xhi="t10";	# $24
+$Xlo="t11";
+$remp="t12";
+$rem_4bit="AT";	# $28
+
+{ my $N;
+  sub loop() {
+
+	$N++;
+$code.=<<___;
+.align	4
+	extbl	$Xlo,7,$nlo
+	and	$nlo,0xf0,$nhi
+	sll	$nlo,4,$nlo
+	and	$nlo,0xf0,$nlo
+
+	addq	$nlo,$Htbl,$nlo
+	ldq	$Zlo,8($nlo)
+	addq	$nhi,$Htbl,$nhi
+	ldq	$Zhi,0($nlo)
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	lda	$cnt,6(zero)
+	extbl	$Xlo,6,$nlo
+
+	ldq	$Tlo1,8($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+	ldq	$Thi1,0($nhi)
+	srl	$Zlo,4,$Zlo
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	and	$nlo,0xf0,$nhi
+
+	xor	$Tlo1,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+	xor	$Thi1,$Zhi,$Zhi
+	and	$nlo,0xf0,$nlo
+
+	addq	$nlo,$Htbl,$nlo
+	ldq	$Tlo0,8($nlo)
+	addq	$nhi,$Htbl,$nhi
+	ldq	$Thi0,0($nlo)
+
+.Looplo$N:
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	subq	$cnt,1,$cnt
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	extbl	$Xlo,$cnt,$nlo
+
+	and	$nlo,0xf0,$nhi
+	xor	$Thi0,$Zhi,$Zhi
+	xor	$Tlo0,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	and	$nlo,0xf0,$nlo
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+	addq	$nlo,$Htbl,$nlo
+	addq	$nhi,$Htbl,$nhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	ldq	$Tlo0,8($nlo)
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	ldq	$Thi0,0($nlo)
+	bne	$cnt,.Looplo$N
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	lda	$cnt,7(zero)
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	extbl	$Xhi,$cnt,$nlo
+
+	and	$nlo,0xf0,$nhi
+	xor	$Thi0,$Zhi,$Zhi
+	xor	$Tlo0,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	and	$nlo,0xf0,$nlo
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+	addq	$nlo,$Htbl,$nlo
+	addq	$nhi,$Htbl,$nhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	ldq	$Tlo0,8($nlo)
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	ldq	$Thi0,0($nlo)
+	unop
+
+
+.Loophi$N:
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	subq	$cnt,1,$cnt
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	extbl	$Xhi,$cnt,$nlo
+
+	and	$nlo,0xf0,$nhi
+	xor	$Thi0,$Zhi,$Zhi
+	xor	$Tlo0,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	and	$nlo,0xf0,$nlo
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+	addq	$nlo,$Htbl,$nlo
+	addq	$nhi,$Htbl,$nhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	ldq	$Tlo0,8($nlo)
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	ldq	$Thi0,0($nlo)
+	bne	$cnt,.Loophi$N
+
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	srl	$Zlo,4,$Zlo
+
+	ldq	$Tlo1,8($nhi)
+	xor	$rem,$Zhi,$Zhi
+	ldq	$Thi1,0($nhi)
+	s8addq	$remp,$rem_4bit,$remp
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$t0,$Zlo,$Zlo
+
+	xor	$Tlo0,$Zlo,$Zlo
+	xor	$Thi0,$Zhi,$Zhi
+
+	and	$Zlo,0x0f,$remp
+	sll	$Zhi,60,$t0
+	srl	$Zlo,4,$Zlo
+
+	s8addq	$remp,$rem_4bit,$remp
+	xor	$rem,$Zhi,$Zhi
+
+	ldq	$rem,0($remp)
+	srl	$Zhi,4,$Zhi
+	xor	$Tlo1,$Zlo,$Zlo
+	xor	$Thi1,$Zhi,$Zhi
+	xor	$t0,$Zlo,$Zlo
+	xor	$rem,$Zhi,$Zhi
+___
+}}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set	noat
+.set	noreorder
+.globl	gcm_gmult_4bit
+.align	4
+.ent	gcm_gmult_4bit
+gcm_gmult_4bit:
+	.frame	sp,0,ra
+	.prologue 0
+
+	ldq	$Xlo,8($Xi)
+	ldq	$Xhi,0($Xi)
+
+	br	$rem_4bit,.Lpic1
+.Lpic1:	lda	$rem_4bit,rem_4bit-.Lpic1($rem_4bit)
+___
+
+	&loop();
+
+$code.=<<___;
+	srl	$Zlo,24,$t0	# byte swap
+	srl	$Zlo,8,$t1
+
+	sll	$Zlo,8,$t2
+	sll	$Zlo,24,$Zlo
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+
+	zapnot	$Zlo,0x88,$Zlo
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zlo,$t0,$Zlo
+	srl	$Zhi,24,$t0
+	srl	$Zhi,8,$t1
+
+	or	$Zlo,$t2,$Zlo
+	sll	$Zhi,8,$t2
+	sll	$Zhi,24,$Zhi
+
+	srl	$Zlo,32,$Xlo
+	sll	$Zlo,32,$Zlo
+
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+	or	$Zlo,$Xlo,$Xlo
+
+	zapnot	$Zhi,0x88,$Zhi
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zhi,$t0,$Zhi
+	or	$Zhi,$t2,$Zhi
+
+	srl	$Zhi,32,$Xhi
+	sll	$Zhi,32,$Zhi
+
+	or	$Zhi,$Xhi,$Xhi
+	stq	$Xlo,8($Xi)
+	stq	$Xhi,0($Xi)
+
+	ret	(ra)
+.end	gcm_gmult_4bit
+___
+
+$inhi="s0";
+$inlo="s1";
+
+$code.=<<___;
+.globl	gcm_ghash_4bit
+.align	4
+.ent	gcm_ghash_4bit
+gcm_ghash_4bit:
+	lda	sp,-32(sp)
+	stq	ra,0(sp)
+	stq	s0,8(sp)
+	stq	s1,16(sp)
+	.mask	0x04000600,-32
+	.frame	sp,32,ra
+	.prologue 0
+
+	ldq_u	$inhi,0($inp)
+	ldq_u	$Thi0,7($inp)
+	ldq_u	$inlo,8($inp)
+	ldq_u	$Tlo0,15($inp)
+	ldq	$Xhi,0($Xi)
+	ldq	$Xlo,8($Xi)
+
+	br	$rem_4bit,.Lpic2
+.Lpic2:	lda	$rem_4bit,rem_4bit-.Lpic2($rem_4bit)
+
+.Louter:
+	extql	$inhi,$inp,$inhi
+	extqh	$Thi0,$inp,$Thi0
+	or	$inhi,$Thi0,$inhi
+	lda	$inp,16($inp)
+
+	extql	$inlo,$inp,$inlo
+	extqh	$Tlo0,$inp,$Tlo0
+	or	$inlo,$Tlo0,$inlo
+	subq	$len,16,$len
+
+	xor	$Xlo,$inlo,$Xlo
+	xor	$Xhi,$inhi,$Xhi
+___
+
+	&loop();
+
+$code.=<<___;
+	srl	$Zlo,24,$t0	# byte swap
+	srl	$Zlo,8,$t1
+
+	sll	$Zlo,8,$t2
+	sll	$Zlo,24,$Zlo
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+
+	zapnot	$Zlo,0x88,$Zlo
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zlo,$t0,$Zlo
+	srl	$Zhi,24,$t0
+	srl	$Zhi,8,$t1
+
+	or	$Zlo,$t2,$Zlo
+	sll	$Zhi,8,$t2
+	sll	$Zhi,24,$Zhi
+
+	srl	$Zlo,32,$Xlo
+	sll	$Zlo,32,$Zlo
+	beq	$len,.Ldone
+
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+	or	$Zlo,$Xlo,$Xlo
+	ldq_u	$inhi,0($inp)
+
+	zapnot	$Zhi,0x88,$Zhi
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+	ldq_u	$Thi0,7($inp)
+
+	or	$Zhi,$t0,$Zhi
+	or	$Zhi,$t2,$Zhi
+	ldq_u	$inlo,8($inp)
+	ldq_u	$Tlo0,15($inp)
+
+	srl	$Zhi,32,$Xhi
+	sll	$Zhi,32,$Zhi
+
+	or	$Zhi,$Xhi,$Xhi
+	br	zero,.Louter
+
+.Ldone:
+	zapnot	$t0,0x11,$t0
+	zapnot	$t1,0x22,$t1
+	or	$Zlo,$Xlo,$Xlo
+
+	zapnot	$Zhi,0x88,$Zhi
+	or	$t0,$t1,$t0
+	zapnot	$t2,0x44,$t2
+
+	or	$Zhi,$t0,$Zhi
+	or	$Zhi,$t2,$Zhi
+
+	srl	$Zhi,32,$Xhi
+	sll	$Zhi,32,$Zhi
+
+	or	$Zhi,$Xhi,$Xhi
+
+	stq	$Xlo,8($Xi)
+	stq	$Xhi,0($Xi)
+
+	.set	noreorder
+	/*ldq	ra,0(sp)*/
+	ldq	s0,8(sp)
+	ldq	s1,16(sp)
+	lda	sp,32(sp)
+	ret	(ra)
+.end	gcm_ghash_4bit
+
+.align	4
+rem_4bit:
+	.quad	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+	.quad	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+	.quad	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+	.quad	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.ascii	"GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
+
diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
new file mode 100644
index 0000000000..d91586ee29
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-armv4.pl
@@ -0,0 +1,429 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+32 bytes shared table]. There is no
+# experimental performance data available yet. The only approximation
+# that can be made at this point is based on code size. Inner loop is
+# 32 instructions long and on single-issue core should execute in <40
+# cycles. Having verified that gcc 3.4 didn't unroll corresponding
+# loop, this assembler loop body was found to be ~3x smaller than
+# compiler-generated one...
+#
+# July 2010
+#
+# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on
+# Cortex A8 core and ~25 cycles per processed byte (which was observed
+# to be ~3 times faster than gcc-generated code:-)
+#
+# February 2011
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~23.5 cycles per byte.
+#
+# March 2011
+#
+# Add NEON implementation featuring polynomial multiplication, i.e. no
+# lookup tables involved. On Cortex A8 it was measured to process one
+# byte in 15 cycles or 55% faster than integer-only code.
+
+# ====================================================================
+# Note about "528B" variant. In ARM case it makes lesser sense to
+# implement it for following reasons:
+#
+# - performance improvement won't be anywhere near 50%, because 128-
+#   bit shift operation is neatly fused with 128-bit xor here, and
+#   "538B" variant would eliminate only 4-5 instructions out of 32
+#   in the inner loop (meaning that estimated improvement is ~15%);
+# - ARM-based systems are often embedded ones and extra memory
+#   consumption might be unappreciated (for so little improvement);
+#
+# Byte order [in]dependence. =========================================
+#
+# Caller is expected to maintain specific *dword* order in Htable,
+# namely with *least* significant dword of 128-bit value at *lower*
+# address. This differs completely from C code and has everything to
+# do with ldm instruction and order in which dwords are "consumed" by
+# algorithm. *Byte* order within these dwords in turn is whatever
+# *native* byte order on current platform. See gcm128.c for working
+# example...
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$Xi="r0";	# argument block
+$Htbl="r1";
+$inp="r2";
+$len="r3";
+
+$Zll="r4";	# variables
+$Zlh="r5";
+$Zhl="r6";
+$Zhh="r7";
+$Tll="r8";
+$Tlh="r9";
+$Thl="r10";
+$Thh="r11";
+$nlo="r12";
+################# r13 is stack pointer
+$nhi="r14";
+################# r15 is program counter
+
+$rem_4bit=$inp;	# used in gcm_gmult_4bit
+$cnt=$len;
+
+sub Zsmash() {
+  my $i=12;
+  my @args=@_;
+  for ($Zll,$Zlh,$Zhl,$Zhh) {
+    $code.=<<___;
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	$_,$_
+	str	$_,[$Xi,#$i]
+#elif defined(__ARMEB__)
+	str	$_,[$Xi,#$i]
+#else
+	mov	$Tlh,$_,lsr#8
+	strb	$_,[$Xi,#$i+3]
+	mov	$Thl,$_,lsr#16
+	strb	$Tlh,[$Xi,#$i+2]
+	mov	$Thh,$_,lsr#24
+	strb	$Thl,[$Xi,#$i+1]
+	strb	$Thh,[$Xi,#$i]
+#endif
+___
+    $code.="\t".shift(@args)."\n";
+    $i-=4;
+  }
+}
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code	32
+
+.type	rem_4bit,%object
+.align	5
+rem_4bit:
+.short	0x0000,0x1C20,0x3840,0x2460
+.short	0x7080,0x6CA0,0x48C0,0x54E0
+.short	0xE100,0xFD20,0xD940,0xC560
+.short	0x9180,0x8DA0,0xA9C0,0xB5E0
+.size	rem_4bit,.-rem_4bit
+
+.type	rem_4bit_get,%function
+rem_4bit_get:
+	sub	$rem_4bit,pc,#8
+	sub	$rem_4bit,$rem_4bit,#32	@ &rem_4bit
+	b	.Lrem_4bit_got
+	nop
+.size	rem_4bit_get,.-rem_4bit_get
+
+.global	gcm_ghash_4bit
+.type	gcm_ghash_4bit,%function
+gcm_ghash_4bit:
+	sub	r12,pc,#8
+	add	$len,$inp,$len		@ $len to point at the end
+	stmdb	sp!,{r3-r11,lr}		@ save $len/end too
+	sub	r12,r12,#48		@ &rem_4bit
+
+	ldmia	r12,{r4-r11}		@ copy rem_4bit ...
+	stmdb	sp!,{r4-r11}		@ ... to stack
+
+	ldrb	$nlo,[$inp,#15]
+	ldrb	$nhi,[$Xi,#15]
+.Louter:
+	eor	$nlo,$nlo,$nhi
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	mov	$cnt,#14
+
+	add	$Zhh,$Htbl,$nlo,lsl#4
+	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
+	add	$Thh,$Htbl,$nhi
+	ldrb	$nlo,[$inp,#14]
+
+	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	add	$nhi,$nhi,$nhi
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrh	$Tll,[sp,$nhi]		@ rem_4bit[rem]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	ldrb	$nhi,[$Xi,#14]
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	eor	$nlo,$nlo,$nhi
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16
+
+.Linner:
+	add	$Thh,$Htbl,$nlo,lsl#4
+	and	$nlo,$Zll,#0xf		@ rem
+	subs	$cnt,$cnt,#1
+	add	$nlo,$nlo,$nlo
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	ldrh	$Tll,[sp,$nlo]		@ rem_4bit[rem]
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	ldrplb	$nlo,[$inp,$cnt]
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	add	$nhi,$nhi,$nhi
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrplb	$Tll,[$Xi,$cnt]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	ldrh	$Tlh,[sp,$nhi]
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eorpl	$nlo,$nlo,$Tll
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	andpl	$nhi,$nlo,#0xf0
+	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tlh,lsl#16	@ ^= rem_4bit[rem]
+	bpl	.Linner
+
+	ldr	$len,[sp,#32]		@ re-load $len/end
+	add	$inp,$inp,#16
+	mov	$nhi,$Zll
+___
+	&Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+$code.=<<___;
+	bne	.Louter
+
+	add	sp,sp,#36
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	gcm_ghash_4bit,.-gcm_ghash_4bit
+
+.global	gcm_gmult_4bit
+.type	gcm_gmult_4bit,%function
+gcm_gmult_4bit:
+	stmdb	sp!,{r4-r11,lr}
+	ldrb	$nlo,[$Xi,#15]
+	b	rem_4bit_get
+.Lrem_4bit_got:
+	and	$nhi,$nlo,#0xf0
+	and	$nlo,$nlo,#0x0f
+	mov	$cnt,#14
+
+	add	$Zhh,$Htbl,$nlo,lsl#4
+	ldmia	$Zhh,{$Zll-$Zhh}	@ load Htbl[nlo]
+	ldrb	$nlo,[$Xi,#14]
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	add	$nhi,$nhi,$nhi
+	eor	$Zll,$Tll,$Zll,lsr#4
+	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	and	$nhi,$nlo,#0xf0
+	eor	$Zhh,$Zhh,$Tll,lsl#16
+	and	$nlo,$nlo,#0x0f
+
+.Loop:
+	add	$Thh,$Htbl,$nlo,lsl#4
+	and	$nlo,$Zll,#0xf		@ rem
+	subs	$cnt,$cnt,#1
+	add	$nlo,$nlo,$nlo
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nlo]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	ldrh	$Tll,[$rem_4bit,$nlo]	@ rem_4bit[rem]
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	ldrplb	$nlo,[$Xi,$cnt]
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+
+	add	$Thh,$Htbl,$nhi
+	and	$nhi,$Zll,#0xf		@ rem
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	add	$nhi,$nhi,$nhi
+	ldmia	$Thh,{$Tll-$Thh}	@ load Htbl[nhi]
+	eor	$Zll,$Tll,$Zll,lsr#4
+	eor	$Zll,$Zll,$Zlh,lsl#28
+	eor	$Zlh,$Tlh,$Zlh,lsr#4
+	ldrh	$Tll,[$rem_4bit,$nhi]	@ rem_4bit[rem]
+	eor	$Zlh,$Zlh,$Zhl,lsl#28
+	eor	$Zhl,$Thl,$Zhl,lsr#4
+	eor	$Zhl,$Zhl,$Zhh,lsl#28
+	eor	$Zhh,$Thh,$Zhh,lsr#4
+	andpl	$nhi,$nlo,#0xf0
+	andpl	$nlo,$nlo,#0x0f
+	eor	$Zhh,$Zhh,$Tll,lsl#16	@ ^= rem_4bit[rem]
+	bpl	.Loop
+___
+	&Zsmash();
+$code.=<<___;
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
+	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
+.size	gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+{
+my $cnt=$Htbl;	# $Htbl is used once in the very beginning
+
+my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
+my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
+
+# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
+# in Zo. Or should I say "top bit", because GHASH is specified in
+# reverse bit order? Otherwise straightforward 128-bt H by one input
+# byte multiplication and modulo-reduction, times 16.
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.global	gcm_gmult_neon
+.type	gcm_gmult_neon,%function
+.align	4
+gcm_gmult_neon:
+	sub		$Htbl,#16		@ point at H in GCM128_CTX
+	vld1.64		`&Dhi("$IN")`,[$Xi,:64]!@ load Xi
+	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
+	vld1.64		`&Dlo("$IN")`,[$Xi,:64]!
+	vshr.u64	$mod,#32
+	vldmia		$Htbl,{$Hhi-$Hlo}	@ load H
+	veor		$zero,$zero
+#ifdef __ARMEL__
+	vrev64.8	$IN,$IN
+#endif
+	veor		$Qpost,$Qpost
+	veor		$R,$R
+	mov		$cnt,#16
+	veor		$Z,$Z
+	mov		$len,#16
+	veor		$Zo,$Zo
+	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
+	b		.Linner_neon
+.size	gcm_gmult_neon,.-gcm_gmult_neon
+
+.global	gcm_ghash_neon
+.type	gcm_ghash_neon,%function
+.align	4
+gcm_ghash_neon:
+	vld1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ load Xi
+	vmov.i32	$mod,#0xe1		@ our irreducible polynomial
+	vld1.64		`&Dlo("$Z")`,[$Xi,:64]!
+	vshr.u64	$mod,#32
+	vldmia		$Xi,{$Hhi-$Hlo}		@ load H
+	veor		$zero,$zero
+	nop
+#ifdef __ARMEL__
+	vrev64.8	$Z,$Z
+#endif
+.Louter_neon:
+	vld1.64		`&Dhi($IN)`,[$inp]!	@ load inp
+	veor		$Qpost,$Qpost
+	vld1.64		`&Dlo($IN)`,[$inp]!
+	veor		$R,$R
+	mov		$cnt,#16
+#ifdef __ARMEL__
+	vrev64.8	$IN,$IN
+#endif
+	veor		$Zo,$Zo
+	veor		$IN,$Z			@ inp^=Xi
+	veor		$Z,$Z
+	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
+.Linner_neon:
+	subs		$cnt,$cnt,#1
+	vmull.p8	$Qlo,$Hlo,$xi		@ H.lo�Xi[i]
+	vmull.p8	$Qhi,$Hhi,$xi		@ H.hi�Xi[i]
+	vext.8		$IN,$zero,#1		@ IN>>=8
+
+	veor		$Z,$Qpost		@ modulo-scheduled part
+	vshl.i64	`&Dlo("$R")`,#48
+	vdup.8		$xi,`&Dlo("$IN")`[0]	@ broadcast lowest byte
+	veor		$T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
+
+	veor		`&Dhi("$Z")`,`&Dlo("$R")`
+	vuzp.8		$Qlo,$Qhi
+	vsli.8		$Zo,$T,#1		@ compose the "carry" byte
+	vext.8		$Z,$zero,#1		@ Z>>=8
+
+	vmull.p8	$R,$Zo,$mod		@ "carry"�0xe1
+	vshr.u8		$Zo,$T,#7		@ save Z's bottom bit
+	vext.8		$Qpost,$Qlo,$zero,#1	@ Qlo>>=8
+	veor		$Z,$Qhi
+	bne		.Linner_neon
+
+	veor		$Z,$Qpost		@ modulo-scheduled artefact
+	vshl.i64	`&Dlo("$R")`,#48
+	veor		`&Dhi("$Z")`,`&Dlo("$R")`
+
+	@ finalization, normalize Z:Zo
+	vand		$Zo,$mod		@ suffices to mask the bit
+	vshr.u64	`&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
+	vshl.i64	$Z,#1
+	subs		$len,#16
+	vorr		$Z,`&Q("$Zo")`		@ Z=Z:Zo<<1
+	bne		.Louter_neon
+
+#ifdef __ARMEL__
+	vrev64.8	$Z,$Z
+#endif
+	sub		$Xi,#16	
+	vst1.64		`&Dhi("$Z")`,[$Xi,:64]!	@ write out Xi
+	vst1.64		`&Dlo("$Z")`,[$Xi,:64]
+
+	bx	lr
+.size	gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align  2
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+print $code;
+close STDOUT; # enforce flush
diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl
new file mode 100755
index 0000000000..0354c95444
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-ia64.pl
@@ -0,0 +1,463 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
+# GHASH performance was measured to be 6.67 cycles per processed byte
+# on Itanium 2, which is >90% better than Microsoft compiler generated
+# code. To anchor to something else sha1-ia64.pl module processes one
+# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per
+# byte.
+
+# September 2010
+#
+# It was originally thought that it makes lesser sense to implement
+# "528B" variant on Itanium 2 for following reason. Because number of
+# functional units is naturally limited, it appeared impossible to
+# implement "528B" loop in 4 cycles, only in 5. This would mean that
+# theoretically performance improvement couldn't be more than 20%.
+# But occasionally you prove yourself wrong:-) I figured out a way to
+# fold couple of instructions and having freed yet another instruction
+# slot by unrolling the loop... Resulting performance is 4.45 cycles
+# per processed byte and 50% better than "256B" version. On original
+# Itanium performance should remain the same as the "256B" version,
+# i.e. ~8.5 cycles.
+
+$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
+
+if ($^O eq "hpux") {
+    $ADDP="addp4";
+    for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
+} else { $ADDP="add"; }
+for (@ARGV)  {  $big_endian=1 if (/\-DB_ENDIAN/);
+                $big_endian=0 if (/\-DL_ENDIAN/);  }
+if (!defined($big_endian))
+             {  $big_endian=(unpack('L',pack('N',1))==1);  }
+
+sub loop() {
+my $label=shift;
+my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp
+
+# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e.
+# in scalable manner;-) Naturally assuming data in L1 cache...
+# Special note about 'dep' instruction, which is used to construct
+# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128
+# bytes boundary and lower 7 bits of its address are guaranteed to
+# be zero.
+$code.=<<___;
+$label:
+{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
+	(p19)	dep	rem=Zlo,rem_4bitp,3,4	}
+{ .mfi;	(p19)	xor	Zhi=Zhi,Hhi
+	($p17)	xor	xi[1]=xi[1],in[1]	};;
+{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
+	(p19)	shrp	Zlo=Zhi,Zlo,4		}
+{ .mfi;	(p19)	ld8	rem=[rem]
+	(p18)	and	Hi[1]=mask0xf0,xi[2]	};;
+{ .mmi;	($p16)	ld1	in[0]=[inp],-1
+	(p18)	xor	Zlo=Zlo,Hlo
+	(p19)	shr.u	Zhi=Zhi,4		}
+{ .mib;	(p19)	xor	Hhi=Hhi,rem
+	(p18)	add	Hi[1]=Htbl,Hi[1]	};;
+
+{ .mfi;	(p18)	ld8	Hlo=[Hi[1]],-8
+	(p18)	dep	rem=Zlo,rem_4bitp,3,4	}
+{ .mfi;	(p17)	shladd	Hi[0]=xi[1],4,r0
+	(p18)	xor	Zhi=Zhi,Hhi		};;
+{ .mfi;	(p18)	ld8	Hhi=[Hi[1]]
+	(p18)	shrp	Zlo=Zhi,Zlo,4		}
+{ .mfi;	(p18)	ld8	rem=[rem]
+	(p17)	and	Hi[0]=mask0xf0,Hi[0]	};;
+{ .mmi;	(p16)	ld1	xi[0]=[Xi],-1
+	(p18)	xor	Zlo=Zlo,Hlo
+	(p18)	shr.u	Zhi=Zhi,4		}
+{ .mib;	(p18)	xor	Hhi=Hhi,rem
+	(p17)	add	Hi[0]=Htbl,Hi[0]
+	br.ctop.sptk	$label			};;
+___
+}
+
+$code=<<___;
+.explicit
+.text
+
+prevfs=r2;	prevlc=r3;	prevpr=r8;
+mask0xf0=r21;
+rem=r22;	rem_4bitp=r23;
+Xi=r24;		Htbl=r25;
+inp=r26;	end=r27;
+Hhi=r28;	Hlo=r29;
+Zhi=r30;	Zlo=r31;
+
+.align	128
+.skip	16					// aligns loop body
+.global	gcm_gmult_4bit#
+.proc	gcm_gmult_4bit#
+gcm_gmult_4bit:
+	.prologue
+{ .mmi;	.save	ar.pfs,prevfs
+	alloc	prevfs=ar.pfs,2,6,0,8
+	$ADDP	Xi=15,in0			// &Xi[15]
+	mov	rem_4bitp=ip		}
+{ .mii;	$ADDP	Htbl=8,in1			// &Htbl[0].lo
+	.save	ar.lc,prevlc
+	mov	prevlc=ar.lc
+	.save	pr,prevpr
+	mov	prevpr=pr		};;
+
+	.body
+	.rotr	in[3],xi[3],Hi[2]
+
+{ .mib;	ld1	xi[2]=[Xi],-1			// Xi[15]
+	mov	mask0xf0=0xf0
+	brp.loop.imp	.Loop1,.Lend1-16};;
+{ .mmi;	ld1	xi[1]=[Xi],-1			// Xi[14]
+					};;
+{ .mii;	shladd	Hi[1]=xi[2],4,r0
+	mov	pr.rot=0x7<<16
+	mov	ar.lc=13		};;
+{ .mii;	and	Hi[1]=mask0xf0,Hi[1]
+	mov	ar.ec=3
+	xor	Zlo=Zlo,Zlo		};;
+{ .mii;	add	Hi[1]=Htbl,Hi[1]		// &Htbl[nlo].lo
+	add	rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp
+	xor	Zhi=Zhi,Zhi		};;
+___
+	&loop	(".Loop1",1);
+$code.=<<___;
+.Lend1:
+{ .mib;	xor	Zhi=Zhi,Hhi		};;	// modulo-scheduling artefact
+{ .mib;	mux1	Zlo=Zlo,\@rev		};;
+{ .mib;	mux1	Zhi=Zhi,\@rev		};;
+{ .mmi;	add	Hlo=9,Xi;;			// ;; is here to prevent
+	add	Hhi=1,Xi		};;	// pipeline flush on Itanium
+{ .mib;	st8	[Hlo]=Zlo
+	mov	pr=prevpr,0x1ffff	};;
+{ .mib;	st8	[Hhi]=Zhi
+	mov	ar.lc=prevlc
+	br.ret.sptk.many	b0	};;
+.endp	gcm_gmult_4bit#
+___
+
+######################################################################
+# "528B" (well, "512B" actualy) streamed GHASH
+#
+$Xip="in0";
+$Htbl="in1";
+$inp="in2";
+$len="in3";
+$rem_8bit="loc0";
+$mask0xff="loc1";
+($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum");
+
+sub load_htable() {
+    for (my $i=0;$i<8;$i++) {
+	$code.=<<___;
+{ .mmi;	ld8	r`16+2*$i+1`=[r8],16		// Htable[$i].hi
+	ld8	r`16+2*$i`=[r9],16	}	// Htable[$i].lo
+{ .mmi;	ldf8	f`32+2*$i+1`=[r10],16		// Htable[`8+$i`].hi
+	ldf8	f`32+2*$i`=[r11],16		// Htable[`8+$i`].lo
+___
+	$code.=shift	if (($i+$#_)==7);
+	$code.="\t};;\n"
+    }
+}
+
+$code.=<<___;
+prevsp=r3;
+
+.align	32
+.skip	16					// aligns loop body
+.global	gcm_ghash_4bit#
+.proc	gcm_ghash_4bit#
+gcm_ghash_4bit:
+	.prologue
+{ .mmi;	.save	ar.pfs,prevfs
+	alloc	prevfs=ar.pfs,4,2,0,0
+	.vframe	prevsp
+	mov	prevsp=sp
+	mov	$rem_8bit=ip		};;
+	.body
+{ .mfi;	$ADDP	r8=0+0,$Htbl
+	$ADDP	r9=0+8,$Htbl		}
+{ .mfi;	$ADDP	r10=128+0,$Htbl
+	$ADDP	r11=128+8,$Htbl		};;
+___
+	&load_htable(
+	"	$ADDP	$Xip=15,$Xip",		# &Xi[15]
+	"	$ADDP	$len=$len,$inp",	# &inp[len]
+	"	$ADDP	$inp=15,$inp",		# &inp[15]
+	"	mov	$mask0xff=0xff",
+	"	add	sp=-512,sp",
+	"	andcm	sp=sp,$mask0xff",	# align stack frame
+	"	add	r14=0,sp",
+	"	add	r15=8,sp");
+$code.=<<___;
+{ .mmi;	$sum	1<<1				// go big-endian
+	add	r8=256+0,sp
+	add	r9=256+8,sp		}
+{ .mmi;	add	r10=256+128+0,sp
+	add	r11=256+128+8,sp
+	add	$len=-17,$len		};;
+___
+for($i=0;$i<8;$i++) {	# generate first half of Hshr4[]
+my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1));
+$code.=<<___;
+{ .mmi;	st8	[r8]=$rlo,16			// Htable[$i].lo
+	st8	[r9]=$rhi,16			// Htable[$i].hi
+	shrp	$rlo=$rhi,$rlo,4	}//;;
+{ .mmi;	stf8	[r10]=f`32+2*$i`,16		// Htable[`8+$i`].lo
+	stf8	[r11]=f`32+2*$i+1`,16		// Htable[`8+$i`].hi
+	shr.u	$rhi=$rhi,4		};;
+{ .mmi;	st8	[r14]=$rlo,16			// Htable[$i].lo>>4
+	st8	[r15]=$rhi,16		}//;;	// Htable[$i].hi>>4
+___
+}
+$code.=<<___;
+{ .mmi;	ld8	r16=[r8],16			// Htable[8].lo
+	ld8	r17=[r9],16		};;	// Htable[8].hi
+{ .mmi;	ld8	r18=[r8],16			// Htable[9].lo
+	ld8	r19=[r9],16		}	// Htable[9].hi
+{ .mmi;	rum	1<<5				// clear um.mfh
+	shrp	r16=r17,r16,4		};;
+___
+for($i=0;$i<6;$i++) {	# generate second half of Hshr4[]
+$code.=<<___;
+{ .mmi;	ld8	r`20+2*$i`=[r8],16		// Htable[`10+$i`].lo
+	ld8	r`20+2*$i+1`=[r9],16		// Htable[`10+$i`].hi
+	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
+{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
+	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
+	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
+___
+}
+$code.=<<___;
+{ .mmi;	shr.u	r`16+2*$i+1`=r`16+2*$i+1`,4	};;
+{ .mmi;	st8	[r14]=r`16+2*$i`,16		// Htable[`8+$i`].lo>>4
+	st8	[r15]=r`16+2*$i+1`,16		// Htable[`8+$i`].hi>>4
+	shrp	r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4	}
+{ .mmi;	add	$Htbl=256,sp			// &Htable[0]
+	add	$rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit
+	shr.u	r`18+2*$i+1`=r`18+2*$i+1`,4	};;
+{ .mmi;	st8	[r14]=r`18+2*$i`		// Htable[`8+$i`].lo>>4
+	st8	[r15]=r`18+2*$i+1`	}	// Htable[`8+$i`].hi>>4
+___
+
+$in="r15";
+@xi=("r16","r17");
+@rem=("r18","r19");
+($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25");
+($Atbl,$Btbl)=("r26","r27");
+
+$code.=<<___;	# (p16)
+{ .mmi;	ld1	$in=[$inp],-1			//(p16) *inp--
+	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
+	cmp.eq	p0,p6=r0,r0		};;	//	clear p6
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+
+$code.=<<___;	# (p16),(p17)
+{ .mmi;	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
+	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
+{ .mii;	ld1	$in=[$inp],-1			//(p16) *inp--
+	dep	$Atbl=$xi[1],$Htbl,4,4		//(p17) &Htable[nlo].lo
+	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
+.align	32
+.LOOP:
+{ .mmi;
+(p6)	st8	[$Xip]=$Zhi,13
+	xor	$Zlo=$Zlo,$Zlo
+	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi].lo
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+
+$code.=<<___;	# (p16),(p17),(p18)
+{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
+	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
+{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
+	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
+{ .mfi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
+	xor	$Zlo=$Zlo,$Alo		};;	//(p18) Z.lo^=Htable[nlo].lo
+{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+	ld1	$in=[$inp],-1		}	//(p16) *inp--
+{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
+	mov	$Zhi=$Ahi			//(p18) Z.hi^=Htable[nlo].hi
+	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
+{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
+	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
+	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+
+for ($i=1;$i<14;$i++) {
+# Above and below fragments are derived from this one by removing
+# unsuitable (p??) instructions.
+$code.=<<___;	# (p16),(p17),(p18),(p19)
+{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
+	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
+{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
+	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
+	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
+{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
+	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
+	dep	$Atbl=$xi[1],$Htbl,4,4	}	//(p17) &Htable[nlo].lo
+{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
+	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
+	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+	ld1	$in=[$inp],-1			//(p16) *inp--
+	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
+{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
+	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
+	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
+{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
+	ld1	$xi[0]=[$Xip],-1		//(p16) *Xi--
+	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
+	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+}
+
+$code.=<<___;	# (p17),(p18),(p19)
+{ .mmi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
+	ld8	$rem[0]=[$Btbl],-256		//(p18) Htable[nhi].lo,&Hshr4[nhi].lo
+	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
+{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
+	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
+	xor	$xi[1]=$xi[1],$in	};;	//(p17) xi=$xi[i]^inp[i]
+{ .mmi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
+	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
+	dep	$Atbl=$xi[1],$Htbl,4,4	};;	//(p17) &Htable[nlo].lo
+{ .mmi;	shladd	$rem[0]=$rem[0],4,r0		//(p18) Htable[nhi].lo<<4
+	xor	$Zlo=$Zlo,$Alo			//(p18) Z.lo^=Htable[nlo].lo
+	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
+{ .mmi;	ld8	$Blo=[$Btbl],8			//(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi
+	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
+{ .mmi;	xor	$rem[0]=$rem[0],$Zlo		//(p18) Z.lo^(Htable[nhi].lo<<4)
+	xor	$Zhi=$Zhi,$Ahi			//(p18) Z.hi^=Htable[nlo].hi
+	and	$xi[1]=-16,$xi[1]	};;	//(p17) nhi=xi&0xf0
+{ .mmi;	ld8	$Bhi=[$Btbl]			//(p18) Hshr4[nhi].hi
+	shrp	$Zlo=$Zhi,$Zlo,8	}	//(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8)
+{ .mmi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
+	add	$Btbl=$xi[1],$Htbl	};;	//(p17) &Htable[nhi]
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+
+$code.=<<___;	# (p18),(p19)
+{ .mfi;	ld8	$Alo=[$Atbl],8			//(p18) Htable[nlo].lo,&Htable[nlo].hi
+	shr.u	$Zhi=$Zhi,8		}	//(p19) Z.hi>>=8
+{ .mfi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
+	xor	$Zlo=$Zlo,$Blo		};;	//(p19) Z.lo^=Hshr4[nhi].lo
+{ .mfi;	ld8	$Ahi=[$Atbl]			//(p18) Htable[nlo].hi
+	xor	$Zlo=$Zlo,$Alo		}	//(p18) Z.lo^=Htable[nlo].lo
+{ .mfi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
+	xor	$Zhi=$Zhi,$Bhi		};;	//(p19) Z.hi^=Hshr4[nhi].hi
+{ .mfi;	ld8	$Blo=[$Btbl],8			//(p18) Htable[nhi].lo,&Htable[nhi].hi
+	shl	$rem[1]=$rem[1],48	}	//(p19) rem_8bit[rem]<<48
+{ .mfi;	shladd	$rem[0]=$Zlo,4,r0		//(p18) Z.lo<<4
+	xor	$Zhi=$Zhi,$Ahi		};;	//(p18) Z.hi^=Htable[nlo].hi
+{ .mfi;	ld8	$Bhi=[$Btbl]			//(p18) Htable[nhi].hi
+	shrp	$Zlo=$Zhi,$Zlo,4	}	//(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4)
+{ .mfi;	and	$rem[0]=$rem[0],$mask0xff	//(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff
+	xor	$Zhi=$Zhi,$rem[1]	};;	//(p19) Z.hi^=rem_8bit[rem]<<48
+___
+push (@xi,shift(@xi)); push (@rem,shift(@rem));	# "rotate" registers
+
+$code.=<<___;	# (p19)
+{ .mmi;	cmp.ltu	p6,p0=$inp,$len
+	add	$inp=32,$inp
+	shr.u	$Zhi=$Zhi,4		}	//(p19) Z.hi>>=4
+{ .mmi;	shladd	$rem[1]=$rem[1],1,$rem_8bit	//(p19) &rem_8bit[rem]
+	xor	$Zlo=$Zlo,$Blo			//(p19) Z.lo^=Hshr4[nhi].lo
+	add	$Xip=9,$Xip		};;	//	&Xi.lo
+{ .mmi;	ld2	$rem[1]=[$rem[1]]		//(p19) rem_8bit[rem]
+(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
+(p6)	extr.u	$xi[1]=$Zlo,8,8		}	//[p17] Xi[14]
+{ .mmi;	xor	$Zhi=$Zhi,$Bhi			//(p19) Z.hi^=Hshr4[nhi].hi
+(p6)	and	$xi[0]=$Zlo,$mask0xff	};;	//[p16] Xi[15]
+{ .mmi;	st8	[$Xip]=$Zlo,-8
+(p6)	xor	$xi[0]=$xi[0],$in		//[p17] xi=$xi[i]^inp[i]
+	shl	$rem[1]=$rem[1],48	};;	//(p19) rem_8bit[rem]<<48
+{ .mmi;
+(p6)	ld1	$in=[$inp],-1			//[p16] *inp--
+	xor	$Zhi=$Zhi,$rem[1]		//(p19) Z.hi^=rem_8bit[rem]<<48
+(p6)	dep	$Atbl=$xi[0],$Htbl,4,4	}	//[p17] &Htable[nlo].lo
+{ .mib;
+(p6)	and	$xi[0]=-16,$xi[0]		//[p17] nhi=xi&0xf0
+(p6)	br.cond.dptk.many	.LOOP	};;
+
+{ .mib;	st8	[$Xip]=$Zhi		};;
+{ .mib;	$rum	1<<1				// return to little-endian
+	.restore	sp
+	mov	sp=prevsp
+	br.ret.sptk.many	b0	};;
+.endp	gcm_ghash_4bit#
+___
+$code.=<<___;
+.align	128
+.type	rem_4bit#,\@object
+rem_4bit:
+        data8	0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+        data8	0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+        data8	0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+        data8	0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.size	rem_4bit#,128
+.type	rem_8bit#,\@object
+rem_8bit:
+	data1	0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E
+	data1	0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E
+	data1	0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E
+	data1	0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E
+	data1	0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E
+	data1	0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E
+	data1	0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E
+	data1	0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E
+	data1	0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE
+	data1	0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE
+	data1	0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE
+	data1	0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE
+	data1	0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E
+	data1	0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E
+	data1	0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE
+	data1	0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE
+	data1	0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E
+	data1	0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E
+	data1	0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E
+	data1	0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E
+	data1	0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E
+	data1	0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E
+	data1	0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E
+	data1	0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E
+	data1	0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE
+	data1	0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE
+	data1	0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE
+	data1	0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE
+	data1	0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E
+	data1	0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E
+	data1	0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE
+	data1	0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE
+.size	rem_8bit#,512
+stringz	"GHASH for IA64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm      if ($big_endian);
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl
new file mode 100644
index 0000000000..8c7454ee93
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-parisc.pl
@@ -0,0 +1,730 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# April 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
+# it processes one byte in 19.6 cycles, which is more than twice as
+# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
+# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
+# processed byte. This is ~2.2x faster than 64-bit code generated by
+# vendor compiler (which used to be very hard to beat:-).
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+	$NREGS		=6;
+} else {
+	$LEVEL		="1.0";	#"\n\t.ALLOW\t2.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+	$NREGS		=11;
+}
+
+$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker
+				#                 [+ argument transfer]
+
+################# volatile registers
+$Xi="%r26";	# argument block
+$Htbl="%r25";
+$inp="%r24";
+$len="%r23";
+$Hhh=$Htbl;	# variables
+$Hll="%r22";
+$Zhh="%r21";
+$Zll="%r20";
+$cnt="%r19";
+$rem_4bit="%r28";
+$rem="%r29";
+$mask0xf0="%r31";
+
+################# preserved registers
+$Thh="%r1";
+$Tll="%r2";
+$nlo="%r3";
+$nhi="%r4";
+$byte="%r5";
+if ($SIZE_T==4) {
+	$Zhl="%r6";
+	$Zlh="%r7";
+	$Hhl="%r8";
+	$Hlh="%r9";
+	$Thl="%r10";
+	$Tlh="%r11";
+}
+$rem2="%r6";	# used in PA-RISC 2.0 code
+
+$code.=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
+	.ALIGN	64
+gcm_gmult_4bit
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
+	blr	%r0,$rem_4bit
+	ldi	3,$rem
+L\$pic_gmult
+	andcm	$rem_4bit,$rem,$rem_4bit
+	addl	$inp,$len,$len
+	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
+	ldi	0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+	ldi	31,$rem
+	mtctl	$rem,%cr11
+	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
+	b	L\$parisc1_gmult
+	nop
+___
+
+$code.=<<___;
+	ldb	15($Xi),$nlo
+	ldo	8($Htbl),$Hll
+
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+
+	ldd	$nlo($Hll),$Zll
+	ldd	$nlo($Hhh),$Zhh
+
+	depd,z	$Zll,60,4,$rem
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldb	14($Xi),$nlo
+
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+	b	L\$oop_gmult_pa2
+	ldi	13,$cnt
+
+	.ALIGN	8
+L\$oop_gmult_pa2
+	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
+	depd,z	$Zll,60,4,$rem
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nlo($Hll),$Tll
+	ldd	$nlo($Hhh),$Thh
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+
+	xor	$rem,$Zhh,$Zhh
+	depd,z	$Zll,60,4,$rem
+	ldbx	$cnt($Xi),$nlo
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+	ldd	$rem($rem_4bit),$rem
+
+	xor	$Tll,$Zll,$Zll
+	addib,uv -1,$cnt,L\$oop_gmult_pa2
+	xor	$Thh,$Zhh,$Zhh
+
+	xor	$rem,$Zhh,$Zhh
+	depd,z	$Zll,60,4,$rem
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nlo($Hll),$Tll
+	ldd	$nlo($Hhh),$Thh
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+
+	xor	$rem,$Zhh,$Zhh
+	depd,z	$Zll,60,4,$rem
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+
+	xor	$rem,$Zhh,$Zhh
+	std	$Zll,8($Xi)
+	std	$Zhh,0($Xi)
+___
+
+$code.=<<___ if ($SIZE_T==4);
+	b	L\$done_gmult
+	nop
+
+L\$parisc1_gmult
+	ldb	15($Xi),$nlo
+	ldo	12($Htbl),$Hll
+	ldo	8($Htbl),$Hlh
+	ldo	4($Htbl),$Hhl
+
+	and	$mask0xf0,$nlo,$nhi
+	zdep	$nlo,27,4,$nlo
+
+	ldwx	$nlo($Hll),$Zll
+	ldwx	$nlo($Hlh),$Zlh
+	ldwx	$nlo($Hhl),$Zhl
+	ldwx	$nlo($Hhh),$Zhh
+	zdep	$Zll,28,4,$rem
+	ldb	14($Xi),$nlo
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	extru	$Zhh,27,28,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	xor	$rem,$Zhh,$Zhh
+	and	$mask0xf0,$nlo,$nhi
+	zdep	$nlo,27,4,$nlo
+
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nlo($Hll),$Tll
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nlo($Hlh),$Tlh
+	xor	$Thl,$Zhl,$Zhl
+	b	L\$oop_gmult_pa1
+	ldi	13,$cnt
+
+	.ALIGN	8
+L\$oop_gmult_pa1
+	zdep	$Zll,28,4,$rem
+	ldwx	$nlo($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nlo($Hhh),$Thh
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	ldbx	$cnt($Xi),$nlo
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	extru	$Zhh,27,28,$Zhh
+	xor	$Thl,$Zhl,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	xor	$rem,$Zhh,$Zhh
+	zdep	$Zll,28,4,$rem
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	and	$mask0xf0,$nlo,$nhi
+	extru	$Zhh,27,28,$Zhh
+	zdep	$nlo,27,4,$nlo
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nlo($Hll),$Tll
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nlo($Hlh),$Tlh
+	xor	$rem,$Zhh,$Zhh
+	addib,uv -1,$cnt,L\$oop_gmult_pa1
+	xor	$Thl,$Zhl,$Zhl
+
+	zdep	$Zll,28,4,$rem
+	ldwx	$nlo($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nlo($Hhh),$Thh
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	extru	$Zhh,27,28,$Zhh
+	xor	$rem,$Zhh,$Zhh
+	xor	$Thl,$Zhl,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	zdep	$Zll,28,4,$rem
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	extru	$Zhh,27,28,$Zhh
+	xor	$Tll,$Zll,$Zll
+	xor	$Tlh,$Zlh,$Zlh
+	xor	$rem,$Zhh,$Zhh
+	stw	$Zll,12($Xi)
+	xor	$Thl,$Zhl,$Zhl
+	stw	$Zlh,8($Xi)
+	xor	$Thh,$Zhh,$Zhh
+	stw	$Zhl,4($Xi)
+	stw	$Zhh,0($Xi)
+___
+$code.=<<___;
+L\$done_gmult
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+
+	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+	.ALIGN	64
+gcm_ghash_4bit
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+___
+$code.=<<___ if ($SIZE_T==4);
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+___
+$code.=<<___;
+	blr	%r0,$rem_4bit
+	ldi	3,$rem
+L\$pic_ghash
+	andcm	$rem_4bit,$rem,$rem_4bit
+	addl	$inp,$len,$len
+	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
+	ldi	0xf0,$mask0xf0
+___
+$code.=<<___ if ($SIZE_T==4);
+	ldi	31,$rem
+	mtctl	$rem,%cr11
+	extrd,u,*= $rem,%sar,1,$rem	; executes on PA-RISC 1.0
+	b	L\$parisc1_ghash
+	nop
+___
+
+$code.=<<___;
+	ldb	15($Xi),$nlo
+	ldo	8($Htbl),$Hll
+
+L\$outer_ghash_pa2
+	ldb	15($inp),$nhi
+	xor	$nhi,$nlo,$nlo
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+
+	ldd	$nlo($Hll),$Zll
+	ldd	$nlo($Hhh),$Zhh
+
+	depd,z	$Zll,60,4,$rem
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldb	14($Xi),$nlo
+	ldb	14($inp),$byte
+
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+	xor	$byte,$nlo,$nlo
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+	b	L\$oop_ghash_pa2
+	ldi	13,$cnt
+
+	.ALIGN	8
+L\$oop_ghash_pa2
+	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
+	depd,z	$Zll,60,4,$rem2
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nlo($Hll),$Tll
+	ldd	$nlo($Hhh),$Thh
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldbx	$cnt($Xi),$nlo
+	ldbx	$cnt($inp),$byte
+
+	depd,z	$Zll,60,4,$rem
+	shrpd	$Zhh,$Zll,4,$Zll
+	ldd	$rem2($rem_4bit),$rem2
+
+	xor	$rem2,$Zhh,$Zhh
+	xor	$byte,$nlo,$nlo
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+
+	and	$mask0xf0,$nlo,$nhi
+	depd,z	$nlo,59,4,$nlo
+
+	extrd,u	$Zhh,59,60,$Zhh
+	xor	$Tll,$Zll,$Zll
+
+	ldd	$rem($rem_4bit),$rem
+	addib,uv -1,$cnt,L\$oop_ghash_pa2
+	xor	$Thh,$Zhh,$Zhh
+
+	xor	$rem,$Zhh,$Zhh
+	depd,z	$Zll,60,4,$rem2
+
+	shrpd	$Zhh,$Zll,4,$Zll
+	extrd,u	$Zhh,59,60,$Zhh
+	ldd	$nlo($Hll),$Tll
+	ldd	$nlo($Hhh),$Thh
+
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+
+	depd,z	$Zll,60,4,$rem
+	shrpd	$Zhh,$Zll,4,$Zll
+	ldd	$rem2($rem_4bit),$rem2
+
+	xor	$rem2,$Zhh,$Zhh
+	ldd	$nhi($Hll),$Tll
+	ldd	$nhi($Hhh),$Thh
+
+	extrd,u	$Zhh,59,60,$Zhh
+	xor	$Tll,$Zll,$Zll
+	xor	$Thh,$Zhh,$Zhh
+	ldd	$rem($rem_4bit),$rem
+
+	xor	$rem,$Zhh,$Zhh
+	std	$Zll,8($Xi)
+	ldo	16($inp),$inp
+	std	$Zhh,0($Xi)
+	cmpb,*<> $inp,$len,L\$outer_ghash_pa2
+	copy	$Zll,$nlo
+___
+
+$code.=<<___ if ($SIZE_T==4);
+	b	L\$done_ghash
+	nop
+
+L\$parisc1_ghash
+	ldb	15($Xi),$nlo
+	ldo	12($Htbl),$Hll
+	ldo	8($Htbl),$Hlh
+	ldo	4($Htbl),$Hhl
+
+L\$outer_ghash_pa1
+	ldb	15($inp),$byte
+	xor	$byte,$nlo,$nlo
+	and	$mask0xf0,$nlo,$nhi
+	zdep	$nlo,27,4,$nlo
+
+	ldwx	$nlo($Hll),$Zll
+	ldwx	$nlo($Hlh),$Zlh
+	ldwx	$nlo($Hhl),$Zhl
+	ldwx	$nlo($Hhh),$Zhh
+	zdep	$Zll,28,4,$rem
+	ldb	14($Xi),$nlo
+	ldb	14($inp),$byte
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	extru	$Zhh,27,28,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	xor	$byte,$nlo,$nlo
+	xor	$rem,$Zhh,$Zhh
+	and	$mask0xf0,$nlo,$nhi
+	zdep	$nlo,27,4,$nlo
+
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nlo($Hll),$Tll
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nlo($Hlh),$Tlh
+	xor	$Thl,$Zhl,$Zhl
+	b	L\$oop_ghash_pa1
+	ldi	13,$cnt
+
+	.ALIGN	8
+L\$oop_ghash_pa1
+	zdep	$Zll,28,4,$rem
+	ldwx	$nlo($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nlo($Hhh),$Thh
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	ldbx	$cnt($Xi),$nlo
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	ldbx	$cnt($inp),$byte
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	extru	$Zhh,27,28,$Zhh
+	xor	$Thl,$Zhl,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	xor	$rem,$Zhh,$Zhh
+	zdep	$Zll,28,4,$rem
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	xor	$byte,$nlo,$nlo
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	and	$mask0xf0,$nlo,$nhi
+	extru	$Zhh,27,28,$Zhh
+	zdep	$nlo,27,4,$nlo
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nlo($Hll),$Tll
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nlo($Hlh),$Tlh
+	xor	$rem,$Zhh,$Zhh
+	addib,uv -1,$cnt,L\$oop_ghash_pa1
+	xor	$Thl,$Zhl,$Zhl
+
+	zdep	$Zll,28,4,$rem
+	ldwx	$nlo($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	ldwx	$nlo($Hhh),$Thh
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	xor	$Tll,$Zll,$Zll
+	ldwx	$nhi($Hll),$Tll
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	xor	$Tlh,$Zlh,$Zlh
+	ldwx	$nhi($Hlh),$Tlh
+	extru	$Zhh,27,28,$Zhh
+	xor	$rem,$Zhh,$Zhh
+	xor	$Thl,$Zhl,$Zhl
+	ldwx	$nhi($Hhl),$Thl
+	xor	$Thh,$Zhh,$Zhh
+	ldwx	$nhi($Hhh),$Thh
+	zdep	$Zll,28,4,$rem
+	ldwx	$rem($rem_4bit),$rem
+	shrpw	$Zlh,$Zll,4,$Zll
+	shrpw	$Zhl,$Zlh,4,$Zlh
+	shrpw	$Zhh,$Zhl,4,$Zhl
+	extru	$Zhh,27,28,$Zhh
+	xor	$Tll,$Zll,$Zll
+	xor	$Tlh,$Zlh,$Zlh
+	xor	$rem,$Zhh,$Zhh
+	stw	$Zll,12($Xi)
+	xor	$Thl,$Zhl,$Zhl
+	stw	$Zlh,8($Xi)
+	xor	$Thh,$Zhh,$Zhh
+	stw	$Zhl,4($Xi)
+	ldo	16($inp),$inp
+	stw	$Zhh,0($Xi)
+	comb,<>	$inp,$len,L\$outer_ghash_pa1
+	copy	$Zll,$nlo
+___
+$code.=<<___;
+L\$done_ghash
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+___
+$code.=<<___ if ($SIZE_T==4);
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+___
+$code.=<<___;
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+
+	.ALIGN	64
+L\$rem_4bit
+	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
+	.ALIGN	64
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+my $ldd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "ldd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
+    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
+    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
+	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
+	$opcode|=(1<<5)  if ($mod =~ /^,m/);
+	$opcode|=(1<<13) if ($mod =~ /^,mb/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $std = sub {
+  my ($mod,$args) = @_;
+  my $orig = "std$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
+    {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $extrd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "extrd$mod\t$args";
+
+    # I only have ",u" completer, it's implicitly encoded...
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
+    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+	my $len=32-$3;
+	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
+	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
+    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+	my $len=32-$2;
+	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
+	$opcode |= (1<<13) if ($mod =~ /,\**=/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "shrpd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
+    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+	my $cpos=63-$3;
+	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
+    {	sprintf "\t.WORD\t0x%08x\t; %s",
+		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $depd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "depd$mod\t$args";
+
+    # I only have ",z" completer, it's impicitly encoded...
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 16
+    {	my $opcode=(0x3c<<26)|($4<<21)|($1<<16);
+    	my $cpos=63-$2;
+	my $len=32-$3;
+	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode pos
+	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+sub assemble {
+  my ($mnemonic,$mod,$args)=@_;
+  my $opcode = eval("\$$mnemonic");
+
+    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+	if ($SIZE_T==4) {
+		s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e;
+		s/cmpb,\*/comb,/;
+		s/,\*/,/;
+	}
+	print $_,"\n";
+}
+
+close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-s390x.pl b/src/lib/libcrypto/modes/asm/ghash-s390x.pl
new file mode 100644
index 0000000000..6a40d5d89c
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-s390x.pl
@@ -0,0 +1,262 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# September 2010.
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Performance
+# was measured to be ~18 cycles per processed byte on z10, which is
+# almost 40% better than gcc-generated code. It should be noted that
+# 18 cycles is worse result than expected: loop is scheduled for 12
+# and the result should be close to 12. In the lack of instruction-
+# level profiling data it's impossible to tell why...
+
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2.8x better than 32-bit code generated by gcc 4.3.
+
+# March 2011.
+#
+# Support for hardware KIMD-GHASH is verified to produce correct
+# result and therefore is engaged. On z196 it was measured to process
+# 8KB buffer ~7 faster than software implementation. It's not as
+# impressive for smaller buffer sizes and for smallest 16-bytes buffer
+# it's actually almost 2 times slower. Which is the reason why
+# KIMD-GHASH is not used in gcm_gmult_4bit.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$softonly=0;
+
+$Zhi="%r0";
+$Zlo="%r1";
+
+$Xi="%r2";	# argument block
+$Htbl="%r3";
+$inp="%r4";
+$len="%r5";
+
+$rem0="%r6";	# variables
+$rem1="%r7";
+$nlo="%r8";
+$nhi="%r9";
+$xi="%r10";
+$cnt="%r11";
+$tmp="%r12";
+$x78="%r13";
+$rem_4bit="%r14";
+
+$sp="%r15";
+
+$code.=<<___;
+.text
+
+.globl	gcm_gmult_4bit
+.align	32
+gcm_gmult_4bit:
+___
+$code.=<<___ if(!$softonly && 0);	# hardware is slow for single block...
+	larl	%r1,OPENSSL_s390xcap_P
+	lg	%r0,0(%r1)
+	tmhl	%r0,0x4000	# check for message-security-assist
+	jz	.Lsoft_gmult
+	lghi	%r0,0
+	la	%r1,16($sp)
+	.long	0xb93e0004	# kimd %r0,%r4
+	lg	%r1,24($sp)
+	tmhh	%r1,0x4000	# check for function 65
+	jz	.Lsoft_gmult
+	stg	%r0,16($sp)	# arrange 16 bytes of zero input
+	stg	%r0,24($sp)
+	lghi	%r0,65		# function 65
+	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
+	la	$inp,16($sp)
+	lghi	$len,16
+	.long	0xb93e0004	# kimd %r0,$inp
+	brc	1,.-4		# pay attention to "partial completion"
+	br	%r14
+.align	32
+.Lsoft_gmult:
+___
+$code.=<<___;
+	stm${g}	%r6,%r14,6*$SIZE_T($sp)
+
+	aghi	$Xi,-1
+	lghi	$len,1
+	lghi	$x78,`0xf<<3`
+	larl	$rem_4bit,rem_4bit
+
+	lg	$Zlo,8+1($Xi)		# Xi
+	j	.Lgmult_shortcut
+.type	gcm_gmult_4bit,\@function
+.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
+
+.globl	gcm_ghash_4bit
+.align	32
+gcm_ghash_4bit:
+___
+$code.=<<___ if(!$softonly);
+	larl	%r1,OPENSSL_s390xcap_P
+	lg	%r0,0(%r1)
+	tmhl	%r0,0x4000	# check for message-security-assist
+	jz	.Lsoft_ghash
+	lghi	%r0,0
+	la	%r1,16($sp)
+	.long	0xb93e0004	# kimd %r0,%r4
+	lg	%r1,24($sp)
+	tmhh	%r1,0x4000	# check for function 65
+	jz	.Lsoft_ghash
+	lghi	%r0,65		# function 65
+	la	%r1,0($Xi)	# H lies right after Xi in gcm128_context
+	.long	0xb93e0004	# kimd %r0,$inp
+	brc	1,.-4		# pay attention to "partial completion"
+	br	%r14
+.align	32
+.Lsoft_ghash:
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+	llgfr	$len,$len
+___
+$code.=<<___;
+	stm${g}	%r6,%r14,6*$SIZE_T($sp)
+
+	aghi	$Xi,-1
+	srlg	$len,$len,4
+	lghi	$x78,`0xf<<3`
+	larl	$rem_4bit,rem_4bit
+
+	lg	$Zlo,8+1($Xi)		# Xi
+	lg	$Zhi,0+1($Xi)
+	lghi	$tmp,0
+.Louter:
+	xg	$Zhi,0($inp)		# Xi ^= inp 
+	xg	$Zlo,8($inp)
+	xgr	$Zhi,$tmp
+	stg	$Zlo,8+1($Xi)
+	stg	$Zhi,0+1($Xi)
+
+.Lgmult_shortcut:
+	lghi	$tmp,0xf0
+	sllg	$nlo,$Zlo,4
+	srlg	$xi,$Zlo,8		# extract second byte
+	ngr	$nlo,$tmp
+	lgr	$nhi,$Zlo
+	lghi	$cnt,14
+	ngr	$nhi,$tmp
+
+	lg	$Zlo,8($nlo,$Htbl)
+	lg	$Zhi,0($nlo,$Htbl)
+
+	sllg	$nlo,$xi,4
+	sllg	$rem0,$Zlo,3
+	ngr	$nlo,$tmp
+	ngr	$rem0,$x78
+	ngr	$xi,$tmp
+
+	sllg	$tmp,$Zhi,60
+	srlg	$Zlo,$Zlo,4
+	srlg	$Zhi,$Zhi,4
+	xg	$Zlo,8($nhi,$Htbl)
+	xg	$Zhi,0($nhi,$Htbl)
+	lgr	$nhi,$xi
+	sllg	$rem1,$Zlo,3
+	xgr	$Zlo,$tmp
+	ngr	$rem1,$x78
+	j	.Lghash_inner
+.align	16
+.Lghash_inner:
+	srlg	$Zlo,$Zlo,4
+	sllg	$tmp,$Zhi,60
+	xg	$Zlo,8($nlo,$Htbl)
+	srlg	$Zhi,$Zhi,4
+	llgc	$xi,0($cnt,$Xi)
+	xg	$Zhi,0($nlo,$Htbl)
+	sllg	$nlo,$xi,4
+	xg	$Zhi,0($rem0,$rem_4bit)
+	nill	$nlo,0xf0
+	sllg	$rem0,$Zlo,3
+	xgr	$Zlo,$tmp
+	ngr	$rem0,$x78
+	nill	$xi,0xf0
+
+	sllg	$tmp,$Zhi,60
+	srlg	$Zlo,$Zlo,4
+	srlg	$Zhi,$Zhi,4
+	xg	$Zlo,8($nhi,$Htbl)
+	xg	$Zhi,0($nhi,$Htbl)
+	lgr	$nhi,$xi
+	xg	$Zhi,0($rem1,$rem_4bit)
+	sllg	$rem1,$Zlo,3
+	xgr	$Zlo,$tmp
+	ngr	$rem1,$x78
+	brct	$cnt,.Lghash_inner
+
+	sllg	$tmp,$Zhi,60
+	srlg	$Zlo,$Zlo,4
+	srlg	$Zhi,$Zhi,4
+	xg	$Zlo,8($nlo,$Htbl)
+	xg	$Zhi,0($nlo,$Htbl)
+	sllg	$xi,$Zlo,3
+	xg	$Zhi,0($rem0,$rem_4bit)
+	xgr	$Zlo,$tmp
+	ngr	$xi,$x78
+
+	sllg	$tmp,$Zhi,60
+	srlg	$Zlo,$Zlo,4
+	srlg	$Zhi,$Zhi,4
+	xg	$Zlo,8($nhi,$Htbl)
+	xg	$Zhi,0($nhi,$Htbl)
+	xgr	$Zlo,$tmp
+	xg	$Zhi,0($rem1,$rem_4bit)
+
+	lg	$tmp,0($xi,$rem_4bit)
+	la	$inp,16($inp)
+	sllg	$tmp,$tmp,4		# correct last rem_4bit[rem]
+	brctg	$len,.Louter
+
+	xgr	$Zhi,$tmp
+	stg	$Zlo,8+1($Xi)
+	stg	$Zhi,0+1($Xi)
+	lm${g}	%r6,%r14,6*$SIZE_T($sp)
+	br	%r14
+.type	gcm_ghash_4bit,\@function
+.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
+
+.align	64
+rem_4bit:
+	.long	`0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0
+	.long	`0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0
+	.long	`0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0
+	.long	`0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0
+.type	rem_4bit,\@object
+.size	rem_4bit,(.-rem_4bit)
+.string	"GHASH for s390x, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
new file mode 100644
index 0000000000..70e7b044a3
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl
@@ -0,0 +1,330 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# March 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Performance
+# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
+# and are expressed in cycles per processed byte, less is better:
+#
+#		gcc 3.3.x	cc 5.2		this assembler
+#
+# 32-bit build	81.4		43.3		12.6	(+546%/+244%)
+# 64-bit build	20.2		21.2		12.6	(+60%/+68%)
+#
+# Here is data collected on UltraSPARC T1 system running Linux:
+#
+#		gcc 4.4.1			this assembler
+#
+# 32-bit build	566				50	(+1000%)
+# 64-bit build	56				50	(+12%)
+#
+# I don't quite understand why difference between 32-bit and 64-bit
+# compiler-generated code is so big. Compilers *were* instructed to
+# generate code for UltraSPARC and should have used 64-bit registers
+# for Z vector (see C code) even in 32-bit build... Oh well, it only
+# means more impressive improvement coefficients for this assembler
+# module;-) Loops are aggressively modulo-scheduled in respect to
+# references to input data and Z.hi updates to achieve 12 cycles
+# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
+# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
+
+$bits=32;
+for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
+if ($bits==64)  { $bias=2047; $frame=192; }
+else            { $bias=0;    $frame=112; }
+
+$output=shift;
+open STDOUT,">$output";
+
+$Zhi="%o0";	# 64-bit values
+$Zlo="%o1";
+$Thi="%o2";
+$Tlo="%o3";
+$rem="%o4";
+$tmp="%o5";
+
+$nhi="%l0";	# small values and pointers
+$nlo="%l1";
+$xi0="%l2";
+$xi1="%l3";
+$rem_4bit="%l4";
+$remi="%l5";
+$Htblo="%l6";
+$cnt="%l7";
+
+$Xi="%i0";	# input argument block
+$Htbl="%i1";
+$inp="%i2";
+$len="%i3";
+
+$code.=<<___;
+.section	".text",#alloc,#execinstr
+
+.align	64
+rem_4bit:
+	.long	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
+	.long	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
+	.long	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
+	.long	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
+.type	rem_4bit,#object
+.size	rem_4bit,(.-rem_4bit)
+
+.globl	gcm_ghash_4bit
+.align	32
+gcm_ghash_4bit:
+	save	%sp,-$frame,%sp
+	ldub	[$inp+15],$nlo
+	ldub	[$Xi+15],$xi0
+	ldub	[$Xi+14],$xi1
+	add	$len,$inp,$len
+	add	$Htbl,8,$Htblo
+
+1:	call	.+8
+	add	%o7,rem_4bit-1b,$rem_4bit
+
+.Louter:
+	xor	$xi0,$nlo,$nlo
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	sll	$nlo,4,$nlo
+	ldx	[$Htblo+$nlo],$Zlo
+	ldx	[$Htbl+$nlo],$Zhi
+
+	ldub	[$inp+14],$nlo
+
+	ldx	[$Htblo+$nhi],$Tlo
+	and	$Zlo,0xf,$remi
+	ldx	[$Htbl+$nhi],$Thi
+	sll	$remi,3,$remi
+	ldx	[$rem_4bit+$remi],$rem
+	srlx	$Zlo,4,$Zlo
+	mov	13,$cnt
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+
+	xor	$xi1,$nlo,$nlo
+	and	$Zlo,0xf,$remi
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	ba	.Lghash_inner
+	sll	$nlo,4,$nlo
+.align	32
+.Lghash_inner:
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	ldub	[$inp+$cnt],$nlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	ldub	[$Xi+$cnt],$xi1
+	xor	$Thi,$Zhi,$Zhi
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$xi1,$nlo,$nlo
+	srlx	$Zhi,4,$Zhi
+	and	$nlo,0xf0,$nhi
+	addcc	$cnt,-1,$cnt
+	xor	$Zlo,$tmp,$Zlo
+	and	$nlo,0x0f,$nlo
+	xor	$Tlo,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+	blu	.Lghash_inner
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+
+	add	$inp,16,$inp
+	cmp	$inp,$len
+	be,pn	`$bits==64?"%xcc":"%icc"`,.Ldone
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	ldub	[$inp+15],$nlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	stx	$Zlo,[$Xi+8]
+	xor	$rem,$Zhi,$Zhi
+	stx	$Zhi,[$Xi]
+	srl	$Zlo,8,$xi1
+	and	$Zlo,0xff,$xi0
+	ba	.Louter
+	and	$xi1,0xff,$xi1
+.align	32
+.Ldone:
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	stx	$Zlo,[$Xi+8]
+	xor	$rem,$Zhi,$Zhi
+	stx	$Zhi,[$Xi]
+
+	ret
+	restore
+.type	gcm_ghash_4bit,#function
+.size	gcm_ghash_4bit,(.-gcm_ghash_4bit)
+___
+
+undef $inp;
+undef $len;
+
+$code.=<<___;
+.globl	gcm_gmult_4bit
+.align	32
+gcm_gmult_4bit:
+	save	%sp,-$frame,%sp
+	ldub	[$Xi+15],$nlo
+	add	$Htbl,8,$Htblo
+
+1:	call	.+8
+	add	%o7,rem_4bit-1b,$rem_4bit
+
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	sll	$nlo,4,$nlo
+	ldx	[$Htblo+$nlo],$Zlo
+	ldx	[$Htbl+$nlo],$Zhi
+
+	ldub	[$Xi+14],$nlo
+
+	ldx	[$Htblo+$nhi],$Tlo
+	and	$Zlo,0xf,$remi
+	ldx	[$Htbl+$nhi],$Thi
+	sll	$remi,3,$remi
+	ldx	[$rem_4bit+$remi],$rem
+	srlx	$Zlo,4,$Zlo
+	mov	13,$cnt
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+
+	and	$Zlo,0xf,$remi
+	and	$nlo,0xf0,$nhi
+	and	$nlo,0x0f,$nlo
+	ba	.Lgmult_inner
+	sll	$nlo,4,$nlo
+.align	32
+.Lgmult_inner:
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	ldub	[$Xi+$cnt],$nlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	srlx	$Zhi,4,$Zhi
+	and	$nlo,0xf0,$nhi
+	addcc	$cnt,-1,$cnt
+	xor	$Zlo,$tmp,$Zlo
+	and	$nlo,0x0f,$nlo
+	xor	$Tlo,$Zlo,$Zlo
+	sll	$nlo,4,$nlo
+	blu	.Lgmult_inner
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nlo],$Tlo
+	sll	$remi,3,$remi
+	xor	$Thi,$Zhi,$Zhi
+	ldx	[$Htbl+$nlo],$Thi
+	srlx	$Zlo,4,$Zlo
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	and	$Zlo,0xf,$remi
+
+	ldx	[$Htblo+$nhi],$Tlo
+	sll	$remi,3,$remi
+	xor	$rem,$Zhi,$Zhi
+	ldx	[$Htbl+$nhi],$Thi
+	srlx	$Zlo,4,$Zlo
+	ldx	[$rem_4bit+$remi],$rem
+	sllx	$Zhi,60,$tmp
+	xor	$Tlo,$Zlo,$Zlo
+	srlx	$Zhi,4,$Zhi
+	xor	$Zlo,$tmp,$Zlo
+	xor	$Thi,$Zhi,$Zhi
+	stx	$Zlo,[$Xi+8]
+	xor	$rem,$Zhi,$Zhi
+	stx	$Zhi,[$Xi]
+
+	ret
+	restore
+.type	gcm_gmult_4bit,#function
+.size	gcm_gmult_4bit,(.-gcm_gmult_4bit)
+.asciz	"GHASH for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align	4
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl
new file mode 100644
index 0000000000..6b09669d47
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-x86.pl
@@ -0,0 +1,1342 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, May, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two
+# code paths: vanilla x86 and vanilla MMX. Former will be executed on
+# 486 and Pentium, latter on all others. MMX GHASH features so called
+# "528B" variant of "4-bit" method utilizing additional 256+16 bytes
+# of per-key storage [+512 bytes shared table]. Performance results
+# are for streamed GHASH subroutine and are expressed in cycles per
+# processed byte, less is better:
+#
+#		gcc 2.95.3(*)	MMX assembler	x86 assembler
+#
+# Pentium	105/111(**)	-		50
+# PIII		68 /75		12.2		24
+# P4		125/125		17.8		84(***)
+# Opteron	66 /70		10.1		30
+# Core2		54 /67		8.4		18
+#
+# (*)	gcc 3.4.x was observed to generate few percent slower code,
+#	which is one of reasons why 2.95.3 results were chosen,
+#	another reason is lack of 3.4.x results for older CPUs;
+#	comparison with MMX results is not completely fair, because C
+#	results are for vanilla "256B" implementation, while
+#	assembler results are for "528B";-)
+# (**)	second number is result for code compiled with -fPIC flag,
+#	which is actually more relevant, because assembler code is
+#	position-independent;
+# (***)	see comment in non-MMX routine for further details;
+#
+# To summarize, it's >2-5 times faster than gcc-generated code. To
+# anchor it to something else SHA1 assembler processes one byte in
+# 11-13 cycles on contemporary x86 cores. As for choice of MMX in
+# particular, see comment at the end of the file...
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
+# The question is how close is it to theoretical limit? The pclmulqdq
+# instruction latency appears to be 14 cycles and there can't be more
+# than 2 of them executing at any given time. This means that single
+# Karatsuba multiplication would take 28 cycles *plus* few cycles for
+# pre- and post-processing. Then multiplication has to be followed by
+# modulo-reduction. Given that aggregated reduction method [see
+# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
+# white paper by Intel] allows you to perform reduction only once in
+# a while we can assume that asymptotic performance can be estimated
+# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
+# and Naggr is the aggregation factor.
+#
+# Before we proceed to this implementation let's have closer look at
+# the best-performing code suggested by Intel in their white paper.
+# By tracing inter-register dependencies Tmod is estimated as ~19
+# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
+# processed byte. As implied, this is quite optimistic estimate,
+# because it does not account for Karatsuba pre- and post-processing,
+# which for a single multiplication is ~5 cycles. Unfortunately Intel
+# does not provide performance data for GHASH alone. But benchmarking
+# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
+# alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that
+# the result accounts even for pre-computing of degrees of the hash
+# key H, but its portion is negligible at 16KB buffer size.
+#
+# Moving on to the implementation in question. Tmod is estimated as
+# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
+# 2.16. How is it possible that measured performance is better than
+# optimistic theoretical estimate? There is one thing Intel failed
+# to recognize. By serializing GHASH with CTR in same subroutine
+# former's performance is really limited to above (Tmul + Tmod/Naggr)
+# equation. But if GHASH procedure is detached, the modulo-reduction
+# can be interleaved with Naggr-1 multiplications at instruction level
+# and under ideal conditions even disappear from the equation. So that
+# optimistic theoretical estimate for this implementation is ...
+# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
+# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
+# where Tproc is time required for Karatsuba pre- and post-processing,
+# is more realistic estimate. In this case it gives ... 1.91 cycles.
+# Or in other words, depending on how well we can interleave reduction
+# and one of the two multiplications the performance should be betwen
+# 1.91 and 2.16. As already mentioned, this implementation processes
+# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
+# - in 2.02. x86_64 performance is better, because larger register
+# bank allows to interleave reduction and multiplication better.
+#
+# Does it make sense to increase Naggr? To start with it's virtually
+# impossible in 32-bit mode, because of limited register bank
+# capacity. Otherwise improvement has to be weighed agiainst slower
+# setup, as well as code size and complexity increase. As even
+# optimistic estimate doesn't promise 30% performance improvement,
+# there are currently no plans to increase Naggr.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+# January 2010
+#
+# Tweaked to optimize transitions between integer and FP operations
+# on same XMM register, PCLMULQDQ subroutine was measured to process
+# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
+# The minor regression on Westmere is outweighed by ~15% improvement
+# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
+# similar manner resulted in almost 20% degradation on Sandy Bridge,
+# where original 64-bit code processes one byte in 1.95 cycles.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
+$inp  = "edi";
+$Htbl = "esi";
+
+$unroll = 0;	# Affects x86 loop. Folded loop performs ~7% worse
+		# than unrolled, which has to be weighted against
+		# 2.5x x86-specific code size reduction.
+
+sub x86_loop {
+    my $off = shift;
+    my $rem = "eax";
+
+	&mov	($Zhh,&DWP(4,$Htbl,$Zll));
+	&mov	($Zhl,&DWP(0,$Htbl,$Zll));
+	&mov	($Zlh,&DWP(12,$Htbl,$Zll));
+	&mov	($Zll,&DWP(8,$Htbl,$Zll));
+	&xor	($rem,$rem);	# avoid partial register stalls on PIII
+
+	# shrd practically kills P4, 2.5x deterioration, but P4 has
+	# MMX code-path to execute. shrd runs tad faster [than twice
+	# the shifts, move's and or's] on pre-MMX Pentium (as well as
+	# PIII and Core2), *but* minimizes code size, spares register
+	# and thus allows to fold the loop...
+	if (!$unroll) {
+	my $cnt = $inp;
+	&mov	($cnt,15);
+	&jmp	(&label("x86_loop"));
+	&set_label("x86_loop",16);
+	    for($i=1;$i<=2;$i++) {
+		&mov	(&LB($rem),&LB($Zll));
+		&shrd	($Zll,$Zlh,4);
+		&and	(&LB($rem),0xf);
+		&shrd	($Zlh,$Zhl,4);
+		&shrd	($Zhl,$Zhh,4);
+		&shr	($Zhh,4);
+		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));
+
+		&mov	(&LB($rem),&BP($off,"esp",$cnt));
+		if ($i&1) {
+			&and	(&LB($rem),0xf0);
+		} else {
+			&shl	(&LB($rem),4);
+		}
+
+		&xor	($Zll,&DWP(8,$Htbl,$rem));
+		&xor	($Zlh,&DWP(12,$Htbl,$rem));
+		&xor	($Zhl,&DWP(0,$Htbl,$rem));
+		&xor	($Zhh,&DWP(4,$Htbl,$rem));
+
+		if ($i&1) {
+			&dec	($cnt);
+			&js	(&label("x86_break"));
+		} else {
+			&jmp	(&label("x86_loop"));
+		}
+	    }
+	&set_label("x86_break",16);
+	} else {
+	    for($i=1;$i<32;$i++) {
+		&comment($i);
+		&mov	(&LB($rem),&LB($Zll));
+		&shrd	($Zll,$Zlh,4);
+		&and	(&LB($rem),0xf);
+		&shrd	($Zlh,$Zhl,4);
+		&shrd	($Zhl,$Zhh,4);
+		&shr	($Zhh,4);
+		&xor	($Zhh,&DWP($off+16,"esp",$rem,4));
+
+		if ($i&1) {
+			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
+			&and	(&LB($rem),0xf0);
+		} else {
+			&mov	(&LB($rem),&BP($off+15-($i>>1),"esp"));
+			&shl	(&LB($rem),4);
+		}
+
+		&xor	($Zll,&DWP(8,$Htbl,$rem));
+		&xor	($Zlh,&DWP(12,$Htbl,$rem));
+		&xor	($Zhl,&DWP(0,$Htbl,$rem));
+		&xor	($Zhh,&DWP(4,$Htbl,$rem));
+	    }
+	}
+	&bswap	($Zll);
+	&bswap	($Zlh);
+	&bswap	($Zhl);
+	if (!$x86only) {
+		&bswap	($Zhh);
+	} else {
+		&mov	("eax",$Zhh);
+		&bswap	("eax");
+		&mov	($Zhh,"eax");
+	}
+}
+
+if ($unroll) {
+    &function_begin_B("_x86_gmult_4bit_inner");
+	&x86_loop(4);
+	&ret	();
+    &function_end_B("_x86_gmult_4bit_inner");
+}
+
+sub deposit_rem_4bit {
+    my $bias = shift;
+
+	&mov	(&DWP($bias+0, "esp"),0x0000<<16);
+	&mov	(&DWP($bias+4, "esp"),0x1C20<<16);
+	&mov	(&DWP($bias+8, "esp"),0x3840<<16);
+	&mov	(&DWP($bias+12,"esp"),0x2460<<16);
+	&mov	(&DWP($bias+16,"esp"),0x7080<<16);
+	&mov	(&DWP($bias+20,"esp"),0x6CA0<<16);
+	&mov	(&DWP($bias+24,"esp"),0x48C0<<16);
+	&mov	(&DWP($bias+28,"esp"),0x54E0<<16);
+	&mov	(&DWP($bias+32,"esp"),0xE100<<16);
+	&mov	(&DWP($bias+36,"esp"),0xFD20<<16);
+	&mov	(&DWP($bias+40,"esp"),0xD940<<16);
+	&mov	(&DWP($bias+44,"esp"),0xC560<<16);
+	&mov	(&DWP($bias+48,"esp"),0x9180<<16);
+	&mov	(&DWP($bias+52,"esp"),0x8DA0<<16);
+	&mov	(&DWP($bias+56,"esp"),0xA9C0<<16);
+	&mov	(&DWP($bias+60,"esp"),0xB5E0<<16);
+}
+
+$suffix = $x86only ? "" : "_x86";
+
+&function_begin("gcm_gmult_4bit".$suffix);
+	&stack_push(16+4+1);			# +1 for stack alignment
+	&mov	($inp,&wparam(0));		# load Xi
+	&mov	($Htbl,&wparam(1));		# load Htable
+
+	&mov	($Zhh,&DWP(0,$inp));		# load Xi[16]
+	&mov	($Zhl,&DWP(4,$inp));
+	&mov	($Zlh,&DWP(8,$inp));
+	&mov	($Zll,&DWP(12,$inp));
+
+	&deposit_rem_4bit(16);
+
+	&mov	(&DWP(0,"esp"),$Zhh);		# copy Xi[16] on stack
+	&mov	(&DWP(4,"esp"),$Zhl);
+	&mov	(&DWP(8,"esp"),$Zlh);
+	&mov	(&DWP(12,"esp"),$Zll);
+	&shr	($Zll,20);
+	&and	($Zll,0xf0);
+
+	if ($unroll) {
+		&call	("_x86_gmult_4bit_inner");
+	} else {
+		&x86_loop(0);
+		&mov	($inp,&wparam(0));
+	}
+
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(0,$inp),$Zhh);
+	&stack_pop(16+4+1);
+&function_end("gcm_gmult_4bit".$suffix);
+
+&function_begin("gcm_ghash_4bit".$suffix);
+	&stack_push(16+4+1);			# +1 for 64-bit alignment
+	&mov	($Zll,&wparam(0));		# load Xi
+	&mov	($Htbl,&wparam(1));		# load Htable
+	&mov	($inp,&wparam(2));		# load in
+	&mov	("ecx",&wparam(3));		# load len
+	&add	("ecx",$inp);
+	&mov	(&wparam(3),"ecx");
+
+	&mov	($Zhh,&DWP(0,$Zll));		# load Xi[16]
+	&mov	($Zhl,&DWP(4,$Zll));
+	&mov	($Zlh,&DWP(8,$Zll));
+	&mov	($Zll,&DWP(12,$Zll));
+
+	&deposit_rem_4bit(16);
+
+    &set_label("x86_outer_loop",16);
+	&xor	($Zll,&DWP(12,$inp));		# xor with input
+	&xor	($Zlh,&DWP(8,$inp));
+	&xor	($Zhl,&DWP(4,$inp));
+	&xor	($Zhh,&DWP(0,$inp));
+	&mov	(&DWP(12,"esp"),$Zll);		# dump it on stack
+	&mov	(&DWP(8,"esp"),$Zlh);
+	&mov	(&DWP(4,"esp"),$Zhl);
+	&mov	(&DWP(0,"esp"),$Zhh);
+
+	&shr	($Zll,20);
+	&and	($Zll,0xf0);
+
+	if ($unroll) {
+		&call	("_x86_gmult_4bit_inner");
+	} else {
+		&x86_loop(0);
+		&mov	($inp,&wparam(2));
+	}
+	&lea	($inp,&DWP(16,$inp));
+	&cmp	($inp,&wparam(3));
+	&mov	(&wparam(2),$inp)	if (!$unroll);
+	&jb	(&label("x86_outer_loop"));
+
+	&mov	($inp,&wparam(0));	# load Xi
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(0,$inp),$Zhh);
+	&stack_pop(16+4+1);
+&function_end("gcm_ghash_4bit".$suffix);
+
+if (!$x86only) {{{
+
+&static_label("rem_4bit");
+
+if (!$sse2) {{	# pure-MMX "May" version...
+
+$S=12;		# shift factor for rem_4bit
+
+&function_begin_B("_mmx_gmult_4bit_inner");
+# MMX version performs 3.5 times better on P4 (see comment in non-MMX
+# routine for further details), 100% better on Opteron, ~70% better
+# on Core2 and PIII... In other words effort is considered to be well
+# spent... Since initial release the loop was unrolled in order to
+# "liberate" register previously used as loop counter. Instead it's
+# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'.
+# The path involves move of Z.lo from MMX to integer register,
+# effective address calculation and finally merge of value to Z.hi.
+# Reference to rem_4bit is scheduled so late that I had to >>4
+# rem_4bit elements. This resulted in 20-45% procent improvement
+# on contemporary �-archs.
+{
+    my $cnt;
+    my $rem_4bit = "eax";
+    my @rem = ($Zhh,$Zll);
+    my $nhi = $Zhl;
+    my $nlo = $Zlh;
+
+    my ($Zlo,$Zhi) = ("mm0","mm1");
+    my $tmp = "mm2";
+
+	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
+	&mov	($nhi,$Zll);
+	&mov	(&LB($nlo),&LB($nhi));
+	&shl	(&LB($nlo),4);
+	&and	($nhi,0xf0);
+	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
+	&movq	($Zhi,&QWP(0,$Htbl,$nlo));
+	&movd	($rem[0],$Zlo);
+
+	for ($cnt=28;$cnt>=-2;$cnt--) {
+	    my $odd = $cnt&1;
+	    my $nix = $odd ? $nlo : $nhi;
+
+		&shl	(&LB($nlo),4)			if ($odd);
+		&psrlq	($Zlo,4);
+		&movq	($tmp,$Zhi);
+		&psrlq	($Zhi,4);
+		&pxor	($Zlo,&QWP(8,$Htbl,$nix));
+		&mov	(&LB($nlo),&BP($cnt/2,$inp))	if (!$odd && $cnt>=0);
+		&psllq	($tmp,60);
+		&and	($nhi,0xf0)			if ($odd);
+		&pxor	($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28);
+		&and	($rem[0],0xf);
+		&pxor	($Zhi,&QWP(0,$Htbl,$nix));
+		&mov	($nhi,$nlo)			if (!$odd && $cnt>=0);
+		&movd	($rem[1],$Zlo);
+		&pxor	($Zlo,$tmp);
+
+		push	(@rem,shift(@rem));		# "rotate" registers
+	}
+
+	&mov	($inp,&DWP(4,$rem_4bit,$rem[1],8));	# last rem_4bit[rem]
+
+	&psrlq	($Zlo,32);	# lower part of Zlo is already there
+	&movd	($Zhl,$Zhi);
+	&psrlq	($Zhi,32);
+	&movd	($Zlh,$Zlo);
+	&movd	($Zhh,$Zhi);
+	&shl	($inp,4);	# compensate for rem_4bit[i] being >>4
+
+	&bswap	($Zll);
+	&bswap	($Zhl);
+	&bswap	($Zlh);
+	&xor	($Zhh,$inp);
+	&bswap	($Zhh);
+
+	&ret	();
+}
+&function_end_B("_mmx_gmult_4bit_inner");
+
+&function_begin("gcm_gmult_4bit_mmx");
+	&mov	($inp,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop("eax");
+	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+	&movz	($Zll,&BP(15,$inp));
+
+	&call	("_mmx_gmult_4bit_inner");
+
+	&mov	($inp,&wparam(0));	# load Xi
+	&emms	();
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+# Streamed version performs 20% better on P4, 7% on Opteron,
+# 10% on Core2 and PIII...
+&function_begin("gcm_ghash_4bit_mmx");
+	&mov	($Zhh,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+	&mov	($inp,&wparam(2));	# load in
+	&mov	($Zlh,&wparam(3));	# load len
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop("eax");
+	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+	&add	($Zlh,$inp);
+	&mov	(&wparam(3),$Zlh);	# len to point at the end of input
+	&stack_push(4+1);		# +1 for stack alignment
+
+	&mov	($Zll,&DWP(12,$Zhh));	# load Xi[16]
+	&mov	($Zhl,&DWP(4,$Zhh));
+	&mov	($Zlh,&DWP(8,$Zhh));
+	&mov	($Zhh,&DWP(0,$Zhh));
+	&jmp	(&label("mmx_outer_loop"));
+
+    &set_label("mmx_outer_loop",16);
+	&xor	($Zll,&DWP(12,$inp));
+	&xor	($Zhl,&DWP(4,$inp));
+	&xor	($Zlh,&DWP(8,$inp));
+	&xor	($Zhh,&DWP(0,$inp));
+	&mov	(&wparam(2),$inp);
+	&mov	(&DWP(12,"esp"),$Zll);
+	&mov	(&DWP(4,"esp"),$Zhl);
+	&mov	(&DWP(8,"esp"),$Zlh);
+	&mov	(&DWP(0,"esp"),$Zhh);
+
+	&mov	($inp,"esp");
+	&shr	($Zll,24);
+
+	&call	("_mmx_gmult_4bit_inner");
+
+	&mov	($inp,&wparam(2));
+	&lea	($inp,&DWP(16,$inp));
+	&cmp	($inp,&wparam(3));
+	&jb	(&label("mmx_outer_loop"));
+
+	&mov	($inp,&wparam(0));	# load Xi
+	&emms	();
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(0,$inp),$Zhh);
+
+	&stack_pop(4+1);
+&function_end("gcm_ghash_4bit_mmx");
+
+}} else {{	# "June" MMX version...
+		# ... has slower "April" gcm_gmult_4bit_mmx with folded
+		# loop. This is done to conserve code size...
+$S=16;		# shift factor for rem_4bit
+
+sub mmx_loop() {
+# MMX version performs 2.8 times better on P4 (see comment in non-MMX
+# routine for further details), 40% better on Opteron and Core2, 50%
+# better on PIII... In other words effort is considered to be well
+# spent...
+    my $inp = shift;
+    my $rem_4bit = shift;
+    my $cnt = $Zhh;
+    my $nhi = $Zhl;
+    my $nlo = $Zlh;
+    my $rem = $Zll;
+
+    my ($Zlo,$Zhi) = ("mm0","mm1");
+    my $tmp = "mm2";
+
+	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
+	&mov	($nhi,$Zll);
+	&mov	(&LB($nlo),&LB($nhi));
+	&mov	($cnt,14);
+	&shl	(&LB($nlo),4);
+	&and	($nhi,0xf0);
+	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
+	&movq	($Zhi,&QWP(0,$Htbl,$nlo));
+	&movd	($rem,$Zlo);
+	&jmp	(&label("mmx_loop"));
+
+    &set_label("mmx_loop",16);
+	&psrlq	($Zlo,4);
+	&and	($rem,0xf);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
+	&mov	(&LB($nlo),&BP(0,$inp,$cnt));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&dec	($cnt);
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
+	&mov	($nhi,$nlo);
+	&pxor	($Zlo,$tmp);
+	&js	(&label("mmx_break"));
+
+	&shl	(&LB($nlo),4);
+	&and	($rem,0xf);
+	&psrlq	($Zlo,4);
+	&and	($nhi,0xf0);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
+	&pxor	($Zlo,$tmp);
+	&jmp	(&label("mmx_loop"));
+
+    &set_label("mmx_break",16);
+	&shl	(&LB($nlo),4);
+	&and	($rem,0xf);
+	&psrlq	($Zlo,4);
+	&and	($nhi,0xf0);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
+	&pxor	($Zlo,$tmp);
+
+	&psrlq	($Zlo,4);
+	&and	($rem,0xf);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
+	&pxor	($Zlo,$tmp);
+
+	&psrlq	($Zlo,32);	# lower part of Zlo is already there
+	&movd	($Zhl,$Zhi);
+	&psrlq	($Zhi,32);
+	&movd	($Zlh,$Zlo);
+	&movd	($Zhh,$Zhi);
+
+	&bswap	($Zll);
+	&bswap	($Zhl);
+	&bswap	($Zlh);
+	&bswap	($Zhh);
+}
+
+&function_begin("gcm_gmult_4bit_mmx");
+	&mov	($inp,&wparam(0));	# load Xi
+	&mov	($Htbl,&wparam(1));	# load Htable
+
+	&call	(&label("pic_point"));
+	&set_label("pic_point");
+	&blindpop("eax");
+	&lea	("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
+
+	&movz	($Zll,&BP(15,$inp));
+
+	&mmx_loop($inp,"eax");
+
+	&emms	();
+	&mov	(&DWP(12,$inp),$Zll);
+	&mov	(&DWP(4,$inp),$Zhl);
+	&mov	(&DWP(8,$inp),$Zlh);
+	&mov	(&DWP(0,$inp),$Zhh);
+&function_end("gcm_gmult_4bit_mmx");
+
+######################################################################
+# Below subroutine is "528B" variant of "4-bit" GCM GHASH function
+# (see gcm128.c for details). It provides further 20-40% performance
+# improvement over above mentioned "May" version.
+
+&static_label("rem_8bit");
+
+&function_begin("gcm_ghash_4bit_mmx");
+{ my ($Zlo,$Zhi) = ("mm7","mm6");
+  my $rem_8bit = "esi";
+  my $Htbl = "ebx";
+
+    # parameter block
+    &mov	("eax",&wparam(0));		# Xi
+    &mov	("ebx",&wparam(1));		# Htable
+    &mov	("ecx",&wparam(2));		# inp
+    &mov	("edx",&wparam(3));		# len
+    &mov	("ebp","esp");			# original %esp
+    &call	(&label("pic_point"));
+    &set_label	("pic_point");
+    &blindpop	($rem_8bit);
+    &lea	($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit));
+
+    &sub	("esp",512+16+16);		# allocate stack frame...
+    &and	("esp",-64);			# ...and align it
+    &sub	("esp",16);			# place for (u8)(H[]<<4)
+
+    &add	("edx","ecx");			# pointer to the end of input
+    &mov	(&DWP(528+16+0,"esp"),"eax");	# save Xi
+    &mov	(&DWP(528+16+8,"esp"),"edx");	# save inp+len
+    &mov	(&DWP(528+16+12,"esp"),"ebp");	# save original %esp
+
+    { my @lo  = ("mm0","mm1","mm2");
+      my @hi  = ("mm3","mm4","mm5");
+      my @tmp = ("mm6","mm7");
+      my $off1=0,$off2=0,$i;
+
+      &add	($Htbl,128);			# optimize for size
+      &lea	("edi",&DWP(16+128,"esp"));
+      &lea	("ebp",&DWP(16+256+128,"esp"));
+
+      # decompose Htable (low and high parts are kept separately),
+      # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
+      for ($i=0;$i<18;$i++) {
+
+	&mov	("edx",&DWP(16*$i+8-128,$Htbl))		if ($i<16);
+	&movq	($lo[0],&QWP(16*$i+8-128,$Htbl))	if ($i<16);
+	&psllq	($tmp[1],60)				if ($i>1);
+	&movq	($hi[0],&QWP(16*$i+0-128,$Htbl))	if ($i<16);
+	&por	($lo[2],$tmp[1])			if ($i>1);
+	&movq	(&QWP($off1-128,"edi"),$lo[1])		if ($i>0 && $i<17);
+	&psrlq	($lo[1],4)				if ($i>0 && $i<17);
+	&movq	(&QWP($off1,"edi"),$hi[1])		if ($i>0 && $i<17);
+	&movq	($tmp[0],$hi[1])			if ($i>0 && $i<17);
+	&movq	(&QWP($off2-128,"ebp"),$lo[2])		if ($i>1);
+	&psrlq	($hi[1],4)				if ($i>0 && $i<17);
+	&movq	(&QWP($off2,"ebp"),$hi[2])		if ($i>1);
+	&shl	("edx",4)				if ($i<16);
+	&mov	(&BP($i,"esp"),&LB("edx"))		if ($i<16);
+
+	unshift	(@lo,pop(@lo));			# "rotate" registers
+	unshift	(@hi,pop(@hi));
+	unshift	(@tmp,pop(@tmp));
+	$off1 += 8	if ($i>0);
+	$off2 += 8	if ($i>1);
+      }
+    }
+
+    &movq	($Zhi,&QWP(0,"eax"));
+    &mov	("ebx",&DWP(8,"eax"));
+    &mov	("edx",&DWP(12,"eax"));		# load Xi
+
+&set_label("outer",16);
+  { my $nlo = "eax";
+    my $dat = "edx";
+    my @nhi = ("edi","ebp");
+    my @rem = ("ebx","ecx");
+    my @red = ("mm0","mm1","mm2");
+    my $tmp = "mm3";
+
+    &xor	($dat,&DWP(12,"ecx"));		# merge input data
+    &xor	("ebx",&DWP(8,"ecx"));
+    &pxor	($Zhi,&QWP(0,"ecx"));
+    &lea	("ecx",&DWP(16,"ecx"));		# inp+=16
+    #&mov	(&DWP(528+12,"esp"),$dat);	# save inp^Xi
+    &mov	(&DWP(528+8,"esp"),"ebx");
+    &movq	(&QWP(528+0,"esp"),$Zhi);
+    &mov	(&DWP(528+16+4,"esp"),"ecx");	# save inp
+
+    &xor	($nlo,$nlo);
+    &rol	($dat,8);
+    &mov	(&LB($nlo),&LB($dat));
+    &mov	($nhi[1],$nlo);
+    &and	(&LB($nlo),0x0f);
+    &shr	($nhi[1],4);
+    &pxor	($red[0],$red[0]);
+    &rol	($dat,8);			# next byte
+    &pxor	($red[1],$red[1]);
+    &pxor	($red[2],$red[2]);
+
+    # Just like in "May" verson modulo-schedule for critical path in
+    # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
+    # is scheduled so late that rem_8bit[] has to be shifted *right*
+    # by 16, which is why last argument to pinsrw is 2, which
+    # corresponds to <<32=<<48>>16...
+    for ($j=11,$i=0;$i<15;$i++) {
+
+      if ($i>0) {
+	&pxor	($Zlo,&QWP(16,"esp",$nlo,8));		# Z^=H[nlo]
+	&rol	($dat,8);				# next byte
+	&pxor	($Zhi,&QWP(16+128,"esp",$nlo,8));
+
+	&pxor	($Zlo,$tmp);
+	&pxor	($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+	&xor	(&LB($rem[1]),&BP(0,"esp",$nhi[0]));	# rem^(H[nhi]<<4)
+      } else {
+	&movq	($Zlo,&QWP(16,"esp",$nlo,8));
+	&movq	($Zhi,&QWP(16+128,"esp",$nlo,8));
+      }
+
+	&mov	(&LB($nlo),&LB($dat));
+	&mov	($dat,&DWP(528+$j,"esp"))		if (--$j%4==0);
+
+	&movd	($rem[0],$Zlo);
+	&movz	($rem[1],&LB($rem[1]))			if ($i>0);
+	&psrlq	($Zlo,8);				# Z>>=8
+
+	&movq	($tmp,$Zhi);
+	&mov	($nhi[0],$nlo);
+	&psrlq	($Zhi,8);
+
+	&pxor	($Zlo,&QWP(16+256+0,"esp",$nhi[1],8));	# Z^=H[nhi]>>4
+	&and	(&LB($nlo),0x0f);
+	&psllq	($tmp,56);
+
+	&pxor	($Zhi,$red[1])				if ($i>1);
+	&shr	($nhi[0],4);
+	&pinsrw	($red[0],&WP(0,$rem_8bit,$rem[1],2),2)	if ($i>0);
+
+	unshift	(@red,pop(@red));			# "rotate" registers
+	unshift	(@rem,pop(@rem));
+	unshift	(@nhi,pop(@nhi));
+    }
+
+    &pxor	($Zlo,&QWP(16,"esp",$nlo,8));		# Z^=H[nlo]
+    &pxor	($Zhi,&QWP(16+128,"esp",$nlo,8));
+    &xor	(&LB($rem[1]),&BP(0,"esp",$nhi[0]));	# rem^(H[nhi]<<4)
+
+    &pxor	($Zlo,$tmp);
+    &pxor	($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
+    &movz	($rem[1],&LB($rem[1]));
+
+    &pxor	($red[2],$red[2]);			# clear 2nd word
+    &psllq	($red[1],4);
+
+    &movd	($rem[0],$Zlo);
+    &psrlq	($Zlo,4);				# Z>>=4
+
+    &movq	($tmp,$Zhi);
+    &psrlq	($Zhi,4);
+    &shl	($rem[0],4);				# rem<<4
+
+    &pxor	($Zlo,&QWP(16,"esp",$nhi[1],8));	# Z^=H[nhi]
+    &psllq	($tmp,60);
+    &movz	($rem[0],&LB($rem[0]));
+
+    &pxor	($Zlo,$tmp);
+    &pxor	($Zhi,&QWP(16+128,"esp",$nhi[1],8));
+
+    &pinsrw	($red[0],&WP(0,$rem_8bit,$rem[1],2),2);
+    &pxor	($Zhi,$red[1]);
+
+    &movd	($dat,$Zlo);
+    &pinsrw	($red[2],&WP(0,$rem_8bit,$rem[0],2),3);	# last is <<48
+
+    &psllq	($red[0],12);				# correct by <<16>>4
+    &pxor	($Zhi,$red[0]);
+    &psrlq	($Zlo,32);
+    &pxor	($Zhi,$red[2]);
+
+    &mov	("ecx",&DWP(528+16+4,"esp"));	# restore inp
+    &movd	("ebx",$Zlo);
+    &movq	($tmp,$Zhi);			# 01234567
+    &psllw	($Zhi,8);			# 1.3.5.7.
+    &psrlw	($tmp,8);			# .0.2.4.6
+    &por	($Zhi,$tmp);			# 10325476
+    &bswap	($dat);
+    &pshufw	($Zhi,$Zhi,0b00011011);		# 76543210
+    &bswap	("ebx");
+    
+    &cmp	("ecx",&DWP(528+16+8,"esp"));	# are we done?
+    &jne	(&label("outer"));
+  }
+
+    &mov	("eax",&DWP(528+16+0,"esp"));	# restore Xi
+    &mov	(&DWP(12,"eax"),"edx");
+    &mov	(&DWP(8,"eax"),"ebx");
+    &movq	(&QWP(0,"eax"),$Zhi);
+
+    &mov	("esp",&DWP(528+16+12,"esp"));	# restore original %esp
+    &emms	();
+}
+&function_end("gcm_ghash_4bit_mmx");
+}}
+
+if ($sse2) {{
+######################################################################
+# PCLMULQDQ version.
+
+$Xip="eax";
+$Htbl="edx";
+$const="ecx";
+$inp="esi";
+$len="ebx";
+
+($Xi,$Xhi)=("xmm0","xmm1");	$Hkey="xmm2";
+($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
+($Xn,$Xhn)=("xmm6","xmm7");
+
+&static_label("bswap");
+
+sub clmul64x64_T2 {	# minimal "register" pressure
+my ($Xhi,$Xi,$Hkey)=@_;
+
+	&movdqa		($Xhi,$Xi);		#
+	&pshufd		($T1,$Xi,0b01001110);
+	&pshufd		($T2,$Hkey,0b01001110);
+	&pxor		($T1,$Xi);		#
+	&pxor		($T2,$Hkey);
+
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pclmulqdq	($T1,$T2,0x00);		#######
+	&xorps		($T1,$Xi);		#
+	&xorps		($T1,$Xhi);		#
+
+	&movdqa		($T2,$T1);		#
+	&psrldq		($T1,8);
+	&pslldq		($T2,8);		#
+	&pxor		($Xhi,$T1);
+	&pxor		($Xi,$T2);		#
+}
+
+sub clmul64x64_T3 {
+# Even though this subroutine offers visually better ILP, it
+# was empirically found to be a tad slower than above version.
+# At least in gcm_ghash_clmul context. But it's just as well,
+# because loop modulo-scheduling is possible only thanks to
+# minimized "register" pressure...
+my ($Xhi,$Xi,$Hkey)=@_;
+
+	&movdqa		($T1,$Xi);		#
+	&movdqa		($Xhi,$Xi);
+	&pclmulqdq	($Xi,$Hkey,0x00);	#######
+	&pclmulqdq	($Xhi,$Hkey,0x11);	#######
+	&pshufd		($T2,$T1,0b01001110);	#
+	&pshufd		($T3,$Hkey,0b01001110);
+	&pxor		($T2,$T1);		#
+	&pxor		($T3,$Hkey);
+	&pclmulqdq	($T2,$T3,0x00);		#######
+	&pxor		($T2,$Xi);		#
+	&pxor		($T2,$Xhi);		#
+
+	&movdqa		($T3,$T2);		#
+	&psrldq		($T2,8);
+	&pslldq		($T3,8);		#
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T3);		#
+}
+
+if (1) {		# Algorithm 9 with <<1 twist.
+			# Reduction is shorter and uses only two
+			# temporary registers, which makes it better
+			# candidate for interleaving with 64x64
+			# multiplication. Pre-modulo-scheduled loop
+			# was found to be ~20% faster than Algorithm 5
+			# below. Algorithm 9 was therefore chosen for
+			# further optimization...
+
+sub reduction_alg9 {	# 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+	# 1st phase
+	&movdqa		($T1,$Xi)		#
+	&psllq		($Xi,1);
+	&pxor		($Xi,$T1);		#
+	&psllq		($Xi,5);		#
+	&pxor		($Xi,$T1);		#
+	&psllq		($Xi,57);		#
+	&movdqa		($T2,$Xi);		#
+	&pslldq		($Xi,8);
+	&psrldq		($T2,8);		#
+	&pxor		($Xi,$T1);
+	&pxor		($Xhi,$T2);		#
+
+	# 2nd phase
+	&movdqa		($T2,$Xi);
+	&psrlq		($Xi,5);
+	&pxor		($Xi,$T2);		#
+	&psrlq		($Xi,1);		#
+	&pxor		($Xi,$T2);		#
+	&pxor		($T2,$Xhi);
+	&psrlq		($Xi,1);		#
+	&pxor		($Xi,$T2);		#
+}
+
+&function_begin_B("gcm_init_clmul");
+	&mov		($Htbl,&wparam(0));
+	&mov		($Xip,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Hkey,&QWP(0,$Xip));
+	&pshufd		($Hkey,$Hkey,0b01001110);# dword swap
+
+	# <<1 twist
+	&pshufd		($T2,$Hkey,0b11111111);	# broadcast uppermost dword
+	&movdqa		($T1,$Hkey);
+	&psllq		($Hkey,1);
+	&pxor		($T3,$T3);		#
+	&psrlq		($T1,63);
+	&pcmpgtd	($T3,$T2);		# broadcast carry bit
+	&pslldq		($T1,8);
+	&por		($Hkey,$T1);		# H<<=1
+
+	# magic reduction
+	&pand		($T3,&QWP(16,$const));	# 0x1c2_polynomial
+	&pxor		($Hkey,$T3);		# if(carry) H^=0x1c2_polynomial
+
+	# calculate H^2
+	&movdqa		($Xi,$Hkey);
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&reduction_alg9	($Xhi,$Xi);
+
+	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H
+	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2
+
+	&ret		();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($T3,&QWP(0,$const));
+	&movups		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$T3);
+
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&reduction_alg9	($Xhi,$Xi);
+
+	&pshufb		($Xi,$T3);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+
+	&ret	();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+	&mov		($inp,&wparam(2));
+	&mov		($len,&wparam(3));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($T3,&QWP(0,$const));
+	&movdqu		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$T3);
+
+	&sub		($len,0x10);
+	&jz		(&label("odd_tail"));
+
+	#######
+	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+	#	[(H*Ii+1) + (H*Xi+1)] mod P =
+	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
+	#
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pshufb		($T1,$T3);
+	&pshufb		($Xn,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
+	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+
+	&lea		($inp,&DWP(32,$inp));	# i+=2
+	&sub		($len,0x20);
+	&jbe		(&label("even_tail"));
+
+&set_label("mod_loop");
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&movups		($Hkey,&QWP(0,$Htbl));	# load H
+
+	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&pxor		($Xhi,$Xhn);
+
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pshufb		($T1,$T3);
+	&pshufb		($Xn,$T3);
+
+	&movdqa		($T3,$Xn);		#&clmul64x64_TX	($Xhn,$Xn,$Hkey); H*Ii+1
+	&movdqa		($Xhn,$Xn);
+	 &pxor		($Xhi,$T1);		# "Ii+Xi", consume early
+
+	  &movdqa	($T1,$Xi)		#&reduction_alg9($Xhi,$Xi); 1st phase
+	  &psllq	($Xi,1);
+	  &pxor		($Xi,$T1);		#
+	  &psllq	($Xi,5);		#
+	  &pxor		($Xi,$T1);		#
+	&pclmulqdq	($Xn,$Hkey,0x00);	#######
+	  &psllq	($Xi,57);		#
+	  &movdqa	($T2,$Xi);		#
+	  &pslldq	($Xi,8);
+	  &psrldq	($T2,8);		#	
+	  &pxor		($Xi,$T1);
+	&pshufd		($T1,$T3,0b01001110);
+	  &pxor		($Xhi,$T2);		#
+	&pxor		($T1,$T3);
+	&pshufd		($T3,$Hkey,0b01001110);
+	&pxor		($T3,$Hkey);		#
+
+	&pclmulqdq	($Xhn,$Hkey,0x11);	#######
+	  &movdqa	($T2,$Xi);		# 2nd phase
+	  &psrlq	($Xi,5);
+	  &pxor		($Xi,$T2);		#
+	  &psrlq	($Xi,1);		#
+	  &pxor		($Xi,$T2);		#
+	  &pxor		($T2,$Xhi);
+	  &psrlq	($Xi,1);		#
+	  &pxor		($Xi,$T2);		#
+
+	&pclmulqdq	($T1,$T3,0x00);		#######
+	&movups		($Hkey,&QWP(16,$Htbl));	# load H^2
+	&xorps		($T1,$Xn);		#
+	&xorps		($T1,$Xhn);		#
+
+	&movdqa		($T3,$T1);		#
+	&psrldq		($T1,8);
+	&pslldq		($T3,8);		#
+	&pxor		($Xhn,$T1);
+	&pxor		($Xn,$T3);		#
+	&movdqa		($T3,&QWP(0,$const));
+
+	&lea		($inp,&DWP(32,$inp));
+	&sub		($len,0x20);
+	&ja		(&label("mod_loop"));
+
+&set_label("even_tail");
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+
+	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&pxor		($Xhi,$Xhn);
+
+	&reduction_alg9	($Xhi,$Xi);
+
+	&test		($len,$len);
+	&jnz		(&label("done"));
+
+	&movups		($Hkey,&QWP(0,$Htbl));	# load H
+&set_label("odd_tail");
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&pshufb		($T1,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
+	&reduction_alg9	($Xhi,$Xi);
+
+&set_label("done");
+	&pshufb		($Xi,$T3);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+} else {		# Algorith 5. Kept for reference purposes.
+
+sub reduction_alg5 {	# 19/16 times faster than Intel version
+my ($Xhi,$Xi)=@_;
+
+	# <<1
+	&movdqa		($T1,$Xi);		#
+	&movdqa		($T2,$Xhi);
+	&pslld		($Xi,1);
+	&pslld		($Xhi,1);		#
+	&psrld		($T1,31);
+	&psrld		($T2,31);		#
+	&movdqa		($T3,$T1);
+	&pslldq		($T1,4);
+	&psrldq		($T3,12);		#
+	&pslldq		($T2,4);
+	&por		($Xhi,$T3);		#
+	&por		($Xi,$T1);
+	&por		($Xhi,$T2);		#
+
+	# 1st phase
+	&movdqa		($T1,$Xi);
+	&movdqa		($T2,$Xi);
+	&movdqa		($T3,$Xi);		#
+	&pslld		($T1,31);
+	&pslld		($T2,30);
+	&pslld		($Xi,25);		#
+	&pxor		($T1,$T2);
+	&pxor		($T1,$Xi);		#
+	&movdqa		($T2,$T1);		#
+	&pslldq		($T1,12);
+	&psrldq		($T2,4);		#
+	&pxor		($T3,$T1);
+
+	# 2nd phase
+	&pxor		($Xhi,$T3);		#
+	&movdqa		($Xi,$T3);
+	&movdqa		($T1,$T3);
+	&psrld		($Xi,1);		#
+	&psrld		($T1,2);
+	&psrld		($T3,7);		#
+	&pxor		($Xi,$T1);
+	&pxor		($Xhi,$T2);
+	&pxor		($Xi,$T3);		#
+	&pxor		($Xi,$Xhi);		#
+}
+
+&function_begin_B("gcm_init_clmul");
+	&mov		($Htbl,&wparam(0));
+	&mov		($Xip,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Hkey,&QWP(0,$Xip));
+	&pshufd		($Hkey,$Hkey,0b01001110);# dword swap
+
+	# calculate H^2
+	&movdqa		($Xi,$Hkey);
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);
+	&reduction_alg5	($Xhi,$Xi);
+
+	&movdqu		(&QWP(0,$Htbl),$Hkey);	# save H
+	&movdqu		(&QWP(16,$Htbl),$Xi);	# save H^2
+
+	&ret		();
+&function_end_B("gcm_init_clmul");
+
+&function_begin_B("gcm_gmult_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($Xn,&QWP(0,$const));
+	&movdqu		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$Xn);
+
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);
+	&reduction_alg5	($Xhi,$Xi);
+
+	&pshufb		($Xi,$Xn);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+
+	&ret	();
+&function_end_B("gcm_gmult_clmul");
+
+&function_begin("gcm_ghash_clmul");
+	&mov		($Xip,&wparam(0));
+	&mov		($Htbl,&wparam(1));
+	&mov		($inp,&wparam(2));
+	&mov		($len,&wparam(3));
+
+	&call		(&label("pic"));
+&set_label("pic");
+	&blindpop	($const);
+	&lea		($const,&DWP(&label("bswap")."-".&label("pic"),$const));
+
+	&movdqu		($Xi,&QWP(0,$Xip));
+	&movdqa		($T3,&QWP(0,$const));
+	&movdqu		($Hkey,&QWP(0,$Htbl));
+	&pshufb		($Xi,$T3);
+
+	&sub		($len,0x10);
+	&jz		(&label("odd_tail"));
+
+	#######
+	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+	#	[(H*Ii+1) + (H*Xi+1)] mod P =
+	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
+	#
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pshufb		($T1,$T3);
+	&pshufb		($Xn,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T3	($Xhn,$Xn,$Hkey);	# H*Ii+1
+	&movdqu		($Hkey,&QWP(16,$Htbl));	# load H^2
+
+	&sub		($len,0x20);
+	&lea		($inp,&DWP(32,$inp));	# i+=2
+	&jbe		(&label("even_tail"));
+
+&set_label("mod_loop");
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+	&movdqu		($Hkey,&QWP(0,$Htbl));	# load H
+
+	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&pxor		($Xhi,$Xhn);
+
+	&reduction_alg5	($Xhi,$Xi);
+
+	#######
+	&movdqa		($T3,&QWP(0,$const));
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&movdqu		($Xn,&QWP(16,$inp));	# Ii+1
+	&pshufb		($T1,$T3);
+	&pshufb		($Xn,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T3	($Xhn,$Xn,$Hkey);	# H*Ii+1
+	&movdqu		($Hkey,&QWP(16,$Htbl));	# load H^2
+
+	&sub		($len,0x20);
+	&lea		($inp,&DWP(32,$inp));
+	&ja		(&label("mod_loop"));
+
+&set_label("even_tail");
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H^2*(Ii+Xi)
+
+	&pxor		($Xi,$Xn);		# (H*Ii+1) + H^2*(Ii+Xi)
+	&pxor		($Xhi,$Xhn);
+
+	&reduction_alg5	($Xhi,$Xi);
+
+	&movdqa		($T3,&QWP(0,$const));
+	&test		($len,$len);
+	&jnz		(&label("done"));
+
+	&movdqu		($Hkey,&QWP(0,$Htbl));	# load H
+&set_label("odd_tail");
+	&movdqu		($T1,&QWP(0,$inp));	# Ii
+	&pshufb		($T1,$T3);
+	&pxor		($Xi,$T1);		# Ii+Xi
+
+	&clmul64x64_T3	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
+	&reduction_alg5	($Xhi,$Xi);
+
+	&movdqa		($T3,&QWP(0,$const));
+&set_label("done");
+	&pshufb		($Xi,$T3);
+	&movdqu		(&QWP(0,$Xip),$Xi);
+&function_end("gcm_ghash_clmul");
+
+}
+
+&set_label("bswap",64);
+	&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+	&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2);	# 0x1c2_polynomial
+}}	# $sse2
+
+&set_label("rem_4bit",64);
+	&data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S);
+	&data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S);
+	&data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S);
+	&data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S);
+&set_label("rem_8bit",64);
+	&data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E);
+	&data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E);
+	&data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E);
+	&data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E);
+	&data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E);
+	&data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E);
+	&data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E);
+	&data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E);
+	&data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE);
+	&data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE);
+	&data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE);
+	&data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE);
+	&data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E);
+	&data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E);
+	&data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE);
+	&data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE);
+	&data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E);
+	&data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E);
+	&data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E);
+	&data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E);
+	&data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E);
+	&data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E);
+	&data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E);
+	&data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E);
+	&data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE);
+	&data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE);
+	&data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE);
+	&data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE);
+	&data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E);
+	&data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E);
+	&data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE);
+	&data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE);
+}}}	# !$x86only
+
+&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
+&asm_finish();
+
+# A question was risen about choice of vanilla MMX. Or rather why wasn't
+# SSE2 chosen instead? In addition to the fact that MMX runs on legacy
+# CPUs such as PIII, "4-bit" MMX version was observed to provide better
+# performance than *corresponding* SSE2 one even on contemporary CPUs.
+# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2
+# implementation featuring full range of lookup-table sizes, but with
+# per-invocation lookup table setup. Latter means that table size is
+# chosen depending on how much data is to be hashed in every given call,
+# more data - larger table. Best reported result for Core2 is ~4 cycles
+# per processed byte out of 64KB block. This number accounts even for
+# 64KB table setup overhead. As discussed in gcm128.c we choose to be
+# more conservative in respect to lookup table sizes, but how do the
+# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
+# on same platform. As also discussed in gcm128.c, next in line "8-bit
+# Shoup's" or "4KB" method should deliver twice the performance of
+# "256B" one, in other words not worse than ~6 cycles per byte. It
+# should be also be noted that in SSE2 case improvement can be "super-
+# linear," i.e. more than twice, mostly because >>8 maps to single
+# instruction on SSE2 register. This is unlike "4-bit" case when >>4
+# maps to same amount of instructions in both MMX and SSE2 cases.
+# Bottom line is that switch to SSE2 is considered to be justifiable
+# only in case we choose to implement "8-bit" method...
diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
new file mode 100644
index 0000000000..a5ae180882
--- /dev/null
+++ b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl
@@ -0,0 +1,805 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that
+# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
+# function features so called "528B" variant utilizing additional
+# 256+16 bytes of per-key storage [+512 bytes shared table].
+# Performance results are for this streamed GHASH subroutine and are
+# expressed in cycles per processed byte, less is better:
+#
+#		gcc 3.4.x(*)	assembler
+#
+# P4		28.6		14.0		+100%
+# Opteron	19.3		7.7		+150%
+# Core2		17.8		8.1(**)		+120%
+#
+# (*)	comparison is not completely fair, because C results are
+#	for vanilla "256B" implementation, while assembler results
+#	are for "528B";-)
+# (**)	it's mystery [to me] why Core2 result is not same as for
+#	Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse <dwmw2@infradead.org> for
+# providing access to a Westmere-based system on behalf of Intel
+# Open Source Technology Centre.
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# common register layout
+$nlo="%rax";
+$nhi="%rbx";
+$Zlo="%r8";
+$Zhi="%r9";
+$tmp="%r10";
+$rem_4bit = "%r11";
+
+$Xi="%rdi";
+$Htbl="%rsi";
+
+# per-function register layout
+$cnt="%rcx";
+$rem="%rdx";
+
+sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/	or
+			$r =~ s/%[er]([sd]i)/%\1l/	or
+			$r =~ s/%[er](bp)/%\1l/		or
+			$r =~ s/%(r[0-9]+)[d]?/%\1b/;   $r; }
+
+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+{ my $N;
+  sub loop() {
+  my $inp = shift;
+
+	$N++;
+$code.=<<___;
+	xor	$nlo,$nlo
+	xor	$nhi,$nhi
+	mov	`&LB("$Zlo")`,`&LB("$nlo")`
+	mov	`&LB("$Zlo")`,`&LB("$nhi")`
+	shl	\$4,`&LB("$nlo")`
+	mov	\$14,$cnt
+	mov	8($Htbl,$nlo),$Zlo
+	mov	($Htbl,$nlo),$Zhi
+	and	\$0xf0,`&LB("$nhi")`
+	mov	$Zlo,$rem
+	jmp	.Loop$N
+
+.align	16
+.Loop$N:
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	mov	($inp,$cnt),`&LB("$nlo")`
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nhi),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nhi),$Zhi
+	mov	`&LB("$nlo")`,`&LB("$nhi")`
+	xor	($rem_4bit,$rem,8),$Zhi
+	mov	$Zlo,$rem
+	shl	\$4,`&LB("$nlo")`
+	xor	$tmp,$Zlo
+	dec	$cnt
+	js	.Lbreak$N
+
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nlo),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nlo),$Zhi
+	and	\$0xf0,`&LB("$nhi")`
+	xor	($rem_4bit,$rem,8),$Zhi
+	mov	$Zlo,$rem
+	xor	$tmp,$Zlo
+	jmp	.Loop$N
+
+.align	16
+.Lbreak$N:
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nlo),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nlo),$Zhi
+	and	\$0xf0,`&LB("$nhi")`
+	xor	($rem_4bit,$rem,8),$Zhi
+	mov	$Zlo,$rem
+	xor	$tmp,$Zlo
+
+	shr	\$4,$Zlo
+	and	\$0xf,$rem
+	mov	$Zhi,$tmp
+	shr	\$4,$Zhi
+	xor	8($Htbl,$nhi),$Zlo
+	shl	\$60,$tmp
+	xor	($Htbl,$nhi),$Zhi
+	xor	$tmp,$Zlo
+	xor	($rem_4bit,$rem,8),$Zhi
+
+	bswap	$Zlo
+	bswap	$Zhi
+___
+}}
+
+$code=<<___;
+.text
+
+.globl	gcm_gmult_4bit
+.type	gcm_gmult_4bit,\@function,2
+.align	16
+gcm_gmult_4bit:
+	push	%rbx
+	push	%rbp		# %rbp and %r12 are pushed exclusively in
+	push	%r12		# order to reuse Win64 exception handler...
+.Lgmult_prologue:
+
+	movzb	15($Xi),$Zlo
+	lea	.Lrem_4bit(%rip),$rem_4bit
+___
+	&loop	($Xi);
+$code.=<<___;
+	mov	$Zlo,8($Xi)
+	mov	$Zhi,($Xi)
+
+	mov	16(%rsp),%rbx
+	lea	24(%rsp),%rsp
+.Lgmult_epilogue:
+	ret
+.size	gcm_gmult_4bit,.-gcm_gmult_4bit
+___
+
+# per-function register layout
+$inp="%rdx";
+$len="%rcx";
+$rem_8bit=$rem_4bit;
+
+$code.=<<___;
+.globl	gcm_ghash_4bit
+.type	gcm_ghash_4bit,\@function,4
+.align	16
+gcm_ghash_4bit:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	sub	\$280,%rsp
+.Lghash_prologue:
+	mov	$inp,%r14		# reassign couple of args
+	mov	$len,%r15
+___
+{ my $inp="%r14";
+  my $dat="%edx";
+  my $len="%r15";
+  my @nhi=("%ebx","%ecx");
+  my @rem=("%r12","%r13");
+  my $Hshr4="%rbp";
+
+	&sub	($Htbl,-128);		# size optimization
+	&lea	($Hshr4,"16+128(%rsp)");
+	{ my @lo =($nlo,$nhi);
+          my @hi =($Zlo,$Zhi);
+
+	  &xor	($dat,$dat);
+	  for ($i=0,$j=-2;$i<18;$i++,$j++) {
+	    &mov	("$j(%rsp)",&LB($dat))		if ($i>1);
+	    &or		($lo[0],$tmp)			if ($i>1);
+	    &mov	(&LB($dat),&LB($lo[1]))		if ($i>0 && $i<17);
+	    &shr	($lo[1],4)			if ($i>0 && $i<17);
+	    &mov	($tmp,$hi[1])			if ($i>0 && $i<17);
+	    &shr	($hi[1],4)			if ($i>0 && $i<17);
+	    &mov	("8*$j($Hshr4)",$hi[0])		if ($i>1);
+	    &mov	($hi[0],"16*$i+0-128($Htbl)")	if ($i<16);
+	    &shl	(&LB($dat),4)			if ($i>0 && $i<17);
+	    &mov	("8*$j-128($Hshr4)",$lo[0])	if ($i>1);
+	    &mov	($lo[0],"16*$i+8-128($Htbl)")	if ($i<16);
+	    &shl	($tmp,60)			if ($i>0 && $i<17);
+
+	    push	(@lo,shift(@lo));
+	    push	(@hi,shift(@hi));
+	  }
+	}
+	&add	($Htbl,-128);
+	&mov	($Zlo,"8($Xi)");
+	&mov	($Zhi,"0($Xi)");
+	&add	($len,$inp);		# pointer to the end of data
+	&lea	($rem_8bit,".Lrem_8bit(%rip)");
+	&jmp	(".Louter_loop");
+
+$code.=".align	16\n.Louter_loop:\n";
+	&xor	($Zhi,"($inp)");
+	&mov	("%rdx","8($inp)");
+	&lea	($inp,"16($inp)");
+	&xor	("%rdx",$Zlo);
+	&mov	("($Xi)",$Zhi);
+	&mov	("8($Xi)","%rdx");
+	&shr	("%rdx",32);
+
+	&xor	($nlo,$nlo);
+	&rol	($dat,8);
+	&mov	(&LB($nlo),&LB($dat));
+	&movz	($nhi[0],&LB($dat));
+	&shl	(&LB($nlo),4);
+	&shr	($nhi[0],4);
+
+	for ($j=11,$i=0;$i<15;$i++) {
+	    &rol	($dat,8);
+	    &xor	($Zlo,"8($Htbl,$nlo)")			if ($i>0);
+	    &xor	($Zhi,"($Htbl,$nlo)")			if ($i>0);
+	    &mov	($Zlo,"8($Htbl,$nlo)")			if ($i==0);
+	    &mov	($Zhi,"($Htbl,$nlo)")			if ($i==0);
+
+	    &mov	(&LB($nlo),&LB($dat));
+	    &xor	($Zlo,$tmp)				if ($i>0);
+	    &movzw	($rem[1],"($rem_8bit,$rem[1],2)")	if ($i>0);
+
+	    &movz	($nhi[1],&LB($dat));
+	    &shl	(&LB($nlo),4);
+	    &movzb	($rem[0],"(%rsp,$nhi[0])");
+
+	    &shr	($nhi[1],4)				if ($i<14);
+	    &and	($nhi[1],0xf0)				if ($i==14);
+	    &shl	($rem[1],48)				if ($i>0);
+	    &xor	($rem[0],$Zlo);
+
+	    &mov	($tmp,$Zhi);
+	    &xor	($Zhi,$rem[1])				if ($i>0);
+	    &shr	($Zlo,8);
+
+	    &movz	($rem[0],&LB($rem[0]));
+	    &mov	($dat,"$j($Xi)")			if (--$j%4==0);
+	    &shr	($Zhi,8);
+
+	    &xor	($Zlo,"-128($Hshr4,$nhi[0],8)");
+	    &shl	($tmp,56);
+	    &xor	($Zhi,"($Hshr4,$nhi[0],8)");
+
+	    unshift	(@nhi,pop(@nhi));		# "rotate" registers
+	    unshift	(@rem,pop(@rem));
+	}
+	&movzw	($rem[1],"($rem_8bit,$rem[1],2)");
+	&xor	($Zlo,"8($Htbl,$nlo)");
+	&xor	($Zhi,"($Htbl,$nlo)");
+
+	&shl	($rem[1],48);
+	&xor	($Zlo,$tmp);
+
+	&xor	($Zhi,$rem[1]);
+	&movz	($rem[0],&LB($Zlo));
+	&shr	($Zlo,4);
+
+	&mov	($tmp,$Zhi);
+	&shl	(&LB($rem[0]),4);
+	&shr	($Zhi,4);
+
+	&xor	($Zlo,"8($Htbl,$nhi[0])");
+	&movzw	($rem[0],"($rem_8bit,$rem[0],2)");
+	&shl	($tmp,60);
+
+	&xor	($Zhi,"($Htbl,$nhi[0])");
+	&xor	($Zlo,$tmp);
+	&shl	($rem[0],48);
+
+	&bswap	($Zlo);
+	&xor	($Zhi,$rem[0]);
+
+	&bswap	($Zhi);
+	&cmp	($inp,$len);
+	&jb	(".Louter_loop");
+}
+$code.=<<___;
+	mov	$Zlo,8($Xi)
+	mov	$Zhi,($Xi)
+
+	lea	280(%rsp),%rsi
+	mov	0(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lghash_epilogue:
+	ret
+.size	gcm_ghash_4bit,.-gcm_ghash_4bit
+___
+
+######################################################################
+# PCLMULQDQ version.
+
+@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
+		("%rdi","%rsi","%rdx","%rcx");	# Unix order
+
+($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
+($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
+
+sub clmul64x64_T2 {	# minimal register pressure
+my ($Xhi,$Xi,$Hkey,$modulo)=@_;
+
+$code.=<<___ if (!defined($modulo));
+	movdqa		$Xi,$Xhi		#
+	pshufd		\$0b01001110,$Xi,$T1
+	pshufd		\$0b01001110,$Hkey,$T2
+	pxor		$Xi,$T1			#
+	pxor		$Hkey,$T2
+___
+$code.=<<___;
+	pclmulqdq	\$0x00,$Hkey,$Xi	#######
+	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
+	pclmulqdq	\$0x00,$T2,$T1		#######
+	pxor		$Xi,$T1			#
+	pxor		$Xhi,$T1		#
+
+	movdqa		$T1,$T2			#
+	psrldq		\$8,$T1
+	pslldq		\$8,$T2			#
+	pxor		$T1,$Xhi
+	pxor		$T2,$Xi			#
+___
+}
+
+sub reduction_alg9 {	# 17/13 times faster than Intel version
+my ($Xhi,$Xi) = @_;
+
+$code.=<<___;
+	# 1st phase
+	movdqa		$Xi,$T1			#
+	psllq		\$1,$Xi
+	pxor		$T1,$Xi			#
+	psllq		\$5,$Xi			#
+	pxor		$T1,$Xi			#
+	psllq		\$57,$Xi		#
+	movdqa		$Xi,$T2			#
+	pslldq		\$8,$Xi
+	psrldq		\$8,$T2			#	
+	pxor		$T1,$Xi
+	pxor		$T2,$Xhi		#
+
+	# 2nd phase
+	movdqa		$Xi,$T2
+	psrlq		\$5,$Xi
+	pxor		$T2,$Xi			#
+	psrlq		\$1,$Xi			#
+	pxor		$T2,$Xi			#
+	pxor		$Xhi,$T2
+	psrlq		\$1,$Xi			#
+	pxor		$T2,$Xi			#
+___
+}
+
+{ my ($Htbl,$Xip)=@_4args;
+
+$code.=<<___;
+.globl	gcm_init_clmul
+.type	gcm_init_clmul,\@abi-omnipotent
+.align	16
+gcm_init_clmul:
+	movdqu		($Xip),$Hkey
+	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap
+
+	# <<1 twist
+	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
+	movdqa		$Hkey,$T1
+	psllq		\$1,$Hkey
+	pxor		$T3,$T3			#
+	psrlq		\$63,$T1
+	pcmpgtd		$T2,$T3			# broadcast carry bit
+	pslldq		\$8,$T1
+	por		$T1,$Hkey		# H<<=1
+
+	# magic reduction
+	pand		.L0x1c2_polynomial(%rip),$T3
+	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial
+
+	# calculate H^2
+	movdqa		$Hkey,$Xi
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	movdqu		$Hkey,($Htbl)		# save H
+	movdqu		$Xi,16($Htbl)		# save H^2
+	ret
+.size	gcm_init_clmul,.-gcm_init_clmul
+___
+}
+
+{ my ($Xip,$Htbl)=@_4args;
+
+$code.=<<___;
+.globl	gcm_gmult_clmul
+.type	gcm_gmult_clmul,\@abi-omnipotent
+.align	16
+gcm_gmult_clmul:
+	movdqu		($Xip),$Xi
+	movdqa		.Lbswap_mask(%rip),$T3
+	movdqu		($Htbl),$Hkey
+	pshufb		$T3,$Xi
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	pshufb		$T3,$Xi
+	movdqu		$Xi,($Xip)
+	ret
+.size	gcm_gmult_clmul,.-gcm_gmult_clmul
+___
+}
+
+{ my ($Xip,$Htbl,$inp,$len)=@_4args;
+  my $Xn="%xmm6";
+  my $Xhn="%xmm7";
+  my $Hkey2="%xmm8";
+  my $T1n="%xmm9";
+  my $T2n="%xmm10";
+
+$code.=<<___;
+.globl	gcm_ghash_clmul
+.type	gcm_ghash_clmul,\@abi-omnipotent
+.align	16
+gcm_ghash_clmul:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_gcm_ghash_clmul:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x83,0xec,0x58		#sub	\$0x58,%rsp
+	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
+	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
+	.byte	0x44,0x0f,0x29,0x44,0x24,0x20	#movaps	%xmm8,0x20(%rsp)
+	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30	#movaps	%xmm9,0x30(%rsp)
+	.byte	0x44,0x0f,0x29,0x54,0x24,0x40	#movaps	%xmm10,0x40(%rsp)
+___
+$code.=<<___;
+	movdqa		.Lbswap_mask(%rip),$T3
+
+	movdqu		($Xip),$Xi
+	movdqu		($Htbl),$Hkey
+	pshufb		$T3,$Xi
+
+	sub		\$0x10,$len
+	jz		.Lodd_tail
+
+	movdqu		16($Htbl),$Hkey2
+	#######
+	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
+	#	[(H*Ii+1) + (H*Xi+1)] mod P =
+	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
+	#
+	movdqu		($inp),$T1		# Ii
+	movdqu		16($inp),$Xn		# Ii+1
+	pshufb		$T3,$T1
+	pshufb		$T3,$Xn
+	pxor		$T1,$Xi			# Ii+Xi
+___
+	&clmul64x64_T2	($Xhn,$Xn,$Hkey);	# H*Ii+1
+$code.=<<___;
+	movdqa		$Xi,$Xhi		#
+	pshufd		\$0b01001110,$Xi,$T1
+	pshufd		\$0b01001110,$Hkey2,$T2
+	pxor		$Xi,$T1			#
+	pxor		$Hkey2,$T2
+
+	lea		32($inp),$inp		# i+=2
+	sub		\$0x20,$len
+	jbe		.Leven_tail
+
+.Lmod_loop:
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
+$code.=<<___;
+	movdqu		($inp),$T1		# Ii
+	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
+	pxor		$Xhn,$Xhi
+
+	movdqu		16($inp),$Xn		# Ii+1
+	pshufb		$T3,$T1
+	pshufb		$T3,$Xn
+
+	movdqa		$Xn,$Xhn		#
+	pshufd		\$0b01001110,$Xn,$T1n
+	pshufd		\$0b01001110,$Hkey,$T2n
+	pxor		$Xn,$T1n		#
+	pxor		$Hkey,$T2n
+	 pxor		$T1,$Xhi		# "Ii+Xi", consume early
+
+	  movdqa	$Xi,$T1			# 1st phase
+	  psllq		\$1,$Xi
+	  pxor		$T1,$Xi			#
+	  psllq		\$5,$Xi			#
+	  pxor		$T1,$Xi			#
+	pclmulqdq	\$0x00,$Hkey,$Xn	#######
+	  psllq		\$57,$Xi		#
+	  movdqa	$Xi,$T2			#
+	  pslldq	\$8,$Xi
+	  psrldq	\$8,$T2			#	
+	  pxor		$T1,$Xi
+	  pxor		$T2,$Xhi		#
+
+	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
+	  movdqa	$Xi,$T2			# 2nd phase
+	  psrlq		\$5,$Xi
+	  pxor		$T2,$Xi			#
+	  psrlq		\$1,$Xi			#
+	  pxor		$T2,$Xi			#
+	  pxor		$Xhi,$T2
+	  psrlq		\$1,$Xi			#
+	  pxor		$T2,$Xi			#
+
+	pclmulqdq	\$0x00,$T2n,$T1n	#######
+	 movdqa		$Xi,$Xhi		#
+	 pshufd		\$0b01001110,$Xi,$T1
+	 pshufd		\$0b01001110,$Hkey2,$T2
+	 pxor		$Xi,$T1			#
+	 pxor		$Hkey2,$T2
+
+	pxor		$Xn,$T1n		#
+	pxor		$Xhn,$T1n		#
+	movdqa		$T1n,$T2n		#
+	psrldq		\$8,$T1n
+	pslldq		\$8,$T2n		#
+	pxor		$T1n,$Xhn
+	pxor		$T2n,$Xn		#
+
+	lea		32($inp),$inp
+	sub		\$0x20,$len
+	ja		.Lmod_loop
+
+.Leven_tail:
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey2,1);	# H^2*(Ii+Xi)
+$code.=<<___;
+	pxor		$Xn,$Xi			# (H*Ii+1) + H^2*(Ii+Xi)
+	pxor		$Xhn,$Xhi
+___
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+	test		$len,$len
+	jnz		.Ldone
+
+.Lodd_tail:
+	movdqu		($inp),$T1		# Ii
+	pshufb		$T3,$T1
+	pxor		$T1,$Xi			# Ii+Xi
+___
+	&clmul64x64_T2	($Xhi,$Xi,$Hkey);	# H*(Ii+Xi)
+	&reduction_alg9	($Xhi,$Xi);
+$code.=<<___;
+.Ldone:
+	pshufb		$T3,$Xi
+	movdqu		$Xi,($Xip)
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	movaps	0x40(%rsp),%xmm10
+	add	\$0x58,%rsp
+___
+$code.=<<___;
+	ret
+.LSEH_end_gcm_ghash_clmul:
+.size	gcm_ghash_clmul,.-gcm_ghash_clmul
+___
+}
+
+$code.=<<___;
+.align	64
+.Lbswap_mask:
+	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.align	64
+.type	.Lrem_4bit,\@object
+.Lrem_4bit:
+	.long	0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
+	.long	0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
+	.long	0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
+	.long	0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
+.type	.Lrem_8bit,\@object
+.Lrem_8bit:
+	.value	0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+	.value	0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+	.value	0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+	.value	0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+	.value	0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+	.value	0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+	.value	0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+	.value	0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+	.value	0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+	.value	0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+	.value	0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+	.value	0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+	.value	0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+	.value	0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+	.value	0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+	.value	0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+	.value	0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+	.value	0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+	.value	0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+	.value	0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+	.value	0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+	.value	0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+	.value	0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+	.value	0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+	.value	0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+	.value	0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+	.value	0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+	.value	0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+	.value	0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+	.value	0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+	.value	0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+	.value	0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+
+	lea	24(%rax),%rax		# adjust "rsp"
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_gcm_gmult_4bit
+	.rva	.LSEH_end_gcm_gmult_4bit
+	.rva	.LSEH_info_gcm_gmult_4bit
+
+	.rva	.LSEH_begin_gcm_ghash_4bit
+	.rva	.LSEH_end_gcm_ghash_4bit
+	.rva	.LSEH_info_gcm_ghash_4bit
+
+	.rva	.LSEH_begin_gcm_ghash_clmul
+	.rva	.LSEH_end_gcm_ghash_clmul
+	.rva	.LSEH_info_gcm_ghash_clmul
+
+.section	.xdata
+.align	8
+.LSEH_info_gcm_gmult_4bit:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
+.LSEH_info_gcm_ghash_4bit:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
+.LSEH_info_gcm_ghash_clmul:
+	.byte	0x01,0x1f,0x0b,0x00
+	.byte	0x1f,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
+	.byte	0x19,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
+	.byte	0x13,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
+	.byte	0x0d,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
+	.byte	0x08,0x68,0x00,0x00	#movaps (rsp),xmm6
+	.byte	0x04,0xa2,0x00,0x00	#sub	rsp,0x58
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/src/lib/libcrypto/modes/cbc128.c b/src/lib/libcrypto/modes/cbc128.c
index 8f8bd563b9..3d3782cbe1 100644
--- a/src/lib/libcrypto/modes/cbc128.c
+++ b/src/lib/libcrypto/modes/cbc128.c
@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
@@ -58,12 +59,7 @@
 #endif
 #include <assert.h>
 
-#define STRICT_ALIGNMENT 1
-#if defined(__i386) || defined(__i386__) || \
-    defined(__x86_64) || defined(__x86_64__) || \
-    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
-    defined(__s390__) || defined(__s390x__)
-#  undef STRICT_ALIGNMENT
+#ifndef STRICT_ALIGNMENT
 #  define STRICT_ALIGNMENT 0
 #endif
 
diff --git a/src/lib/libcrypto/modes/ccm128.c b/src/lib/libcrypto/modes/ccm128.c
new file mode 100644
index 0000000000..c9b35e5b35
--- /dev/null
+++ b/src/lib/libcrypto/modes/ccm128.c
@@ -0,0 +1,441 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+#  define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+/* First you setup M and L parameters and pass the key schedule.
+ * This is called once per session setup... */
+void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
+	unsigned int M,unsigned int L,void *key,block128_f block)
+{
+	memset(ctx->nonce.c,0,sizeof(ctx->nonce.c));
+	ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3;
+	ctx->blocks = 0;
+	ctx->block = block;
+	ctx->key = key;
+}
+
+/* !!! Following interfaces are to be called *once* per packet !!! */
+
+/* Then you setup per-message nonce and pass the length of the message */
+int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
+	const unsigned char *nonce,size_t nlen,size_t mlen)
+{
+	unsigned int L = ctx->nonce.c[0]&7;	/* the L parameter */
+
+	if (nlen<(14-L)) return -1;		/* nonce is too short */
+
+	if (sizeof(mlen)==8 && L>=3) {
+		ctx->nonce.c[8]  = (u8)(mlen>>(56%(sizeof(mlen)*8)));
+		ctx->nonce.c[9]  = (u8)(mlen>>(48%(sizeof(mlen)*8)));
+		ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8)));
+		ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8)));
+	}
+	else
+		*(u32*)(&ctx->nonce.c[8]) = 0;
+
+	ctx->nonce.c[12] = (u8)(mlen>>24);
+	ctx->nonce.c[13] = (u8)(mlen>>16);
+	ctx->nonce.c[14] = (u8)(mlen>>8);
+	ctx->nonce.c[15] = (u8)mlen;
+
+	ctx->nonce.c[0] &= ~0x40;	/* clear Adata flag */
+	memcpy(&ctx->nonce.c[1],nonce,14-L);
+
+	return 0;
+}
+
+/* Then you pass additional authentication data, this is optional */
+void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
+	const unsigned char *aad,size_t alen)
+{	unsigned int i;
+	block128_f block = ctx->block;
+
+	if (alen==0) return;
+
+	ctx->nonce.c[0] |= 0x40;	/* set Adata flag */
+	(*block)(ctx->nonce.c,ctx->cmac.c,ctx->key),
+	ctx->blocks++;
+
+	if (alen<(0x10000-0x100)) {
+		ctx->cmac.c[0] ^= (u8)(alen>>8);
+		ctx->cmac.c[1] ^= (u8)alen;
+		i=2;
+	}
+	else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) {
+		ctx->cmac.c[0] ^= 0xFF;
+		ctx->cmac.c[1] ^= 0xFF;
+		ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8)));
+		ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8)));
+		ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8)));
+		ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8)));
+		ctx->cmac.c[6] ^= (u8)(alen>>24);
+		ctx->cmac.c[7] ^= (u8)(alen>>16);
+		ctx->cmac.c[8] ^= (u8)(alen>>8);
+		ctx->cmac.c[9] ^= (u8)alen;
+		i=10;
+	}
+	else {
+		ctx->cmac.c[0] ^= 0xFF;
+		ctx->cmac.c[1] ^= 0xFE;
+		ctx->cmac.c[2] ^= (u8)(alen>>24);
+		ctx->cmac.c[3] ^= (u8)(alen>>16);
+		ctx->cmac.c[4] ^= (u8)(alen>>8);
+		ctx->cmac.c[5] ^= (u8)alen;
+		i=6;
+	}
+
+	do {
+		for(;i<16 && alen;++i,++aad,--alen)
+			ctx->cmac.c[i] ^= *aad;
+		(*block)(ctx->cmac.c,ctx->cmac.c,ctx->key),
+		ctx->blocks++;
+		i=0;
+	} while (alen);
+}
+
+/* Finally you encrypt or decrypt the message */
+
+/* counter part of nonce may not be larger than L*8 bits,
+ * L is not larger than 8, therefore 64-bit counter... */
+static void ctr64_inc(unsigned char *counter) {
+	unsigned int n=8;
+	u8  c;
+
+	counter += 8;
+	do {
+		--n;
+		c = counter[n];
+		++c;
+		counter[n] = c;
+		if (c) return;
+	} while (n);
+}
+
+int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out,
+	size_t len)
+{
+	size_t		n;
+	unsigned int	i,L;
+	unsigned char	flags0	= ctx->nonce.c[0];
+	block128_f	block	= ctx->block;
+	void *		key	= ctx->key;
+	union { u64 u[2]; u8 c[16]; } scratch;
+
+	if (!(flags0&0x40))
+		(*block)(ctx->nonce.c,ctx->cmac.c,key),
+		ctx->blocks++;
+
+	ctx->nonce.c[0] = L = flags0&7;
+	for (n=0,i=15-L;i<15;++i) {
+		n |= ctx->nonce.c[i];
+		ctx->nonce.c[i]=0;
+		n <<= 8;
+	}
+	n |= ctx->nonce.c[15];	/* reconstructed length */
+	ctx->nonce.c[15]=1;
+
+	if (n!=len) return -1;	/* length mismatch */
+
+	ctx->blocks += ((len+15)>>3)|1;
+	if (ctx->blocks > (U64(1)<<61))	return -2; /* too much data */
+
+	while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+		union { u64 u[2]; u8 c[16]; } temp;
+
+		memcpy (temp.c,inp,16);
+		ctx->cmac.u[0] ^= temp.u[0];
+		ctx->cmac.u[1] ^= temp.u[1];
+#else
+		ctx->cmac.u[0] ^= ((u64*)inp)[0];
+		ctx->cmac.u[1] ^= ((u64*)inp)[1];
+#endif
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+		(*block)(ctx->nonce.c,scratch.c,key);
+		ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+		temp.u[0] ^= scratch.u[0];
+		temp.u[1] ^= scratch.u[1];
+		memcpy(out,temp.c,16);
+#else
+		((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0];
+		((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1];
+#endif
+		inp += 16;
+		out += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+		(*block)(ctx->nonce.c,scratch.c,key);
+		for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
+	}
+
+	for (i=15-L;i<16;++i)
+		ctx->nonce.c[i]=0;
+
+	(*block)(ctx->nonce.c,scratch.c,key);
+	ctx->cmac.u[0] ^= scratch.u[0];
+	ctx->cmac.u[1] ^= scratch.u[1];
+
+	ctx->nonce.c[0] = flags0;
+
+	return 0;
+}
+
+int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out,
+	size_t len)
+{
+	size_t		n;
+	unsigned int	i,L;
+	unsigned char	flags0	= ctx->nonce.c[0];
+	block128_f	block	= ctx->block;
+	void *		key	= ctx->key;
+	union { u64 u[2]; u8 c[16]; } scratch;
+
+	if (!(flags0&0x40))
+		(*block)(ctx->nonce.c,ctx->cmac.c,key);
+
+	ctx->nonce.c[0] = L = flags0&7;
+	for (n=0,i=15-L;i<15;++i) {
+		n |= ctx->nonce.c[i];
+		ctx->nonce.c[i]=0;
+		n <<= 8;
+	}
+	n |= ctx->nonce.c[15];	/* reconstructed length */
+	ctx->nonce.c[15]=1;
+
+	if (n!=len) return -1;
+
+	while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+		union { u64 u[2]; u8 c[16]; } temp;
+#endif
+		(*block)(ctx->nonce.c,scratch.c,key);
+		ctr64_inc(ctx->nonce.c);
+#if defined(STRICT_ALIGNMENT)
+		memcpy (temp.c,inp,16);
+		ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]);
+		ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]);
+		memcpy (out,scratch.c,16);
+#else
+		ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]);
+		ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]);
+#endif
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+
+		inp += 16;
+		out += 16;
+		len -= 16;
+	}
+
+	if (len) {
+		(*block)(ctx->nonce.c,scratch.c,key);
+		for (i=0; i<len; ++i)
+			ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+	}
+
+	for (i=15-L;i<16;++i)
+		ctx->nonce.c[i]=0;
+
+	(*block)(ctx->nonce.c,scratch.c,key);
+	ctx->cmac.u[0] ^= scratch.u[0];
+	ctx->cmac.u[1] ^= scratch.u[1];
+
+	ctx->nonce.c[0] = flags0;
+
+	return 0;
+}
+
+static void ctr64_add (unsigned char *counter,size_t inc)
+{	size_t n=8, val=0;
+
+	counter += 8;
+	do {
+		--n;
+		val += counter[n] + (inc&0xff);
+		counter[n] = (unsigned char)val;
+		val >>= 8;	/* carry bit */
+		inc >>= 8;
+	} while(n && (inc || val));
+}
+
+int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out,
+	size_t len,ccm128_f stream)
+{
+	size_t		n;
+	unsigned int	i,L;
+	unsigned char	flags0	= ctx->nonce.c[0];
+	block128_f	block	= ctx->block;
+	void *		key	= ctx->key;
+	union { u64 u[2]; u8 c[16]; } scratch;
+
+	if (!(flags0&0x40))
+		(*block)(ctx->nonce.c,ctx->cmac.c,key),
+		ctx->blocks++;
+
+	ctx->nonce.c[0] = L = flags0&7;
+	for (n=0,i=15-L;i<15;++i) {
+		n |= ctx->nonce.c[i];
+		ctx->nonce.c[i]=0;
+		n <<= 8;
+	}
+	n |= ctx->nonce.c[15];	/* reconstructed length */
+	ctx->nonce.c[15]=1;
+
+	if (n!=len) return -1;	/* length mismatch */
+
+	ctx->blocks += ((len+15)>>3)|1;
+	if (ctx->blocks > (U64(1)<<61))	return -2; /* too much data */
+
+	if ((n=len/16)) {
+		(*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
+		n   *= 16;
+		inp += n;
+		out += n;
+		len -= n;
+		if (len) ctr64_add(ctx->nonce.c,n/16);
+	}
+
+	if (len) {
+		for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+		(*block)(ctx->nonce.c,scratch.c,key);
+		for (i=0; i<len; ++i) out[i] = scratch.c[i]^inp[i];
+	}
+
+	for (i=15-L;i<16;++i)
+		ctx->nonce.c[i]=0;
+
+	(*block)(ctx->nonce.c,scratch.c,key);
+	ctx->cmac.u[0] ^= scratch.u[0];
+	ctx->cmac.u[1] ^= scratch.u[1];
+
+	ctx->nonce.c[0] = flags0;
+
+	return 0;
+}
+
+int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out,
+	size_t len,ccm128_f stream)
+{
+	size_t		n;
+	unsigned int	i,L;
+	unsigned char	flags0	= ctx->nonce.c[0];
+	block128_f	block	= ctx->block;
+	void *		key	= ctx->key;
+	union { u64 u[2]; u8 c[16]; } scratch;
+
+	if (!(flags0&0x40))
+		(*block)(ctx->nonce.c,ctx->cmac.c,key);
+
+	ctx->nonce.c[0] = L = flags0&7;
+	for (n=0,i=15-L;i<15;++i) {
+		n |= ctx->nonce.c[i];
+		ctx->nonce.c[i]=0;
+		n <<= 8;
+	}
+	n |= ctx->nonce.c[15];	/* reconstructed length */
+	ctx->nonce.c[15]=1;
+
+	if (n!=len) return -1;
+
+	if ((n=len/16)) {
+		(*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c);
+		n   *= 16;
+		inp += n;
+		out += n;
+		len -= n;
+		if (len) ctr64_add(ctx->nonce.c,n/16);
+	}
+
+	if (len) {
+		(*block)(ctx->nonce.c,scratch.c,key);
+		for (i=0; i<len; ++i)
+			ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);
+		(*block)(ctx->cmac.c,ctx->cmac.c,key);
+	}
+
+	for (i=15-L;i<16;++i)
+		ctx->nonce.c[i]=0;
+
+	(*block)(ctx->nonce.c,scratch.c,key);
+	ctx->cmac.u[0] ^= scratch.u[0];
+	ctx->cmac.u[1] ^= scratch.u[1];
+
+	ctx->nonce.c[0] = flags0;
+
+	return 0;
+}
+
+size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len)
+{	unsigned int M = (ctx->nonce.c[0]>>3)&7;	/* the M parameter */
+
+	M *= 2; M += 2;
+	if (len<M)	return 0;
+	memcpy(tag,ctx->cmac.c,M);
+	return M;
+}
diff --git a/src/lib/libcrypto/modes/cfb128.c b/src/lib/libcrypto/modes/cfb128.c
index e5938c6137..4e6f5d35e1 100644
--- a/src/lib/libcrypto/modes/cfb128.c
+++ b/src/lib/libcrypto/modes/cfb128.c
@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
@@ -58,14 +59,6 @@
 #endif
 #include <assert.h>
 
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
-    defined(__x86_64) || defined(__x86_64__) || \
-    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
-    defined(__s390__) || defined(__s390x__)
-#  undef STRICT_ALIGNMENT
-#endif
-
 /* The input and output encrypted as though 128bit cfb mode is being
  * used.  The extra state information to record how much of the
  * 128bit block we have used is contained in *num;
diff --git a/src/lib/libcrypto/modes/ctr128.c b/src/lib/libcrypto/modes/ctr128.c
index 932037f551..ee642c5863 100644
--- a/src/lib/libcrypto/modes/ctr128.c
+++ b/src/lib/libcrypto/modes/ctr128.c
@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
@@ -58,17 +59,6 @@
 #endif
 #include <assert.h>
 
-typedef unsigned int u32;
-typedef unsigned char u8;
-
-#define STRICT_ALIGNMENT
-#if defined(__i386)	|| defined(__i386__)	|| \
-    defined(__x86_64)	|| defined(__x86_64__)	|| \
-    defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64) || \
-    defined(__s390__)	|| defined(__s390x__)
-#  undef STRICT_ALIGNMENT
-#endif
-
 /* NOTE: the IV/counter CTR mode is big-endian.  The code itself
  * is endian-neutral. */
 
@@ -182,3 +172,81 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
 
 	*num=n;
 }
+
+/* increment upper 96 bits of 128-bit counter by 1 */
+static void ctr96_inc(unsigned char *counter) {
+	u32 n=12;
+	u8  c;
+
+	do {
+		--n;
+		c = counter[n];
+		++c;
+		counter[n] = c;
+		if (c) return;
+	} while (n);
+}
+
+void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], unsigned char ecount_buf[16],
+			unsigned int *num, ctr128_f func)
+{
+	unsigned int n,ctr32;
+
+	assert(in && out && key && ecount_buf && num);
+	assert(*num < 16);
+
+	n = *num;
+
+	while (n && len) {
+		*(out++) = *(in++) ^ ecount_buf[n];
+		--len;
+		n = (n+1) % 16;
+	}
+
+	ctr32 = GETU32(ivec+12);
+	while (len>=16) {
+		size_t blocks = len/16;
+		/*
+		 * 1<<28 is just a not-so-small yet not-so-large number...
+		 * Below condition is practically never met, but it has to
+		 * be checked for code correctness.
+		 */
+		if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28))
+			blocks = (1U<<28);
+		/*
+		 * As (*func) operates on 32-bit counter, caller
+		 * has to handle overflow. 'if' below detects the
+		 * overflow, which is then handled by limiting the
+		 * amount of blocks to the exact overflow point...
+		 */
+		ctr32 += (u32)blocks;
+		if (ctr32 < blocks) {
+			blocks -= ctr32;
+			ctr32   = 0;
+		}
+		(*func)(in,out,blocks,key,ivec);
+		/* (*ctr) does not update ivec, caller does: */
+		PUTU32(ivec+12,ctr32);
+		/* ... overflow was detected, propogate carry. */
+		if (ctr32 == 0)	ctr96_inc(ivec);
+		blocks *= 16;
+		len -= blocks;
+		out += blocks;
+		in  += blocks;
+	}
+	if (len) {
+		memset(ecount_buf,0,16);
+		(*func)(ecount_buf,ecount_buf,1,key,ivec);
+		++ctr32;
+		PUTU32(ivec+12,ctr32);
+		if (ctr32 == 0)	ctr96_inc(ivec);
+		while (len--) {
+			out[n] = in[n] ^ ecount_buf[n];
+			++n;
+		}
+	}
+
+	*num=n;
+}
diff --git a/src/lib/libcrypto/modes/cts128.c b/src/lib/libcrypto/modes/cts128.c
index e0430f9fdc..c0e1f3696c 100644
--- a/src/lib/libcrypto/modes/cts128.c
+++ b/src/lib/libcrypto/modes/cts128.c
@@ -5,7 +5,8 @@
  * forms are granted according to the OpenSSL license.
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
@@ -23,8 +24,9 @@
  * deviates from mentioned RFCs. Most notably it allows input to be
  * of block length and it doesn't flip the order of the last two
  * blocks. CTS is being discussed even in ECB context, but it's not
- * adopted for any known application. This implementation complies
- * with mentioned RFCs and [as such] extends CBC mode.
+ * adopted for any known application. This implementation provides
+ * two interfaces: one compliant with above mentioned RFCs and one
+ * compliant with the NIST proposal, both extending CBC mode.
  */
 
 size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
@@ -54,6 +56,34 @@ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out,
 	return len+residue;
 }
 
+size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], block128_f block)
+{	size_t residue, n;
+
+	assert (in && out && key && ivec);
+
+	if (len < 16) return 0;
+
+	residue=len%16;
+
+	len -= residue;
+
+	CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block);
+
+	if (residue==0)	return len;
+
+	in  += len;
+	out += len;
+
+	for (n=0; n<residue; ++n)
+		ivec[n] ^= in[n];
+	(*block)(ivec,ivec,key);
+	memcpy(out-16+residue,ivec,16);
+
+	return len+residue;
+}
+
 size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
 			size_t len, const void *key,
 			unsigned char ivec[16], cbc128_f cbc)
@@ -90,6 +120,41 @@ size_t CRYPTO_cts128_encrypt(const unsigned char *in, unsigned char *out,
 	return len+residue;
 }
 
+size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], cbc128_f cbc)
+{	size_t residue;
+	union { size_t align; unsigned char c[16]; } tmp;
+
+	assert (in && out && key && ivec);
+
+	if (len < 16) return 0;
+
+	residue=len%16;
+
+	len -= residue;
+
+	(*cbc)(in,out,len,key,ivec,1);
+
+	if (residue==0) return len;
+
+	in  += len;
+	out += len;
+
+#if defined(CBC_HANDLES_TRUNCATED_IO)
+	(*cbc)(in,out-16+residue,residue,key,ivec,1);
+#else
+	{
+	size_t n;
+	for (n=0; n<16; n+=sizeof(size_t))
+		*(size_t *)(tmp.c+n) = 0;
+	memcpy(tmp.c,in,residue);
+	}
+	(*cbc)(tmp.c,out-16+residue,16,key,ivec,1);
+#endif
+	return len+residue;
+}
+
 size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
 			size_t len, const void *key,
 			unsigned char ivec[16], block128_f block)
@@ -125,7 +190,51 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
 	for(residue+=16; n<residue; ++n)
 		out[n] = tmp.c[n] ^ in[n];
 
-	return len+residue-16;
+	return 16+len+residue;
+}
+
+size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], block128_f block)
+{	size_t residue, n;
+	union { size_t align; unsigned char c[32]; } tmp;
+
+	assert (in && out && key && ivec);
+
+	if (len<16) return 0;
+
+	residue=len%16;
+
+	if (residue==0) {
+		CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
+		return len;
+	}
+
+	len -= 16+residue;
+
+	if (len) {
+		CRYPTO_cbc128_decrypt(in,out,len,key,ivec,block);
+		in  += len;
+		out += len;
+	}
+
+	(*block)(in+residue,tmp.c+16,key);
+
+	for (n=0; n<16; n+=sizeof(size_t))
+		*(size_t *)(tmp.c+n) = *(size_t *)(tmp.c+16+n);
+	memcpy(tmp.c,in,residue);
+	(*block)(tmp.c,tmp.c,key);
+
+	for(n=0; n<16; ++n) {
+		unsigned char c = in[n];
+		out[n] = tmp.c[n] ^ ivec[n];
+		ivec[n] = in[n+residue];
+		tmp.c[n] = c;
+	}
+	for(residue+=16; n<residue; ++n)
+		out[n] = tmp.c[n] ^ tmp.c[n-16];
+
+	return 16+len+residue;
 }
 
 size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
@@ -160,7 +269,47 @@ size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
 	(*cbc)(tmp.c,tmp.c,32,key,ivec,0);
 	memcpy(out,tmp.c,16+residue);
 #endif
-	return len+residue;
+	return 16+len+residue;
+}
+
+size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], cbc128_f cbc)
+{	size_t residue, n;
+	union { size_t align; unsigned char c[32]; } tmp;
+
+	assert (in && out && key && ivec);
+
+	if (len<16) return 0;
+
+	residue=len%16;
+
+	if (residue==0) {
+		(*cbc)(in,out,len,key,ivec,0);
+		return len;
+	}
+
+	len -= 16+residue;
+
+	if (len) {
+		(*cbc)(in,out,len,key,ivec,0);
+		in  += len;
+		out += len;
+	}
+
+	for (n=16; n<32; n+=sizeof(size_t))
+		*(size_t *)(tmp.c+n) = 0;
+	/* this places in[16] at &tmp.c[16] and decrypted block at &tmp.c[0] */
+	(*cbc)(in+residue,tmp.c,16,key,tmp.c+16,0);
+
+	memcpy(tmp.c,in,residue);
+#if defined(CBC_HANDLES_TRUNCATED_IO)
+	(*cbc)(tmp.c,out,16+residue,key,ivec,0);
+#else
+	(*cbc)(tmp.c,tmp.c,32,key,ivec,0);
+	memcpy(out,tmp.c,16+residue);
+#endif
+	return 16+len+residue;
 }
 
 #if defined(SELFTEST)
@@ -200,9 +349,8 @@ static const unsigned char vector_64[64] =
 static AES_KEY encks, decks;
 
 void test_vector(const unsigned char *vector,size_t len)
-{	unsigned char cleartext[64];
-	unsigned char iv[sizeof(test_iv)];
-	unsigned char ciphertext[64];
+{	unsigned char iv[sizeof(test_iv)];
+	unsigned char cleartext[64],ciphertext[64];
 	size_t tail;
 
 	printf("vector_%d\n",len); fflush(stdout);
@@ -243,7 +391,57 @@ void test_vector(const unsigned char *vector,size_t len)
 		fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
 }
 
-main()
+void test_nistvector(const unsigned char *vector,size_t len)
+{	unsigned char iv[sizeof(test_iv)];
+	unsigned char cleartext[64],ciphertext[64],nistvector[64];
+	size_t tail;
+
+	printf("nistvector_%d\n",len); fflush(stdout);
+
+	if ((tail=len%16) == 0) tail = 16;
+
+	len -= 16 + tail;
+	memcpy(nistvector,vector,len);
+	/* flip two last blocks */
+	memcpy(nistvector+len,vector+len+16,tail);
+	memcpy(nistvector+len+tail,vector+len,16);
+	len += 16 + tail;
+	tail = 16;
+
+	/* test block-based encryption */
+	memcpy(iv,test_iv,sizeof(test_iv));
+	CRYPTO_nistcts128_encrypt_block(test_input,ciphertext,len,&encks,iv,(block128_f)AES_encrypt);
+	if (memcmp(ciphertext,nistvector,len))
+		fprintf(stderr,"output_%d mismatch\n",len), exit(1);
+	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
+		fprintf(stderr,"iv_%d mismatch\n",len), exit(1);
+
+	/* test block-based decryption */
+	memcpy(iv,test_iv,sizeof(test_iv));
+	CRYPTO_nistcts128_decrypt_block(ciphertext,cleartext,len,&decks,iv,(block128_f)AES_decrypt);
+	if (memcmp(cleartext,test_input,len))
+		fprintf(stderr,"input_%d mismatch\n",len), exit(2);
+	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
+		fprintf(stderr,"iv_%d mismatch\n",len), exit(2);
+
+	/* test streamed encryption */
+	memcpy(iv,test_iv,sizeof(test_iv));
+	CRYPTO_nistcts128_encrypt(test_input,ciphertext,len,&encks,iv,(cbc128_f)AES_cbc_encrypt);
+	if (memcmp(ciphertext,nistvector,len))
+		fprintf(stderr,"output_%d mismatch\n",len), exit(3);
+	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
+		fprintf(stderr,"iv_%d mismatch\n",len), exit(3);
+
+	/* test streamed decryption */
+	memcpy(iv,test_iv,sizeof(test_iv));
+	CRYPTO_nistcts128_decrypt(ciphertext,cleartext,len,&decks,iv,(cbc128_f)AES_cbc_encrypt);
+	if (memcmp(cleartext,test_input,len))
+		fprintf(stderr,"input_%d mismatch\n",len), exit(4);
+	if (memcmp(iv,nistvector+len-tail,sizeof(iv)))
+		fprintf(stderr,"iv_%d mismatch\n",len), exit(4);
+}
+
+int main()
 {
 	AES_set_encrypt_key(test_key,128,&encks);
 	AES_set_decrypt_key(test_key,128,&decks);
@@ -254,6 +452,14 @@ main()
 	test_vector(vector_47,sizeof(vector_47));
 	test_vector(vector_48,sizeof(vector_48));
 	test_vector(vector_64,sizeof(vector_64));
-	exit(0);
+
+	test_nistvector(vector_17,sizeof(vector_17));
+	test_nistvector(vector_31,sizeof(vector_31));
+	test_nistvector(vector_32,sizeof(vector_32));
+	test_nistvector(vector_47,sizeof(vector_47));
+	test_nistvector(vector_48,sizeof(vector_48));
+	test_nistvector(vector_64,sizeof(vector_64));
+
+	return 0;
 }
 #endif
diff --git a/src/lib/libcrypto/modes/gcm128.c b/src/lib/libcrypto/modes/gcm128.c
new file mode 100644
index 0000000000..7d6d034970
--- /dev/null
+++ b/src/lib/libcrypto/modes/gcm128.c
@@ -0,0 +1,1757 @@
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#define OPENSSL_FIPSAPI
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+#  define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
+/* redefine, because alignment is ensured */
+#undef	GETU32
+#define	GETU32(p)	BSWAP4(*(const u32 *)(p))
+#undef	PUTU32
+#define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
+#endif
+
+#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
+#define REDUCE1BIT(V)	do { \
+	if (sizeof(size_t)==8) { \
+		u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
+		V.lo  = (V.hi<<63)|(V.lo>>1); \
+		V.hi  = (V.hi>>1 )^T; \
+	} \
+	else { \
+		u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
+		V.lo  = (V.hi<<63)|(V.lo>>1); \
+		V.hi  = (V.hi>>1 )^((u64)T<<32); \
+	} \
+} while(0)
+
+/*
+ * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
+ * never be set to 8. 8 is effectively reserved for testing purposes.
+ * TABLE_BITS>1 are lookup-table-driven implementations referred to as
+ * "Shoup's" in GCM specification. In other words OpenSSL does not cover
+ * whole spectrum of possible table driven implementations. Why? In
+ * non-"Shoup's" case memory access pattern is segmented in such manner,
+ * that it's trivial to see that cache timing information can reveal
+ * fair portion of intermediate hash value. Given that ciphertext is
+ * always available to attacker, it's possible for him to attempt to
+ * deduce secret parameter H and if successful, tamper with messages
+ * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
+ * not as trivial, but there is no reason to believe that it's resistant
+ * to cache-timing attack. And the thing about "8-bit" implementation is
+ * that it consumes 16 (sixteen) times more memory, 4KB per individual
+ * key + 1KB shared. Well, on pros side it should be twice as fast as
+ * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
+ * was observed to run ~75% faster, closer to 100% for commercial
+ * compilers... Yet "4-bit" procedure is preferred, because it's
+ * believed to provide better security-performance balance and adequate
+ * all-round performance. "All-round" refers to things like:
+ *
+ * - shorter setup time effectively improves overall timing for
+ *   handling short messages;
+ * - larger table allocation can become unbearable because of VM
+ *   subsystem penalties (for example on Windows large enough free
+ *   results in VM working set trimming, meaning that consequent
+ *   malloc would immediately incur working set expansion);
+ * - larger table has larger cache footprint, which can affect
+ *   performance of other code paths (not necessarily even from same
+ *   thread in Hyper-Threading world);
+ *
+ * Value of 1 is not appropriate for performance reasons.
+ */
+#if	TABLE_BITS==8
+
+static void gcm_init_8bit(u128 Htable[256], u64 H[2])
+{
+	int  i, j;
+	u128 V;
+
+	Htable[0].hi = 0;
+	Htable[0].lo = 0;
+	V.hi = H[0];
+	V.lo = H[1];
+
+	for (Htable[128]=V, i=64; i>0; i>>=1) {
+		REDUCE1BIT(V);
+		Htable[i] = V;
+	}
+
+	for (i=2; i<256; i<<=1) {
+		u128 *Hi = Htable+i, H0 = *Hi;
+		for (j=1; j<i; ++j) {
+			Hi[j].hi = H0.hi^Htable[j].hi;
+			Hi[j].lo = H0.lo^Htable[j].lo;
+		}
+	}
+}
+
+static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
+{
+	u128 Z = { 0, 0};
+	const u8 *xi = (const u8 *)Xi+15;
+	size_t rem, n = *xi;
+	const union { long one; char little; } is_endian = {1};
+	static const size_t rem_8bit[256] = {
+		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
+		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
+		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
+		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
+		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
+		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
+		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
+		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
+		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
+		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
+		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
+		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
+		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
+		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
+		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
+		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
+		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
+		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
+		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
+		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
+		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
+		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
+		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
+		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
+		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
+		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
+		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
+		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
+		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
+		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
+		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
+		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
+		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
+		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
+		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
+		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
+		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
+		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
+		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
+		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
+		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
+		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
+		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
+		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
+		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
+		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
+		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
+		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
+		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
+		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
+		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
+		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
+		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
+		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
+		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
+		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
+		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
+		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
+		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
+		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
+		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
+		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
+		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
+		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
+
+	while (1) {
+		Z.hi ^= Htable[n].hi;
+		Z.lo ^= Htable[n].lo;
+
+		if ((u8 *)Xi==xi)	break;
+
+		n = *(--xi);
+
+		rem  = (size_t)Z.lo&0xff;
+		Z.lo = (Z.hi<<56)|(Z.lo>>8);
+		Z.hi = (Z.hi>>8);
+		if (sizeof(size_t)==8)
+			Z.hi ^= rem_8bit[rem];
+		else
+			Z.hi ^= (u64)rem_8bit[rem]<<32;
+	}
+
+	if (is_endian.little) {
+#ifdef BSWAP8
+		Xi[0] = BSWAP8(Z.hi);
+		Xi[1] = BSWAP8(Z.lo);
+#else
+		u8 *p = (u8 *)Xi;
+		u32 v;
+		v = (u32)(Z.hi>>32);	PUTU32(p,v);
+		v = (u32)(Z.hi);	PUTU32(p+4,v);
+		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
+		v = (u32)(Z.lo);	PUTU32(p+12,v);
+#endif
+	}
+	else {
+		Xi[0] = Z.hi;
+		Xi[1] = Z.lo;
+	}
+}
+#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
+
+#elif	TABLE_BITS==4
+
+static void gcm_init_4bit(u128 Htable[16], u64 H[2])
+{
+	u128 V;
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+	int  i;
+#endif
+
+	Htable[0].hi = 0;
+	Htable[0].lo = 0;
+	V.hi = H[0];
+	V.lo = H[1];
+
+#if defined(OPENSSL_SMALL_FOOTPRINT)
+	for (Htable[8]=V, i=4; i>0; i>>=1) {
+		REDUCE1BIT(V);
+		Htable[i] = V;
+	}
+
+	for (i=2; i<16; i<<=1) {
+		u128 *Hi = Htable+i;
+		int   j;
+		for (V=*Hi, j=1; j<i; ++j) {
+			Hi[j].hi = V.hi^Htable[j].hi;
+			Hi[j].lo = V.lo^Htable[j].lo;
+		}
+	}
+#else
+	Htable[8] = V;
+	REDUCE1BIT(V);
+	Htable[4] = V;
+	REDUCE1BIT(V);
+	Htable[2] = V;
+	REDUCE1BIT(V);
+	Htable[1] = V;
+	Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
+	V=Htable[4];
+	Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
+	Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
+	Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
+	V=Htable[8];
+	Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
+	Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
+	Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
+	Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
+	Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
+	Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
+	Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
+#endif
+#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
+	/*
+	 * ARM assembler expects specific dword order in Htable.
+	 */
+	{
+	int j;
+	const union { long one; char little; } is_endian = {1};
+
+	if (is_endian.little)
+		for (j=0;j<16;++j) {
+			V = Htable[j];
+			Htable[j].hi = V.lo;
+			Htable[j].lo = V.hi;
+		}
+	else
+		for (j=0;j<16;++j) {
+			V = Htable[j];
+			Htable[j].hi = V.lo<<32|V.lo>>32;
+			Htable[j].lo = V.hi<<32|V.hi>>32;
+		}
+	}
+#endif
+}
+
+#ifndef GHASH_ASM
+static const size_t rem_4bit[16] = {
+	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
+	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
+	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
+	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
+
+static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
+{
+	u128 Z;
+	int cnt = 15;
+	size_t rem, nlo, nhi;
+	const union { long one; char little; } is_endian = {1};
+
+	nlo  = ((const u8 *)Xi)[15];
+	nhi  = nlo>>4;
+	nlo &= 0xf;
+
+	Z.hi = Htable[nlo].hi;
+	Z.lo = Htable[nlo].lo;
+
+	while (1) {
+		rem  = (size_t)Z.lo&0xf;
+		Z.lo = (Z.hi<<60)|(Z.lo>>4);
+		Z.hi = (Z.hi>>4);
+		if (sizeof(size_t)==8)
+			Z.hi ^= rem_4bit[rem];
+		else
+			Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+		Z.hi ^= Htable[nhi].hi;
+		Z.lo ^= Htable[nhi].lo;
+
+		if (--cnt<0)		break;
+
+		nlo  = ((const u8 *)Xi)[cnt];
+		nhi  = nlo>>4;
+		nlo &= 0xf;
+
+		rem  = (size_t)Z.lo&0xf;
+		Z.lo = (Z.hi<<60)|(Z.lo>>4);
+		Z.hi = (Z.hi>>4);
+		if (sizeof(size_t)==8)
+			Z.hi ^= rem_4bit[rem];
+		else
+			Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+		Z.hi ^= Htable[nlo].hi;
+		Z.lo ^= Htable[nlo].lo;
+	}
+
+	if (is_endian.little) {
+#ifdef BSWAP8
+		Xi[0] = BSWAP8(Z.hi);
+		Xi[1] = BSWAP8(Z.lo);
+#else
+		u8 *p = (u8 *)Xi;
+		u32 v;
+		v = (u32)(Z.hi>>32);	PUTU32(p,v);
+		v = (u32)(Z.hi);	PUTU32(p+4,v);
+		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
+		v = (u32)(Z.lo);	PUTU32(p+12,v);
+#endif
+	}
+	else {
+		Xi[0] = Z.hi;
+		Xi[1] = Z.lo;
+	}
+}
+
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+/*
+ * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
+ * details... Compiler-generated code doesn't seem to give any
+ * performance improvement, at least not on x86[_64]. It's here
+ * mostly as reference and a placeholder for possible future
+ * non-trivial optimization[s]...
+ */
+static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)
+{
+    u128 Z;
+    int cnt;
+    size_t rem, nlo, nhi;
+    const union { long one; char little; } is_endian = {1};
+
+#if 1
+    do {
+	cnt  = 15;
+	nlo  = ((const u8 *)Xi)[15];
+	nlo ^= inp[15];
+	nhi  = nlo>>4;
+	nlo &= 0xf;
+
+	Z.hi = Htable[nlo].hi;
+	Z.lo = Htable[nlo].lo;
+
+	while (1) {
+		rem  = (size_t)Z.lo&0xf;
+		Z.lo = (Z.hi<<60)|(Z.lo>>4);
+		Z.hi = (Z.hi>>4);
+		if (sizeof(size_t)==8)
+			Z.hi ^= rem_4bit[rem];
+		else
+			Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+		Z.hi ^= Htable[nhi].hi;
+		Z.lo ^= Htable[nhi].lo;
+
+		if (--cnt<0)		break;
+
+		nlo  = ((const u8 *)Xi)[cnt];
+		nlo ^= inp[cnt];
+		nhi  = nlo>>4;
+		nlo &= 0xf;
+
+		rem  = (size_t)Z.lo&0xf;
+		Z.lo = (Z.hi<<60)|(Z.lo>>4);
+		Z.hi = (Z.hi>>4);
+		if (sizeof(size_t)==8)
+			Z.hi ^= rem_4bit[rem];
+		else
+			Z.hi ^= (u64)rem_4bit[rem]<<32;
+
+		Z.hi ^= Htable[nlo].hi;
+		Z.lo ^= Htable[nlo].lo;
+	}
+#else
+    /*
+     * Extra 256+16 bytes per-key plus 512 bytes shared tables
+     * [should] give ~50% improvement... One could have PACK()-ed
+     * the rem_8bit even here, but the priority is to minimize
+     * cache footprint...
+     */ 
+    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
+    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
+    static const unsigned short rem_8bit[256] = {
+	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
+	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
+	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
+	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
+	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
+	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
+	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
+	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
+	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
+	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
+	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
+	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
+	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
+	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
+	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
+	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
+	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
+	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
+	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
+	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
+	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
+	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
+	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
+	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
+	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
+	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
+	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
+	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
+	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
+	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
+	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
+	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
+    /*
+     * This pre-processing phase slows down procedure by approximately
+     * same time as it makes each loop spin faster. In other words
+     * single block performance is approximately same as straightforward
+     * "4-bit" implementation, and then it goes only faster...
+     */
+    for (cnt=0; cnt<16; ++cnt) {
+	Z.hi = Htable[cnt].hi;
+	Z.lo = Htable[cnt].lo;
+	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
+	Hshr4[cnt].hi = (Z.hi>>4);
+	Hshl4[cnt]    = (u8)(Z.lo<<4);
+    }
+
+    do {
+	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
+		nlo  = ((const u8 *)Xi)[cnt];
+		nlo ^= inp[cnt];
+		nhi  = nlo>>4;
+		nlo &= 0xf;
+
+		Z.hi ^= Htable[nlo].hi;
+		Z.lo ^= Htable[nlo].lo;
+
+		rem = (size_t)Z.lo&0xff;
+
+		Z.lo = (Z.hi<<56)|(Z.lo>>8);
+		Z.hi = (Z.hi>>8);
+
+		Z.hi ^= Hshr4[nhi].hi;
+		Z.lo ^= Hshr4[nhi].lo;
+		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
+	}
+
+	nlo  = ((const u8 *)Xi)[0];
+	nlo ^= inp[0];
+	nhi  = nlo>>4;
+	nlo &= 0xf;
+
+	Z.hi ^= Htable[nlo].hi;
+	Z.lo ^= Htable[nlo].lo;
+
+	rem = (size_t)Z.lo&0xf;
+
+	Z.lo = (Z.hi<<60)|(Z.lo>>4);
+	Z.hi = (Z.hi>>4);
+
+	Z.hi ^= Htable[nhi].hi;
+	Z.lo ^= Htable[nhi].lo;
+	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
+#endif
+
+	if (is_endian.little) {
+#ifdef BSWAP8
+		Xi[0] = BSWAP8(Z.hi);
+		Xi[1] = BSWAP8(Z.lo);
+#else
+		u8 *p = (u8 *)Xi;
+		u32 v;
+		v = (u32)(Z.hi>>32);	PUTU32(p,v);
+		v = (u32)(Z.hi);	PUTU32(p+4,v);
+		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
+		v = (u32)(Z.lo);	PUTU32(p+12,v);
+#endif
+	}
+	else {
+		Xi[0] = Z.hi;
+		Xi[1] = Z.lo;
+	}
+    } while (inp+=16, len-=16);
+}
+#endif
+#else
+void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#endif
+
+#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
+#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
+#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
+/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
+ * trashing effect. In other words idea is to hash data while it's
+ * still in L1 cache after encryption pass... */
+#define GHASH_CHUNK       (3*1024)
+#endif
+
+#else	/* TABLE_BITS */
+
+static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
+{
+	u128 V,Z = { 0,0 };
+	long X;
+	int  i,j;
+	const long *xi = (const long *)Xi;
+	const union { long one; char little; } is_endian = {1};
+
+	V.hi = H[0];	/* H is in host byte order, no byte swapping */
+	V.lo = H[1];
+
+	for (j=0; j<16/sizeof(long); ++j) {
+		if (is_endian.little) {
+			if (sizeof(long)==8) {
+#ifdef BSWAP8
+				X = (long)(BSWAP8(xi[j]));
+#else
+				const u8 *p = (const u8 *)(xi+j);
+				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
+#endif
+			}
+			else {
+				const u8 *p = (const u8 *)(xi+j);
+				X = (long)GETU32(p);
+			}
+		}
+		else
+			X = xi[j];
+
+		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
+			u64 M = (u64)(X>>(8*sizeof(long)-1));
+			Z.hi ^= V.hi&M;
+			Z.lo ^= V.lo&M;
+
+			REDUCE1BIT(V);
+		}
+	}
+
+	if (is_endian.little) {
+#ifdef BSWAP8
+		Xi[0] = BSWAP8(Z.hi);
+		Xi[1] = BSWAP8(Z.lo);
+#else
+		u8 *p = (u8 *)Xi;
+		u32 v;
+		v = (u32)(Z.hi>>32);	PUTU32(p,v);
+		v = (u32)(Z.hi);	PUTU32(p+4,v);
+		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
+		v = (u32)(Z.lo);	PUTU32(p+12,v);
+#endif
+	}
+	else {
+		Xi[0] = Z.hi;
+		Xi[1] = Z.lo;
+	}
+}
+#define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
+
+#endif
+
+#if	TABLE_BITS==4 && defined(GHASH_ASM)
+# if	!defined(I386_ONLY) && \
+	(defined(__i386)	|| defined(__i386__)	|| \
+	 defined(__x86_64)	|| defined(__x86_64__)	|| \
+	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
+#  define GHASH_ASM_X86_OR_64
+#  define GCM_FUNCREF_4BIT
+extern unsigned int OPENSSL_ia32cap_P[2];
+
+void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+
+#  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
+#   define GHASH_ASM_X86
+void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+
+void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#  endif
+# elif defined(__arm__) || defined(__arm)
+#  include "arm_arch.h"
+#  if __ARM_ARCH__>=7
+#   define GHASH_ASM_ARM
+#   define GCM_FUNCREF_4BIT
+void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#  endif
+# endif
+#endif
+
+#ifdef GCM_FUNCREF_4BIT
+# undef  GCM_MUL
+# define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
+# ifdef GHASH
+#  undef  GHASH
+#  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
+# endif
+#endif
+
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
+{
+	const union { long one; char little; } is_endian = {1};
+
+	memset(ctx,0,sizeof(*ctx));
+	ctx->block = block;
+	ctx->key   = key;
+
+	(*block)(ctx->H.c,ctx->H.c,key);
+
+	if (is_endian.little) {
+		/* H is stored in host byte order */
+#ifdef BSWAP8
+		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
+		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
+#else
+		u8 *p = ctx->H.c;
+		u64 hi,lo;
+		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
+		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
+		ctx->H.u[0] = hi;
+		ctx->H.u[1] = lo;
+#endif
+	}
+
+#if	TABLE_BITS==8
+	gcm_init_8bit(ctx->Htable,ctx->H.u);
+#elif	TABLE_BITS==4
+# if	defined(GHASH_ASM_X86_OR_64)
+#  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
+	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
+	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
+		gcm_init_clmul(ctx->Htable,ctx->H.u);
+		ctx->gmult = gcm_gmult_clmul;
+		ctx->ghash = gcm_ghash_clmul;
+		return;
+	}
+#  endif
+	gcm_init_4bit(ctx->Htable,ctx->H.u);
+#  if	defined(GHASH_ASM_X86)			/* x86 only */
+#   if defined(OPENSSL_IA32_SSE2)
+	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
+#   else
+	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
+#   endif
+		ctx->gmult = gcm_gmult_4bit_mmx;
+		ctx->ghash = gcm_ghash_4bit_mmx;
+	} else {
+		ctx->gmult = gcm_gmult_4bit_x86;
+		ctx->ghash = gcm_ghash_4bit_x86;
+	}
+#  else
+	ctx->gmult = gcm_gmult_4bit;
+	ctx->ghash = gcm_ghash_4bit;
+#  endif
+# elif	defined(GHASH_ASM_ARM)
+	if (OPENSSL_armcap_P & ARMV7_NEON) {
+		ctx->gmult = gcm_gmult_neon;
+		ctx->ghash = gcm_ghash_neon;
+	} else {
+		gcm_init_4bit(ctx->Htable,ctx->H.u);
+		ctx->gmult = gcm_gmult_4bit;
+		ctx->ghash = gcm_ghash_4bit;
+	}
+# else
+	gcm_init_4bit(ctx->Htable,ctx->H.u);
+# endif
+#endif
+}
+
+void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
+{
+	const union { long one; char little; } is_endian = {1};
+	unsigned int ctr;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+#endif
+
+	ctx->Yi.u[0]  = 0;
+	ctx->Yi.u[1]  = 0;
+	ctx->Xi.u[0]  = 0;
+	ctx->Xi.u[1]  = 0;
+	ctx->len.u[0] = 0;	/* AAD length */
+	ctx->len.u[1] = 0;	/* message length */
+	ctx->ares = 0;
+	ctx->mres = 0;
+
+	if (len==12) {
+		memcpy(ctx->Yi.c,iv,12);
+		ctx->Yi.c[15]=1;
+		ctr=1;
+	}
+	else {
+		size_t i;
+		u64 len0 = len;
+
+		while (len>=16) {
+			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
+			GCM_MUL(ctx,Yi);
+			iv += 16;
+			len -= 16;
+		}
+		if (len) {
+			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
+			GCM_MUL(ctx,Yi);
+		}
+		len0 <<= 3;
+		if (is_endian.little) {
+#ifdef BSWAP8
+			ctx->Yi.u[1]  ^= BSWAP8(len0);
+#else
+			ctx->Yi.c[8]  ^= (u8)(len0>>56);
+			ctx->Yi.c[9]  ^= (u8)(len0>>48);
+			ctx->Yi.c[10] ^= (u8)(len0>>40);
+			ctx->Yi.c[11] ^= (u8)(len0>>32);
+			ctx->Yi.c[12] ^= (u8)(len0>>24);
+			ctx->Yi.c[13] ^= (u8)(len0>>16);
+			ctx->Yi.c[14] ^= (u8)(len0>>8);
+			ctx->Yi.c[15] ^= (u8)(len0);
+#endif
+		}
+		else
+			ctx->Yi.u[1]  ^= len0;
+
+		GCM_MUL(ctx,Yi);
+
+		if (is_endian.little)
+			ctr = GETU32(ctx->Yi.c+12);
+		else
+			ctr = ctx->Yi.d[3];
+	}
+
+	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
+	++ctr;
+	if (is_endian.little)
+		PUTU32(ctx->Yi.c+12,ctr);
+	else
+		ctx->Yi.d[3] = ctr;
+}
+
+int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
+{
+	size_t i;
+	unsigned int n;
+	u64 alen = ctx->len.u[0];
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+# ifdef GHASH
+	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)	= ctx->ghash;
+# endif
+#endif
+
+	if (ctx->len.u[1]) return -2;
+
+	alen += len;
+	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
+		return -1;
+	ctx->len.u[0] = alen;
+
+	n = ctx->ares;
+	if (n) {
+		while (n && len) {
+			ctx->Xi.c[n] ^= *(aad++);
+			--len;
+			n = (n+1)%16;
+		}
+		if (n==0) GCM_MUL(ctx,Xi);
+		else {
+			ctx->ares = n;
+			return 0;
+		}
+	}
+
+#ifdef GHASH
+	if ((i = (len&(size_t)-16))) {
+		GHASH(ctx,aad,i);
+		aad += i;
+		len -= i;
+	}
+#else
+	while (len>=16) {
+		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
+		GCM_MUL(ctx,Xi);
+		aad += 16;
+		len -= 16;
+	}
+#endif
+	if (len) {
+		n = (unsigned int)len;
+		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
+	}
+
+	ctx->ares = n;
+	return 0;
+}
+
+int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
+		const unsigned char *in, unsigned char *out,
+		size_t len)
+{
+	const union { long one; char little; } is_endian = {1};
+	unsigned int n, ctr;
+	size_t i;
+	u64        mlen  = ctx->len.u[1];
+	block128_f block = ctx->block;
+	void      *key   = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+# ifdef GHASH
+	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)	= ctx->ghash;
+# endif
+#endif
+
+#if 0
+	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
+#endif
+	mlen += len;
+	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+		return -1;
+	ctx->len.u[1] = mlen;
+
+	if (ctx->ares) {
+		/* First call to encrypt finalizes GHASH(AAD) */
+		GCM_MUL(ctx,Xi);
+		ctx->ares = 0;
+	}
+
+	if (is_endian.little)
+		ctr = GETU32(ctx->Yi.c+12);
+	else
+		ctr = ctx->Yi.d[3];
+
+	n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+	if (16%sizeof(size_t) == 0) do {	/* always true actually */
+		if (n) {
+			while (n && len) {
+				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
+				--len;
+				n = (n+1)%16;
+			}
+			if (n==0) GCM_MUL(ctx,Xi);
+			else {
+				ctx->mres = n;
+				return 0;
+			}
+		}
+#if defined(STRICT_ALIGNMENT)
+		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
+			break;
+#endif
+#if defined(GHASH) && defined(GHASH_CHUNK)
+		while (len>=GHASH_CHUNK) {
+		    size_t j=GHASH_CHUNK;
+
+		    while (j) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t))
+				*(size_t *)(out+i) =
+				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+			out += 16;
+			in  += 16;
+			j   -= 16;
+		    }
+		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
+		    len -= GHASH_CHUNK;
+		}
+		if ((i = (len&(size_t)-16))) {
+		    size_t j=i;
+
+		    while (len>=16) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t))
+				*(size_t *)(out+i) =
+				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+			out += 16;
+			in  += 16;
+			len -= 16;
+		    }
+		    GHASH(ctx,out-j,j);
+		}
+#else
+		while (len>=16) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t))
+				*(size_t *)(ctx->Xi.c+i) ^=
+				*(size_t *)(out+i) =
+				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+			GCM_MUL(ctx,Xi);
+			out += 16;
+			in  += 16;
+			len -= 16;
+		}
+#endif
+		if (len) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			while (len--) {
+				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
+				++n;
+			}
+		}
+
+		ctx->mres = n;
+		return 0;
+	} while(0);
+#endif
+	for (i=0;i<len;++i) {
+		if (n==0) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+		}
+		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
+		n = (n+1)%16;
+		if (n==0)
+			GCM_MUL(ctx,Xi);
+	}
+
+	ctx->mres = n;
+	return 0;
+}
+
+int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
+		const unsigned char *in, unsigned char *out,
+		size_t len)
+{
+	const union { long one; char little; } is_endian = {1};
+	unsigned int n, ctr;
+	size_t i;
+	u64        mlen  = ctx->len.u[1];
+	block128_f block = ctx->block;
+	void      *key   = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+# ifdef GHASH
+	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)	= ctx->ghash;
+# endif
+#endif
+
+	mlen += len;
+	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+		return -1;
+	ctx->len.u[1] = mlen;
+
+	if (ctx->ares) {
+		/* First call to decrypt finalizes GHASH(AAD) */
+		GCM_MUL(ctx,Xi);
+		ctx->ares = 0;
+	}
+
+	if (is_endian.little)
+		ctr = GETU32(ctx->Yi.c+12);
+	else
+		ctr = ctx->Yi.d[3];
+
+	n = ctx->mres;
+#if !defined(OPENSSL_SMALL_FOOTPRINT)
+	if (16%sizeof(size_t) == 0) do {	/* always true actually */
+		if (n) {
+			while (n && len) {
+				u8 c = *(in++);
+				*(out++) = c^ctx->EKi.c[n];
+				ctx->Xi.c[n] ^= c;
+				--len;
+				n = (n+1)%16;
+			}
+			if (n==0) GCM_MUL (ctx,Xi);
+			else {
+				ctx->mres = n;
+				return 0;
+			}
+		}
+#if defined(STRICT_ALIGNMENT)
+		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
+			break;
+#endif
+#if defined(GHASH) && defined(GHASH_CHUNK)
+		while (len>=GHASH_CHUNK) {
+		    size_t j=GHASH_CHUNK;
+
+		    GHASH(ctx,in,GHASH_CHUNK);
+		    while (j) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t))
+				*(size_t *)(out+i) =
+				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+			out += 16;
+			in  += 16;
+			j   -= 16;
+		    }
+		    len -= GHASH_CHUNK;
+		}
+		if ((i = (len&(size_t)-16))) {
+		    GHASH(ctx,in,i);
+		    while (len>=16) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t))
+				*(size_t *)(out+i) =
+				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+			out += 16;
+			in  += 16;
+			len -= 16;
+		    }
+		}
+#else
+		while (len>=16) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			for (i=0; i<16; i+=sizeof(size_t)) {
+				size_t c = *(size_t *)(in+i);
+				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
+				*(size_t *)(ctx->Xi.c+i) ^= c;
+			}
+			GCM_MUL(ctx,Xi);
+			out += 16;
+			in  += 16;
+			len -= 16;
+		}
+#endif
+		if (len) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+			while (len--) {
+				u8 c = in[n];
+				ctx->Xi.c[n] ^= c;
+				out[n] = c^ctx->EKi.c[n];
+				++n;
+			}
+		}
+
+		ctx->mres = n;
+		return 0;
+	} while(0);
+#endif
+	for (i=0;i<len;++i) {
+		u8 c;
+		if (n==0) {
+			(*block)(ctx->Yi.c,ctx->EKi.c,key);
+			++ctr;
+			if (is_endian.little)
+				PUTU32(ctx->Yi.c+12,ctr);
+			else
+				ctx->Yi.d[3] = ctr;
+		}
+		c = in[i];
+		out[i] = c^ctx->EKi.c[n];
+		ctx->Xi.c[n] ^= c;
+		n = (n+1)%16;
+		if (n==0)
+			GCM_MUL(ctx,Xi);
+	}
+
+	ctx->mres = n;
+	return 0;
+}
+
+int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
+		const unsigned char *in, unsigned char *out,
+		size_t len, ctr128_f stream)
+{
+	const union { long one; char little; } is_endian = {1};
+	unsigned int n, ctr;
+	size_t i;
+	u64   mlen = ctx->len.u[1];
+	void *key  = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+# ifdef GHASH
+	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)	= ctx->ghash;
+# endif
+#endif
+
+	mlen += len;
+	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+		return -1;
+	ctx->len.u[1] = mlen;
+
+	if (ctx->ares) {
+		/* First call to encrypt finalizes GHASH(AAD) */
+		GCM_MUL(ctx,Xi);
+		ctx->ares = 0;
+	}
+
+	if (is_endian.little)
+		ctr = GETU32(ctx->Yi.c+12);
+	else
+		ctr = ctx->Yi.d[3];
+
+	n = ctx->mres;
+	if (n) {
+		while (n && len) {
+			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
+			--len;
+			n = (n+1)%16;
+		}
+		if (n==0) GCM_MUL(ctx,Xi);
+		else {
+			ctx->mres = n;
+			return 0;
+		}
+	}
+#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+	while (len>=GHASH_CHUNK) {
+		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
+		ctr += GHASH_CHUNK/16;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		GHASH(ctx,out,GHASH_CHUNK);
+		out += GHASH_CHUNK;
+		in  += GHASH_CHUNK;
+		len -= GHASH_CHUNK;
+	}
+#endif
+	if ((i = (len&(size_t)-16))) {
+		size_t j=i/16;
+
+		(*stream)(in,out,j,key,ctx->Yi.c);
+		ctr += (unsigned int)j;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		in  += i;
+		len -= i;
+#if defined(GHASH)
+		GHASH(ctx,out,i);
+		out += i;
+#else
+		while (j--) {
+			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
+			GCM_MUL(ctx,Xi);
+			out += 16;
+		}
+#endif
+	}
+	if (len) {
+		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
+		++ctr;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		while (len--) {
+			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
+			++n;
+		}
+	}
+
+	ctx->mres = n;
+	return 0;
+}
+
+int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
+		const unsigned char *in, unsigned char *out,
+		size_t len,ctr128_f stream)
+{
+	const union { long one; char little; } is_endian = {1};
+	unsigned int n, ctr;
+	size_t i;
+	u64   mlen = ctx->len.u[1];
+	void *key  = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+# ifdef GHASH
+	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+				const u8 *inp,size_t len)	= ctx->ghash;
+# endif
+#endif
+
+	mlen += len;
+	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
+		return -1;
+	ctx->len.u[1] = mlen;
+
+	if (ctx->ares) {
+		/* First call to decrypt finalizes GHASH(AAD) */
+		GCM_MUL(ctx,Xi);
+		ctx->ares = 0;
+	}
+
+	if (is_endian.little)
+		ctr = GETU32(ctx->Yi.c+12);
+	else
+		ctr = ctx->Yi.d[3];
+
+	n = ctx->mres;
+	if (n) {
+		while (n && len) {
+			u8 c = *(in++);
+			*(out++) = c^ctx->EKi.c[n];
+			ctx->Xi.c[n] ^= c;
+			--len;
+			n = (n+1)%16;
+		}
+		if (n==0) GCM_MUL (ctx,Xi);
+		else {
+			ctx->mres = n;
+			return 0;
+		}
+	}
+#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
+	while (len>=GHASH_CHUNK) {
+		GHASH(ctx,in,GHASH_CHUNK);
+		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
+		ctr += GHASH_CHUNK/16;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		out += GHASH_CHUNK;
+		in  += GHASH_CHUNK;
+		len -= GHASH_CHUNK;
+	}
+#endif
+	if ((i = (len&(size_t)-16))) {
+		size_t j=i/16;
+
+#if defined(GHASH)
+		GHASH(ctx,in,i);
+#else
+		while (j--) {
+			size_t k;
+			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
+			GCM_MUL(ctx,Xi);
+			in += 16;
+		}
+		j   = i/16;
+		in -= i;
+#endif
+		(*stream)(in,out,j,key,ctx->Yi.c);
+		ctr += (unsigned int)j;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		out += i;
+		in  += i;
+		len -= i;
+	}
+	if (len) {
+		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
+		++ctr;
+		if (is_endian.little)
+			PUTU32(ctx->Yi.c+12,ctr);
+		else
+			ctx->Yi.d[3] = ctr;
+		while (len--) {
+			u8 c = in[n];
+			ctx->Xi.c[n] ^= c;
+			out[n] = c^ctx->EKi.c[n];
+			++n;
+		}
+	}
+
+	ctx->mres = n;
+	return 0;
+}
+
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
+			size_t len)
+{
+	const union { long one; char little; } is_endian = {1};
+	u64 alen = ctx->len.u[0]<<3;
+	u64 clen = ctx->len.u[1]<<3;
+#ifdef GCM_FUNCREF_4BIT
+	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
+#endif
+
+	if (ctx->mres)
+		GCM_MUL(ctx,Xi);
+
+	if (is_endian.little) {
+#ifdef BSWAP8
+		alen = BSWAP8(alen);
+		clen = BSWAP8(clen);
+#else
+		u8 *p = ctx->len.c;
+
+		ctx->len.u[0] = alen;
+		ctx->len.u[1] = clen;
+
+		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
+		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
+#endif
+	}
+
+	ctx->Xi.u[0] ^= alen;
+	ctx->Xi.u[1] ^= clen;
+	GCM_MUL(ctx,Xi);
+
+	ctx->Xi.u[0] ^= ctx->EK0.u[0];
+	ctx->Xi.u[1] ^= ctx->EK0.u[1];
+
+	if (tag && len<=sizeof(ctx->Xi))
+		return memcmp(ctx->Xi.c,tag,len);
+	else
+		return -1;
+}
+
+void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
+{
+	CRYPTO_gcm128_finish(ctx, NULL, 0);
+	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
+}
+
+GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
+{
+	GCM128_CONTEXT *ret;
+
+	if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
+		CRYPTO_gcm128_init(ret,key,block);
+
+	return ret;
+}
+
+void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
+{
+	if (ctx) {
+		OPENSSL_cleanse(ctx,sizeof(*ctx));
+		OPENSSL_free(ctx);
+	}
+}
+
+#if defined(SELFTEST)
+#include <stdio.h>
+#include <openssl/aes.h>
+
+/* Test Case 1 */
+static const u8	K1[16],
+		*P1=NULL,
+		*A1=NULL,
+		IV1[12],
+		*C1=NULL,
+		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
+
+/* Test Case 2 */
+#define K2 K1
+#define A2 A1
+#define IV2 IV1
+static const u8	P2[16],
+		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
+		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
+
+/* Test Case 3 */
+#define A3 A2
+static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
+		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
+			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
+			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
+			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
+		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
+
+/* Test Case 4 */
+#define K4 K3
+#define IV4 IV3
+static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+			0xab,0xad,0xda,0xd2},
+		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
+			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
+			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
+			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
+		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
+
+/* Test Case 5 */
+#define K5 K4
+#define P5 P4
+#define A5 A4
+static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
+			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
+			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
+			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
+		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
+
+/* Test Case 6 */
+#define K6 K5
+#define P6 P5
+#define A6 A5
+static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
+			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
+			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
+			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
+		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
+
+/* Test Case 7 */
+static const u8 K7[24],
+		*P7=NULL,
+		*A7=NULL,
+		IV7[12],
+		*C7=NULL,
+		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
+
+/* Test Case 8 */
+#define K8 K7
+#define IV8 IV7
+#define A8 A7
+static const u8	P8[16],
+		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
+		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
+
+/* Test Case 9 */
+#define A9 A8
+static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
+			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
+		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
+			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
+			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
+			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
+		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
+
+/* Test Case 10 */
+#define K10 K9
+#define IV10 IV9
+static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+			0xab,0xad,0xda,0xd2},
+		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
+			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
+			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
+			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
+		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
+
+/* Test Case 11 */
+#define K11 K10
+#define P11 P10
+#define A11 A10
+static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
+			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
+			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
+			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
+		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
+
+/* Test Case 12 */
+#define K12 K11
+#define P12 P11
+#define A12 A11
+static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
+			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
+			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
+			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
+		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
+
+/* Test Case 13 */
+static const u8	K13[32],
+		*P13=NULL,
+		*A13=NULL,
+		IV13[12],
+		*C13=NULL,
+		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
+
+/* Test Case 14 */
+#define K14 K13
+#define A14 A13
+static const u8	P14[16],
+		IV14[12],
+		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
+		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
+
+/* Test Case 15 */
+#define A15 A14
+static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
+			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
+		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
+		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
+		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
+		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
+
+/* Test Case 16 */
+#define K16 K15
+#define IV16 IV15
+static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
+		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+			0xab,0xad,0xda,0xd2},
+		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
+		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
+
+/* Test Case 17 */
+#define K17 K16
+#define P17 P16
+#define A17 A16
+static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
+		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
+			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
+			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
+			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
+		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
+
+/* Test Case 18 */
+#define K18 K17
+#define P18 P17
+#define A18 A17
+static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
+		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
+			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
+			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
+			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
+		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
+
+#define TEST_CASE(n)	do {					\
+	u8 out[sizeof(P##n)];					\
+	AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);		\
+	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);	\
+	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
+	memset(out,0,sizeof(out));				\
+	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
+	if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));	\
+	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
+	    (C##n && memcmp(out,C##n,sizeof(out))))		\
+		ret++, printf ("encrypt test#%d failed.\n",n);	\
+	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));		\
+	memset(out,0,sizeof(out));				\
+	if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));	\
+	if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));	\
+	if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||		\
+	    (P##n && memcmp(out,P##n,sizeof(out))))		\
+		ret++, printf ("decrypt test#%d failed.\n",n);	\
+	} while(0)
+
+int main()
+{
+	GCM128_CONTEXT ctx;
+	AES_KEY key;
+	int ret=0;
+
+	TEST_CASE(1);
+	TEST_CASE(2);
+	TEST_CASE(3);
+	TEST_CASE(4);
+	TEST_CASE(5);
+	TEST_CASE(6);
+	TEST_CASE(7);
+	TEST_CASE(8);
+	TEST_CASE(9);
+	TEST_CASE(10);
+	TEST_CASE(11);
+	TEST_CASE(12);
+	TEST_CASE(13);
+	TEST_CASE(14);
+	TEST_CASE(15);
+	TEST_CASE(16);
+	TEST_CASE(17);
+	TEST_CASE(18);
+
+#ifdef OPENSSL_CPUID_OBJ
+	{
+	size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
+	union { u64 u; u8 c[1024]; } buf;
+	int i;
+
+	AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
+	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
+	CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
+
+	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
+	start = OPENSSL_rdtsc();
+	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
+	gcm_t = OPENSSL_rdtsc() - start;
+
+	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
+			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
+			(block128_f)AES_encrypt);
+	start = OPENSSL_rdtsc();
+	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
+			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
+			(block128_f)AES_encrypt);
+	ctr_t = OPENSSL_rdtsc() - start;
+
+	printf("%.2f-%.2f=%.2f\n",
+			gcm_t/(double)sizeof(buf),
+			ctr_t/(double)sizeof(buf),
+			(gcm_t-ctr_t)/(double)sizeof(buf));
+#ifdef GHASH
+	GHASH(&ctx,buf.c,sizeof(buf));
+	start = OPENSSL_rdtsc();
+	for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
+	gcm_t = OPENSSL_rdtsc() - start;
+	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
+#endif
+	}
+#endif
+
+	return ret;
+}
+#endif
diff --git a/src/lib/libcrypto/modes/modes.h b/src/lib/libcrypto/modes/modes.h
index af8d97d795..f18215bb2b 100644
--- a/src/lib/libcrypto/modes/modes.h
+++ b/src/lib/libcrypto/modes/modes.h
@@ -15,6 +15,14 @@ typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out,
 			size_t len, const void *key,
 			unsigned char ivec[16], int enc);
 
+typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out,
+			size_t blocks, const void *key,
+			const unsigned char ivec[16]);
+
+typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out,
+			size_t blocks, const void *key,
+			const unsigned char ivec[16],unsigned char cmac[16]);
+
 void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
 			size_t len, const void *key,
 			unsigned char ivec[16], block128_f block);
@@ -27,6 +35,11 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out,
 			unsigned char ivec[16], unsigned char ecount_buf[16],
 			unsigned int *num, block128_f block);
 
+void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], unsigned char ecount_buf[16],
+			unsigned int *num, ctr128_f ctr);
+
 void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out,
 			size_t len, const void *key,
 			unsigned char ivec[16], int *num,
@@ -57,3 +70,66 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out,
 size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out,
 			size_t len, const void *key,
 			unsigned char ivec[16], cbc128_f cbc);
+
+size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], block128_f block);
+size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], cbc128_f cbc);
+size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], block128_f block);
+size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out,
+			size_t len, const void *key,
+			unsigned char ivec[16], cbc128_f cbc);
+
+typedef struct gcm128_context GCM128_CONTEXT;
+
+GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block);
+void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block);
+void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
+			size_t len);
+int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
+			size_t len);
+int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
+			const unsigned char *in, unsigned char *out,
+			size_t len);
+int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
+			const unsigned char *in, unsigned char *out,
+			size_t len);
+int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
+			const unsigned char *in, unsigned char *out,
+			size_t len, ctr128_f stream);
+int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
+			const unsigned char *in, unsigned char *out,
+			size_t len, ctr128_f stream);
+int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
+			size_t len);
+void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
+void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx);
+
+typedef struct ccm128_context CCM128_CONTEXT;
+
+void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx,
+	unsigned int M, unsigned int L, void *key,block128_f block);
+int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx,
+	const unsigned char *nonce, size_t nlen, size_t mlen);
+void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx,
+	const unsigned char *aad, size_t alen);
+int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out, size_t len);
+int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out, size_t len);
+int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out, size_t len,
+	ccm128_f stream);
+int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
+	const unsigned char *inp, unsigned char *out, size_t len,
+	ccm128_f stream);
+size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len);
+
+typedef struct xts128_context XTS128_CONTEXT;
+
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
+	const unsigned char *inp, unsigned char *out, size_t len, int enc);
diff --git a/src/lib/libcrypto/modes/modes_lcl.h b/src/lib/libcrypto/modes/modes_lcl.h
new file mode 100644
index 0000000000..b6dc3c336f
--- /dev/null
+++ b/src/lib/libcrypto/modes/modes_lcl.h
@@ -0,0 +1,131 @@
+/* ====================================================================
+ * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use is governed by OpenSSL license.
+ * ====================================================================
+ */
+
+#include <openssl/modes.h>
+
+
+#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
+typedef __int64 i64;
+typedef unsigned __int64 u64;
+#define U64(C) C##UI64
+#elif defined(__arch64__)
+typedef long i64;
+typedef unsigned long u64;
+#define U64(C) C##UL
+#else
+typedef long long i64;
+typedef unsigned long long u64;
+#define U64(C) C##ULL
+#endif
+
+typedef unsigned int u32;
+typedef unsigned char u8;
+
+#define STRICT_ALIGNMENT 1
+#if defined(__i386)	|| defined(__i386__)	|| \
+    defined(__x86_64)	|| defined(__x86_64__)	|| \
+    defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64) || \
+    defined(__s390__)	|| defined(__s390x__)	|| \
+    ( (defined(__arm__)	|| defined(__arm)) && \
+      (defined(__ARM_ARCH_7__)	|| defined(__ARM_ARCH_7A__) || \
+       defined(__ARM_ARCH_7R__)	|| defined(__ARM_ARCH_7M__)) )
+# undef STRICT_ALIGNMENT
+#endif
+
+#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+#if defined(__GNUC__) && __GNUC__>=2
+# if defined(__x86_64) || defined(__x86_64__)
+#  define BSWAP8(x) ({	u64 ret=(x);			\
+			asm ("bswapq %0"		\
+			: "+r"(ret));	ret;		})
+#  define BSWAP4(x) ({	u32 ret=(x);			\
+			asm ("bswapl %0"		\
+			: "+r"(ret));	ret;		})
+# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)
+#  define BSWAP8(x) ({	u32 lo=(u64)(x)>>32,hi=(x);	\
+			asm ("bswapl %0; bswapl %1"	\
+			: "+r"(hi),"+r"(lo));		\
+			(u64)hi<<32|lo;			})
+#  define BSWAP4(x) ({	u32 ret=(x);			\
+			asm ("bswapl %0"		\
+			: "+r"(ret));	ret;		})
+# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT)
+#  define BSWAP8(x) ({	u32 lo=(u64)(x)>>32,hi=(x);	\
+			asm ("rev %0,%0; rev %1,%1"	\
+			: "+r"(hi),"+r"(lo));		\
+			(u64)hi<<32|lo;			})
+#  define BSWAP4(x) ({	u32 ret;			\
+			asm ("rev %0,%1"		\
+			: "=r"(ret) : "r"((u32)(x)));	\
+			ret;				})
+# endif
+#elif defined(_MSC_VER)
+# if _MSC_VER>=1300
+#  pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
+#  define BSWAP8(x)	_byteswap_uint64((u64)(x))
+#  define BSWAP4(x)	_byteswap_ulong((u32)(x))
+# elif defined(_M_IX86)
+   __inline u32 _bswap4(u32 val) {
+	_asm mov eax,val
+	_asm bswap eax
+   }
+#  define BSWAP4(x)	_bswap4(x)
+# endif
+#endif
+#endif
+
+#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT)
+#define GETU32(p)	BSWAP4(*(const u32 *)(p))
+#define PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
+#else
+#define GETU32(p)	((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
+#define PUTU32(p,v)	((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
+#endif
+
+/* GCM definitions */
+
+typedef struct { u64 hi,lo; } u128;
+
+#ifdef	TABLE_BITS
+#undef	TABLE_BITS
+#endif
+/*
+ * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
+ * never be set to 8 [or 1]. For further information see gcm128.c.
+ */
+#define	TABLE_BITS 4
+
+struct gcm128_context {
+	/* Following 6 names follow names in GCM specification */
+	union { u64 u[2]; u32 d[4]; u8 c[16]; }	Yi,EKi,EK0,len,
+						Xi,H;
+	/* Relative position of Xi, H and pre-computed Htable is used
+	 * in some assembler modules, i.e. don't change the order! */
+#if TABLE_BITS==8
+	u128 Htable[256];
+#else
+	u128 Htable[16];
+	void (*gmult)(u64 Xi[2],const u128 Htable[16]);
+	void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#endif
+	unsigned int mres, ares;
+	block128_f block;
+	void *key;
+};
+
+struct xts128_context {
+	void      *key1, *key2;
+	block128_f block1,block2;
+};
+
+struct ccm128_context {
+	union { u64 u[2]; u8 c[16]; } nonce, cmac;
+	u64 blocks;
+	block128_f block;
+	void *key;
+};
+
diff --git a/src/lib/libcrypto/modes/ofb128.c b/src/lib/libcrypto/modes/ofb128.c
index c732e2ec58..01c01702c4 100644
--- a/src/lib/libcrypto/modes/ofb128.c
+++ b/src/lib/libcrypto/modes/ofb128.c
@@ -48,7 +48,8 @@
  *
  */
 
-#include "modes.h"
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
 #include <string.h>
 
 #ifndef MODES_DEBUG
@@ -58,14 +59,6 @@
 #endif
 #include <assert.h>
 
-#define STRICT_ALIGNMENT
-#if defined(__i386) || defined(__i386__) || \
-    defined(__x86_64) || defined(__x86_64__) || \
-    defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \
-    defined(__s390__) || defined(__s390x__)
-#  undef STRICT_ALIGNMENT
-#endif
-
 /* The input and output encrypted as though 128bit ofb mode is being
  * used.  The extra state information to record how much of the
  * 128bit block we have used is contained in *num;
diff --git a/src/lib/libcrypto/modes/xts128.c b/src/lib/libcrypto/modes/xts128.c
new file mode 100644
index 0000000000..9cf27a25e9
--- /dev/null
+++ b/src/lib/libcrypto/modes/xts128.c
@@ -0,0 +1,187 @@
+/* ====================================================================
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <openssl/crypto.h>
+#include "modes_lcl.h"
+#include <string.h>
+
+#ifndef MODES_DEBUG
+# ifndef NDEBUG
+#  define NDEBUG
+# endif
+#endif
+#include <assert.h>
+
+int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16],
+	const unsigned char *inp, unsigned char *out,
+	size_t len, int enc)
+{
+	const union { long one; char little; } is_endian = {1};
+	union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch;
+	unsigned int i;
+
+	if (len<16) return -1;
+
+	memcpy(tweak.c, iv, 16);
+
+	(*ctx->block2)(tweak.c,tweak.c,ctx->key2);
+
+	if (!enc && (len%16)) len-=16;
+
+	while (len>=16) {
+#if defined(STRICT_ALIGNMENT)
+		memcpy(scratch.c,inp,16);
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+#else
+		scratch.u[0] = ((u64*)inp)[0]^tweak.u[0];
+		scratch.u[1] = ((u64*)inp)[1]^tweak.u[1];
+#endif
+		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+		memcpy(out,scratch.c,16);
+#else
+		((u64*)out)[0] = scratch.u[0]^=tweak.u[0];
+		((u64*)out)[1] = scratch.u[1]^=tweak.u[1];
+#endif
+		inp += 16;
+		out += 16;
+		len -= 16;
+
+		if (len==0)	return 0;
+
+		if (is_endian.little) {
+			unsigned int carry,res;
+			
+			res = 0x87&(((int)tweak.d[3])>>31);
+			carry = (unsigned int)(tweak.u[0]>>63);
+			tweak.u[0] = (tweak.u[0]<<1)^res;
+			tweak.u[1] = (tweak.u[1]<<1)|carry;
+		}
+		else {
+			size_t c;
+
+			for (c=0,i=0;i<16;++i) {
+				/*+ substitutes for |, because c is 1 bit */ 
+				c += ((size_t)tweak.c[i])<<1;
+				tweak.c[i] = (u8)c;
+				c = c>>8;
+			}
+			tweak.c[0] ^= (u8)(0x87&(0-c));
+		}
+	}
+	if (enc) {
+		for (i=0;i<len;++i) {
+			u8 c = inp[i];
+			out[i] = scratch.c[i];
+			scratch.c[i] = c;
+		}
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+		memcpy(out-16,scratch.c,16);
+	}
+	else {
+		union { u64 u[2]; u8 c[16]; } tweak1;
+
+		if (is_endian.little) {
+			unsigned int carry,res;
+
+			res = 0x87&(((int)tweak.d[3])>>31);
+			carry = (unsigned int)(tweak.u[0]>>63);
+			tweak1.u[0] = (tweak.u[0]<<1)^res;
+			tweak1.u[1] = (tweak.u[1]<<1)|carry;
+		}
+		else {
+			size_t c;
+
+			for (c=0,i=0;i<16;++i) {
+				/*+ substitutes for |, because c is 1 bit */ 
+				c += ((size_t)tweak.c[i])<<1;
+				tweak1.c[i] = (u8)c;
+				c = c>>8;
+			}
+			tweak1.c[0] ^= (u8)(0x87&(0-c));
+		}
+#if defined(STRICT_ALIGNMENT)
+		memcpy(scratch.c,inp,16);
+		scratch.u[0] ^= tweak1.u[0];
+		scratch.u[1] ^= tweak1.u[1];
+#else
+		scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0];
+		scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1];
+#endif
+		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+		scratch.u[0] ^= tweak1.u[0];
+		scratch.u[1] ^= tweak1.u[1];
+
+		for (i=0;i<len;++i) {
+			u8 c = inp[16+i];
+			out[16+i] = scratch.c[i];
+			scratch.c[i] = c;
+		}
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+		(*ctx->block1)(scratch.c,scratch.c,ctx->key1);
+#if defined(STRICT_ALIGNMENT)
+		scratch.u[0] ^= tweak.u[0];
+		scratch.u[1] ^= tweak.u[1];
+		memcpy (out,scratch.c,16);
+#else
+		((u64*)out)[0] = scratch.u[0]^tweak.u[0];
+		((u64*)out)[1] = scratch.u[1]^tweak.u[1];
+#endif
+	}
+
+	return 0;
+}
diff --git a/src/lib/libcrypto/o_init.c b/src/lib/libcrypto/o_init.c
index 00ed65a6cf..db4cdc443b 100644
--- a/src/lib/libcrypto/o_init.c
+++ b/src/lib/libcrypto/o_init.c
@@ -3,7 +3,7 @@
  * project.
  */
 /* ====================================================================
- * Copyright (c) 2007 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -50,14 +50,14 @@
  * OF THE POSSIBILITY OF SUCH DAMAGE.
  * ====================================================================
  *
- * This product includes cryptographic software written by Eric Young
- * (eay@cryptsoft.com).  This product includes software written by Tim
- * Hudson (tjh@cryptsoft.com).
- *
  */
 
 #include <e_os.h>
 #include <openssl/err.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#include <openssl/rand.h>
+#endif
 
 /* Perform any essential OpenSSL initialization operations.
  * Currently only sets FIPS callbacks
@@ -65,22 +65,18 @@
 
 void OPENSSL_init(void)
 	{
-#ifdef OPENSSL_FIPS
 	static int done = 0;
-	if (!done)
-		{
-		int_ERR_lib_init();
-#ifdef CRYPTO_MDEBUG
-		CRYPTO_malloc_debug_init();
-#endif
-#ifdef OPENSSL_ENGINE
-		int_EVP_MD_init_engine_callbacks();
-		int_EVP_CIPHER_init_engine_callbacks();
-		int_RAND_init_engine_callbacks();
+	if (done)
+		return;
+	done = 1;
+#ifdef OPENSSL_FIPS
+	FIPS_set_locking_callbacks(CRYPTO_lock, CRYPTO_add_lock);
+	FIPS_set_error_callbacks(ERR_put_error, ERR_add_error_vdata);
+	FIPS_set_malloc_callbacks(CRYPTO_malloc, CRYPTO_free);
+	RAND_init_fips();
 #endif
-		done = 1;
-		}
+#if 0
+	fprintf(stderr, "Called OPENSSL_init\n");
 #endif
 	}
-		
 
diff --git a/src/lib/libcrypto/objects/obj_mac.num b/src/lib/libcrypto/objects/obj_mac.num
index 8c50aac27f..1d0a7c802d 100644
--- a/src/lib/libcrypto/objects/obj_mac.num
+++ b/src/lib/libcrypto/objects/obj_mac.num
@@ -890,3 +890,30 @@ houseIdentifier		889
 supportedAlgorithms		890
 deltaRevocationList		891
 dmdName		892
+id_alg_PWRI_KEK		893
+cmac		894
+aes_128_gcm		895
+aes_128_ccm		896
+id_aes128_wrap_pad		897
+aes_192_gcm		898
+aes_192_ccm		899
+id_aes192_wrap_pad		900
+aes_256_gcm		901
+aes_256_ccm		902
+id_aes256_wrap_pad		903
+aes_128_ctr		904
+aes_192_ctr		905
+aes_256_ctr		906
+id_camellia128_wrap		907
+id_camellia192_wrap		908
+id_camellia256_wrap		909
+anyExtendedKeyUsage		910
+mgf1		911
+rsassaPss		912
+aes_128_xts		913
+aes_256_xts		914
+rc4_hmac_md5		915
+aes_128_cbc_hmac_sha1		916
+aes_192_cbc_hmac_sha1		917
+aes_256_cbc_hmac_sha1		918
+rsaesOaep		919
diff --git a/src/lib/libcrypto/objects/obj_xref.c b/src/lib/libcrypto/objects/obj_xref.c
index 152eca5c67..9f744bcede 100644
--- a/src/lib/libcrypto/objects/obj_xref.c
+++ b/src/lib/libcrypto/objects/obj_xref.c
@@ -110,8 +110,10 @@ int OBJ_find_sigid_algs(int signid, int *pdig_nid, int *ppkey_nid)
 #endif
 	if (rv == NULL)
 		return 0;
-	*pdig_nid = rv->hash_id;
-	*ppkey_nid = rv->pkey_id;
+	if (pdig_nid)
+		*pdig_nid = rv->hash_id;
+	if (ppkey_nid)
+		*ppkey_nid = rv->pkey_id;
 	return 1;
 	}
 
@@ -144,7 +146,8 @@ int OBJ_find_sigid_by_algs(int *psignid, int dig_nid, int pkey_nid)
 #endif
 	if (rv == NULL)
 		return 0;
-	*psignid = (*rv)->sign_id;
+	if (psignid)
+		*psignid = (*rv)->sign_id;
 	return 1;
 	}
 
diff --git a/src/lib/libcrypto/objects/obj_xref.h b/src/lib/libcrypto/objects/obj_xref.h
index d5b9b8e198..e23938c296 100644
--- a/src/lib/libcrypto/objects/obj_xref.h
+++ b/src/lib/libcrypto/objects/obj_xref.h
@@ -38,10 +38,12 @@ static const nid_triple sigoid_srt[] =
 	{NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94},
 	{NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc},
 	{NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc},
+	{NID_rsassaPss, NID_undef, NID_rsaEncryption},
 	};
 
 static const nid_triple * const sigoid_srt_xref[] =
 	{
+	&sigoid_srt[29],
 	&sigoid_srt[17],
 	&sigoid_srt[18],
 	&sigoid_srt[0],
diff --git a/src/lib/libcrypto/objects/obj_xref.txt b/src/lib/libcrypto/objects/obj_xref.txt
index e45b3d34b9..cb917182ee 100644
--- a/src/lib/libcrypto/objects/obj_xref.txt
+++ b/src/lib/libcrypto/objects/obj_xref.txt
@@ -13,6 +13,10 @@ sha512WithRSAEncryption	sha512	rsaEncryption
 sha224WithRSAEncryption	sha224	rsaEncryption
 mdc2WithRSA		mdc2	rsaEncryption
 ripemd160WithRSA	ripemd160 rsaEncryption
+# For PSS the digest algorithm can vary and depends on the included
+# AlgorithmIdentifier. The digest "undef" indicates the public key
+# method should handle this explicitly.
+rsassaPss		undef	rsaEncryption
 
 # Alternative deprecated OIDs. By using the older "rsa" OID this
 # type will be recognized by not normally used.
diff --git a/src/lib/libcrypto/objects/objects.txt b/src/lib/libcrypto/objects/objects.txt
index e61fe60cbf..d3bfad72a2 100644
--- a/src/lib/libcrypto/objects/objects.txt
+++ b/src/lib/libcrypto/objects/objects.txt
@@ -166,6 +166,10 @@ pkcs1 3			: RSA-MD4		: md4WithRSAEncryption
 pkcs1 4			: RSA-MD5		: md5WithRSAEncryption
 pkcs1 5			: RSA-SHA1		: sha1WithRSAEncryption
 # According to PKCS #1 version 2.1
+pkcs1 7			: RSAES-OAEP		: rsaesOaep
+pkcs1 8			: MGF1			: mgf1
+pkcs1 10		: RSASSA-PSS		: rsassaPss
+
 pkcs1 11		: RSA-SHA256		: sha256WithRSAEncryption
 pkcs1 12		: RSA-SHA384		: sha384WithRSAEncryption
 pkcs1 13		: RSA-SHA512		: sha512WithRSAEncryption
@@ -299,6 +303,7 @@ id-smime-alg 4		: id-smime-alg-RC2wrap
 id-smime-alg 5		: id-smime-alg-ESDH
 id-smime-alg 6		: id-smime-alg-CMS3DESwrap
 id-smime-alg 7		: id-smime-alg-CMSRC2wrap
+id-smime-alg 9		: id-alg-PWRI-KEK
 
 # S/MIME Certificate Distribution
 id-smime-cd 1		: id-smime-cd-ldap
@@ -770,6 +775,10 @@ id-ce 55		: targetInformation	: X509v3 AC Targeting
 !Cname no-rev-avail
 id-ce 56		: noRevAvail		: X509v3 No Revocation Available
 
+# From RFC5280
+ext-key-usage 0		: anyExtendedKeyUsage	: Any Extended Key Usage
+
+
 !Cname netscape
 2 16 840 1 113730	: Netscape		: Netscape Communications Corp.
 !Cname netscape-cert-extension
@@ -846,6 +855,10 @@ aes 2			: AES-128-CBC		: aes-128-cbc
 aes 3			: AES-128-OFB		: aes-128-ofb
 !Cname aes-128-cfb128
 aes 4			: AES-128-CFB		: aes-128-cfb
+aes 5			: id-aes128-wrap
+aes 6			: id-aes128-GCM		: aes-128-gcm
+aes 7			: id-aes128-CCM		: aes-128-ccm
+aes 8			: id-aes128-wrap-pad
 
 aes 21			: AES-192-ECB		: aes-192-ecb
 aes 22			: AES-192-CBC		: aes-192-cbc
@@ -853,6 +866,10 @@ aes 22			: AES-192-CBC		: aes-192-cbc
 aes 23			: AES-192-OFB		: aes-192-ofb
 !Cname aes-192-cfb128
 aes 24			: AES-192-CFB		: aes-192-cfb
+aes 25			: id-aes192-wrap
+aes 26			: id-aes192-GCM		: aes-192-gcm
+aes 27			: id-aes192-CCM		: aes-192-ccm
+aes 28			: id-aes192-wrap-pad
 
 aes 41			: AES-256-ECB		: aes-256-ecb
 aes 42			: AES-256-CBC		: aes-256-cbc
@@ -860,6 +877,10 @@ aes 42			: AES-256-CBC		: aes-256-cbc
 aes 43			: AES-256-OFB		: aes-256-ofb
 !Cname aes-256-cfb128
 aes 44			: AES-256-CFB		: aes-256-cfb
+aes 45			: id-aes256-wrap
+aes 46			: id-aes256-GCM		: aes-256-gcm
+aes 47			: id-aes256-CCM		: aes-256-ccm
+aes 48			: id-aes256-wrap-pad
 
 # There are no OIDs for these modes...
 
@@ -869,15 +890,16 @@ aes 44			: AES-256-CFB		: aes-256-cfb
 			: AES-128-CFB8		: aes-128-cfb8
 			: AES-192-CFB8		: aes-192-cfb8
 			: AES-256-CFB8		: aes-256-cfb8
+			: AES-128-CTR		: aes-128-ctr
+			: AES-192-CTR		: aes-192-ctr
+			: AES-256-CTR		: aes-256-ctr
+			: AES-128-XTS		: aes-128-xts
+			: AES-256-XTS		: aes-256-xts
 			: DES-CFB1		: des-cfb1
 			: DES-CFB8		: des-cfb8
 			: DES-EDE3-CFB1		: des-ede3-cfb1
 			: DES-EDE3-CFB8		: des-ede3-cfb8
 
-aes 5			: id-aes128-wrap 
-aes 25			: id-aes192-wrap 
-aes 45			: id-aes256-wrap 
-
 # OIDs for SHA224, SHA256, SHA385 and SHA512, according to x9.84.
 !Alias nist_hashalgs nistAlgorithms 2
 nist_hashalgs 1		: SHA256		: sha256
@@ -1211,6 +1233,9 @@ cryptocom 1 8 1		: id-GostR3410-2001-ParamSet-cc : GOST R 3410-2001 Parameter Se
 1 2 392 200011 61 1 1 1 2 : CAMELLIA-128-CBC		: camellia-128-cbc
 1 2 392 200011 61 1 1 1 3 : CAMELLIA-192-CBC		: camellia-192-cbc
 1 2 392 200011 61 1 1 1 4 : CAMELLIA-256-CBC		: camellia-256-cbc
+1 2 392 200011 61 1 1 3 2 : id-camellia128-wrap
+1 2 392 200011 61 1 1 3 3 : id-camellia192-wrap
+1 2 392 200011 61 1 1 3 4 : id-camellia256-wrap
 
 # Definitions for Camellia cipher - ECB, CFB, OFB MODE
 
@@ -1257,3 +1282,11 @@ kisa 1 6                : SEED-OFB      : seed-ofb
 # There is no OID that just denotes "HMAC" oddly enough...
 
 			: HMAC				: hmac
+# Nor CMAC either
+			: CMAC				: cmac
+
+# Synthetic composite ciphersuites
+			: RC4-HMAC-MD5			: rc4-hmac-md5
+			: AES-128-CBC-HMAC-SHA1		: aes-128-cbc-hmac-sha1
+			: AES-192-CBC-HMAC-SHA1		: aes-192-cbc-hmac-sha1
+			: AES-256-CBC-HMAC-SHA1		: aes-256-cbc-hmac-sha1
diff --git a/src/lib/libcrypto/ocsp/ocsp_lib.c b/src/lib/libcrypto/ocsp/ocsp_lib.c
index e92b86c060..a94dc838ee 100644
--- a/src/lib/libcrypto/ocsp/ocsp_lib.c
+++ b/src/lib/libcrypto/ocsp/ocsp_lib.c
@@ -124,7 +124,8 @@ OCSP_CERTID *OCSP_cert_id_new(const EVP_MD *dgst,
 	if (!(ASN1_OCTET_STRING_set(cid->issuerNameHash, md, i))) goto err;
 
 	/* Calculate the issuerKey hash, excluding tag and length */
-	EVP_Digest(issuerKey->data, issuerKey->length, md, &i, dgst, NULL);
+	if (!EVP_Digest(issuerKey->data, issuerKey->length, md, &i, dgst, NULL))
+		goto err;
 
 	if (!(ASN1_OCTET_STRING_set(cid->issuerKeyHash, md, i))) goto err;
 
diff --git a/src/lib/libcrypto/opensslv.h b/src/lib/libcrypto/opensslv.h
index d6d61a0c7d..71be3590af 100644
--- a/src/lib/libcrypto/opensslv.h
+++ b/src/lib/libcrypto/opensslv.h
@@ -25,11 +25,11 @@
  * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for
  *  major minor fix final patch/beta)
  */
-#define OPENSSL_VERSION_NUMBER	0x1000006fL
+#define OPENSSL_VERSION_NUMBER	0x1000103fL
 #ifdef OPENSSL_FIPS
-#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.0f-fips 4 Jan 2012"
+#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.1c-fips 10 May 2012"
 #else
-#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.0f 4 Jan 2012"
+#define OPENSSL_VERSION_TEXT	"OpenSSL 1.0.1c 10 May 2012"
 #endif
 #define OPENSSL_VERSION_PTEXT	" part of " OPENSSL_VERSION_TEXT
 
diff --git a/src/lib/libcrypto/ossl_typ.h b/src/lib/libcrypto/ossl_typ.h
index 12bd7014de..ea9227f6f9 100644
--- a/src/lib/libcrypto/ossl_typ.h
+++ b/src/lib/libcrypto/ossl_typ.h
@@ -91,10 +91,12 @@ typedef struct asn1_string_st ASN1_TIME;
 typedef struct asn1_string_st ASN1_GENERALIZEDTIME;
 typedef struct asn1_string_st ASN1_VISIBLESTRING;
 typedef struct asn1_string_st ASN1_UTF8STRING;
+typedef struct asn1_string_st ASN1_STRING;
 typedef int ASN1_BOOLEAN;
 typedef int ASN1_NULL;
 #endif
 
+typedef struct ASN1_ITEM_st ASN1_ITEM;
 typedef struct asn1_pctx_st ASN1_PCTX;
 
 #ifdef OPENSSL_SYS_WIN32
diff --git a/src/lib/libcrypto/pariscid.pl b/src/lib/libcrypto/pariscid.pl
new file mode 100644
index 0000000000..477ec9b87d
--- /dev/null
+++ b/src/lib/libcrypto/pariscid.pl
@@ -0,0 +1,224 @@
+#!/usr/bin/env perl
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$ST		="std";
+} else {
+	$LEVEL		="1.1";
+	$SIZE_T		=4;
+	$ST		="stw";
+}
+
+$rp="%r2";
+$sp="%r30";
+$rv="%r28";
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	OPENSSL_cpuid_setup,ENTRY
+	.ALIGN	8
+OPENSSL_cpuid_setup
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	bv	($rp)
+	.EXIT
+	nop
+	.PROCEND
+
+	.EXPORT	OPENSSL_rdtsc,ENTRY
+	.ALIGN	8
+OPENSSL_rdtsc
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	mfctl	%cr16,$rv
+	bv	($rp)
+	.EXIT
+	nop
+	.PROCEND
+
+	.EXPORT	OPENSSL_wipe_cpu,ENTRY
+	.ALIGN	8
+OPENSSL_wipe_cpu
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	xor		%r0,%r0,%r1
+	fcpy,dbl	%fr0,%fr4
+	xor		%r0,%r0,%r19
+	fcpy,dbl	%fr0,%fr5
+	xor		%r0,%r0,%r20
+	fcpy,dbl	%fr0,%fr6
+	xor		%r0,%r0,%r21
+	fcpy,dbl	%fr0,%fr7
+	xor		%r0,%r0,%r22
+	fcpy,dbl	%fr0,%fr8
+	xor		%r0,%r0,%r23
+	fcpy,dbl	%fr0,%fr9
+	xor		%r0,%r0,%r24
+	fcpy,dbl	%fr0,%fr10
+	xor		%r0,%r0,%r25
+	fcpy,dbl	%fr0,%fr11
+	xor		%r0,%r0,%r26
+	fcpy,dbl	%fr0,%fr22
+	xor		%r0,%r0,%r29
+	fcpy,dbl	%fr0,%fr23
+	xor		%r0,%r0,%r31
+	fcpy,dbl	%fr0,%fr24
+	fcpy,dbl	%fr0,%fr25
+	fcpy,dbl	%fr0,%fr26
+	fcpy,dbl	%fr0,%fr27
+	fcpy,dbl	%fr0,%fr28
+	fcpy,dbl	%fr0,%fr29
+	fcpy,dbl	%fr0,%fr30
+	fcpy,dbl	%fr0,%fr31
+	bv		($rp)
+	.EXIT
+	ldo		0($sp),$rv
+	.PROCEND
+___
+{
+my $inp="%r26";
+my $len="%r25";
+
+$code.=<<___;
+	.EXPORT	OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR
+	.ALIGN	8
+OPENSSL_cleanse
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	cmpib,*=	0,$len,Ldone
+	nop
+	cmpib,*>>=	15,$len,Little
+	ldi		$SIZE_T-1,%r1
+
+Lalign
+	and,*<>		$inp,%r1,%r28
+	b,n		Laligned
+	stb		%r0,0($inp)
+	ldo		-1($len),$len
+	b		Lalign
+	ldo		1($inp),$inp
+
+Laligned
+	andcm		$len,%r1,%r28
+Lot
+	$ST		%r0,0($inp)
+	addib,*<>	-$SIZE_T,%r28,Lot
+	ldo		$SIZE_T($inp),$inp
+
+	and,*<>		$len,%r1,$len
+	b,n		Ldone
+Little
+	stb		%r0,0($inp)
+	addib,*<>	-1,$len,Little
+	ldo		1($inp),$inp
+Ldone
+	bv		($rp)
+	.EXIT
+	nop
+	.PROCEND
+___
+}
+{
+my ($out,$cnt,$max)=("%r26","%r25","%r24");
+my ($tick,$lasttick)=("%r23","%r22");
+my ($diff,$lastdiff)=("%r21","%r20");
+
+$code.=<<___;
+	.EXPORT	OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR
+	.ALIGN	8
+OPENSSL_instrument_bus
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	copy		$cnt,$rv
+	mfctl		%cr16,$tick
+	copy		$tick,$lasttick
+	ldi		0,$diff
+
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+Loop
+	mfctl		%cr16,$tick
+	sub		$tick,$lasttick,$diff
+	copy		$tick,$lasttick
+
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+
+	addib,<>	-1,$cnt,Loop
+	addi		4,$out,$out
+
+	bv		($rp)
+	.EXIT
+	sub		$rv,$cnt,$rv
+	.PROCEND
+
+	.EXPORT	OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR
+	.ALIGN	8
+OPENSSL_instrument_bus2
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	copy		$cnt,$rv
+	sub		%r0,$cnt,$cnt
+
+	mfctl		%cr16,$tick
+	copy		$tick,$lasttick
+	ldi		0,$diff
+
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+
+	mfctl		%cr16,$tick
+	sub		$tick,$lasttick,$diff
+	copy		$tick,$lasttick
+Loop2
+	copy		$diff,$lastdiff
+	fdc		0($out)
+	ldw		0($out),$tick
+	add		$diff,$tick,$tick
+	stw		$tick,0($out)
+
+	addib,=		-1,$max,Ldone2
+	nop
+
+	mfctl		%cr16,$tick
+	sub		$tick,$lasttick,$diff
+	copy		$tick,$lasttick
+	cmpclr,<>	$lastdiff,$diff,$tick
+	ldi		1,$tick
+
+	ldi		1,%r1
+	xor		%r1,$tick,$tick
+	addb,<>		$tick,$cnt,Loop2
+	shladd,l	$tick,2,$out,$out
+Ldone2
+	bv		($rp)
+	.EXIT
+	add		$rv,$cnt,$rv
+	.PROCEND
+___
+}
+$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
+$code =~ s/,\*/,/gm if ($SIZE_T==4);
+print $code;
+close STDOUT;
+
diff --git a/src/lib/libcrypto/pem/pvkfmt.c b/src/lib/libcrypto/pem/pvkfmt.c
index 5f130c4528..b1bf71a5da 100644
--- a/src/lib/libcrypto/pem/pvkfmt.c
+++ b/src/lib/libcrypto/pem/pvkfmt.c
@@ -709,13 +709,16 @@ static int derive_pvk_key(unsigned char *key,
 			const unsigned char *pass, int passlen)
 	{
 	EVP_MD_CTX mctx;
+	int rv = 1;
 	EVP_MD_CTX_init(&mctx);
-	EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL);
-	EVP_DigestUpdate(&mctx, salt, saltlen);
-	EVP_DigestUpdate(&mctx, pass, passlen);
-	EVP_DigestFinal_ex(&mctx, key, NULL);
+	if (!EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL)
+		|| !EVP_DigestUpdate(&mctx, salt, saltlen)
+		|| !EVP_DigestUpdate(&mctx, pass, passlen)
+		|| !EVP_DigestFinal_ex(&mctx, key, NULL))
+			rv = 0;
+
 	EVP_MD_CTX_cleanup(&mctx);
-	return 1;
+	return rv;
 	}
 	
 
@@ -727,11 +730,12 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in,
 	const unsigned char *p = *in;
 	unsigned int magic;
 	unsigned char *enctmp = NULL, *q;
+	EVP_CIPHER_CTX cctx;
+	EVP_CIPHER_CTX_init(&cctx);
 	if (saltlen)
 		{
 		char psbuf[PEM_BUFSIZE];
 		unsigned char keybuf[20];
-		EVP_CIPHER_CTX cctx;
 		int enctmplen, inlen;
 		if (cb)
 			inlen=cb(psbuf,PEM_BUFSIZE,0,u);
@@ -757,37 +761,41 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in,
 		p += 8;
 		inlen = keylen - 8;
 		q = enctmp + 8;
-		EVP_CIPHER_CTX_init(&cctx);
-		EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL);
-		EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen);
-		EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen);
+		if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL))
+			goto err;
+		if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen))
+			goto err;
+		if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen))
+			goto err;
 		magic = read_ledword((const unsigned char **)&q);
 		if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC)
 			{
 			q = enctmp + 8;
 			memset(keybuf + 5, 0, 11);
-			EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf,
-								NULL);
+			if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf,
+								NULL))
+				goto err;
 			OPENSSL_cleanse(keybuf, 20);
-			EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen);
-			EVP_DecryptFinal_ex(&cctx, q + enctmplen,
-								&enctmplen);
+			if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen))
+				goto err;
+			if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen,
+								&enctmplen))
+				goto err;
 			magic = read_ledword((const unsigned char **)&q);
 			if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC)
 				{
-				EVP_CIPHER_CTX_cleanup(&cctx);
 				PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT);
 				goto err;
 				}
 			}
 		else
 			OPENSSL_cleanse(keybuf, 20);
-		EVP_CIPHER_CTX_cleanup(&cctx);
 		p = enctmp;
 		}
 
 	ret = b2i_PrivateKey(&p, keylen);
 	err:
+	EVP_CIPHER_CTX_cleanup(&cctx);
 	if (enctmp && saltlen)
 		OPENSSL_free(enctmp);
 	return ret;
@@ -841,6 +849,8 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel,
 	{
 	int outlen = 24, pklen;
 	unsigned char *p, *salt = NULL;
+	EVP_CIPHER_CTX cctx;
+	EVP_CIPHER_CTX_init(&cctx);
 	if (enclevel)
 		outlen += PVK_SALTLEN;
 	pklen = do_i2b(NULL, pk, 0);
@@ -885,7 +895,6 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel,
 		{
 		char psbuf[PEM_BUFSIZE];
 		unsigned char keybuf[20];
-		EVP_CIPHER_CTX cctx;
 		int enctmplen, inlen;
 		if (cb)
 			inlen=cb(psbuf,PEM_BUFSIZE,1,u);
@@ -902,16 +911,19 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel,
 		if (enclevel == 1)
 			memset(keybuf + 5, 0, 11);
 		p = salt + PVK_SALTLEN + 8;
-		EVP_CIPHER_CTX_init(&cctx);
-		EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL);
+		if (!EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL))
+			goto error;
 		OPENSSL_cleanse(keybuf, 20);
-		EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8);
-		EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen);
-		EVP_CIPHER_CTX_cleanup(&cctx);
+		if (!EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8))
+			goto error;
+		if (!EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen))
+			goto error;
 		}
+	EVP_CIPHER_CTX_cleanup(&cctx);
 	return outlen;
 
 	error:
+	EVP_CIPHER_CTX_cleanup(&cctx);
 	return -1;
 	}
 
diff --git a/src/lib/libcrypto/perlasm/ppc-xlate.pl b/src/lib/libcrypto/perlasm/ppc-xlate.pl
index 4579671c97..a3edd982b6 100755
--- a/src/lib/libcrypto/perlasm/ppc-xlate.pl
+++ b/src/lib/libcrypto/perlasm/ppc-xlate.pl
@@ -31,10 +31,9 @@ my $globl = sub {
 				$ret .= ".type	$name,\@function";
 				last;
 			      };
-	/linux.*64/	&& do {	$ret .= ".globl	.$name\n";
-				$ret .= ".type	.$name,\@function\n";
+	/linux.*64/	&& do {	$ret .= ".globl	$name\n";
+				$ret .= ".type	$name,\@function\n";
 				$ret .= ".section	\".opd\",\"aw\"\n";
-				$ret .= ".globl	$name\n";
 				$ret .= ".align	3\n";
 				$ret .= "$name:\n";
 				$ret .= ".quad	.$name,.TOC.\@tocbase,0\n";
@@ -62,6 +61,14 @@ my $machine = sub {
     }
     ".machine	$arch";
 };
+my $size = sub {
+    if ($flavour =~ /linux.*32/)
+    {	shift;
+	".size	" . join(",",@_);
+    }
+    else
+    {	"";	}
+};
 my $asciz = sub {
     shift;
     my $line = join(",",@_);
diff --git a/src/lib/libcrypto/perlasm/x86_64-xlate.pl b/src/lib/libcrypto/perlasm/x86_64-xlate.pl
index e47116b74b..56d9b64b6f 100755
--- a/src/lib/libcrypto/perlasm/x86_64-xlate.pl
+++ b/src/lib/libcrypto/perlasm/x86_64-xlate.pl
@@ -62,12 +62,8 @@ my $flavour = shift;
 my $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
 
-{ my ($stddev,$stdino,@junk)=stat(STDOUT);
-  my ($outdev,$outino,@junk)=stat($output);
-
-    open STDOUT,">$output" || die "can't open $output: $!"
-	if ($stddev!=$outdev || $stdino!=$outino);
-}
+open STDOUT,">$output" || die "can't open $output: $!"
+	if (defined($output));
 
 my $gas=1;	$gas=0 if ($output =~ /\.asm$/);
 my $elf=1;	$elf=0 if (!$gas);
@@ -116,12 +112,16 @@ my %globals;
 	    $line = substr($line,@+[0]); $line =~ s/^\s+//;
 
 	    undef $self->{sz};
-	    if ($self->{op} =~ /^(movz)b.*/) {	# movz is pain...
+	    if ($self->{op} =~ /^(movz)x?([bw]).*/) {	# movz is pain...
 		$self->{op} = $1;
-		$self->{sz} = "b";
+		$self->{sz} = $2;
 	    } elsif ($self->{op} =~ /call|jmp/) {
 		$self->{sz} = "";
-	    } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op)/) { # SSEn
+	    } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn
+		$self->{sz} = "";
+	    } elsif ($self->{op} =~ /^v/) { # VEX
+		$self->{sz} = "";
+	    } elsif ($self->{op} =~ /movq/ && $line =~ /%xmm/) {
 		$self->{sz} = "";
 	    } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) {
 		$self->{op} = $1;
@@ -246,35 +246,39 @@ my %globals;
 	$self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
 	$self->{base}  =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
 
+	# Solaris /usr/ccs/bin/as can't handle multiplications
+	# in $self->{label}, new gas requires sign extension...
+	use integer;
+	$self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
+	$self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
+	$self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
+
 	if ($gas) {
-	    # Solaris /usr/ccs/bin/as can't handle multiplications
-	    # in $self->{label}, new gas requires sign extension...
-	    use integer;
-	    $self->{label} =~ s/(?<![\w\$\.])(0x?[0-9a-f]+)/oct($1)/egi;
-	    $self->{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg;
-	    $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg;
 	    $self->{label} =~ s/^___imp_/__imp__/   if ($flavour eq "mingw64");
 
 	    if (defined($self->{index})) {
-		sprintf "%s%s(%%%s,%%%s,%d)",$self->{asterisk},
-					$self->{label},$self->{base},
+		sprintf "%s%s(%s,%%%s,%d)",$self->{asterisk},
+					$self->{label},
+					$self->{base}?"%$self->{base}":"",
 					$self->{index},$self->{scale};
 	    } else {
 		sprintf "%s%s(%%%s)",	$self->{asterisk},$self->{label},$self->{base};
 	    }
 	} else {
-	    %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR", q=>"QWORD$PTR" );
+	    %szmap = (	b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR",
+	    		q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR" );
 
 	    $self->{label} =~ s/\./\$/g;
 	    $self->{label} =~ s/(?<![\w\$\.])0x([0-9a-f]+)/0$1h/ig;
 	    $self->{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/);
-	    $sz="q" if ($self->{asterisk});
+	    $sz="q" if ($self->{asterisk} || opcode->mnemonic() eq "movq");
+	    $sz="l" if (opcode->mnemonic() eq "movd");
 
 	    if (defined($self->{index})) {
-		sprintf "%s[%s%s*%d+%s]",$szmap{$sz},
+		sprintf "%s[%s%s*%d%s]",$szmap{$sz},
 					$self->{label}?"$self->{label}+":"",
 					$self->{index},$self->{scale},
-					$self->{base};
+					$self->{base}?"+$self->{base}":"";
 	    } elsif ($self->{base} eq "rip") {
 		sprintf "%s[%s]",$szmap{$sz},$self->{label};
 	    } else {
@@ -506,6 +510,12 @@ my %globals;
 		    }
 		} elsif ($dir =~ /\.(text|data)/) {
 		    $current_segment=".$1";
+		} elsif ($dir =~ /\.hidden/) {
+		    if    ($flavour eq "macosx")  { $self->{value} = ".private_extern\t$prefix$line"; }
+		    elsif ($flavour eq "mingw64") { $self->{value} = ""; }
+		} elsif ($dir =~ /\.comm/) {
+		    $self->{value} = "$dir\t$prefix$line";
+		    $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx");
 		}
 		$line = "";
 		return $self;
@@ -555,7 +565,8 @@ my %globals;
 					    $v.=" READONLY";
 					    $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref);
 					} elsif ($line=~/\.CRT\$/i) {
-					    $v.=" READONLY DWORD";
+					    $v.=" READONLY ";
+					    $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD";
 					}
 				    }
 				    $current_segment = $line;
@@ -577,7 +588,7 @@ my %globals;
 					    $self->{value}="${decor}SEH_end_$current_function->{name}:";
 					    $self->{value}.=":\n" if($masm);
 					}
-					$self->{value}.="$current_function->{name}\tENDP" if($masm);
+					$self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name});
 					undef $current_function;
 				    }
 				    last;
@@ -613,6 +624,19 @@ my %globals;
 						.join(",",@str) if (@str);
 				    last;
 				  };
+		/\.comm/    && do { my @str=split(/,\s*/,$line);
+				    my $v=undef;
+				    if ($nasm) {
+					$v.="common	$prefix@str[0] @str[1]";
+				    } else {
+					$v="$current_segment\tENDS\n" if ($current_segment);
+					$current_segment = "_DATA";
+					$v.="$current_segment\tSEGMENT\n";
+					$v.="COMM	@str[0]:DWORD:".@str[1]/4;
+				    }
+				    $self->{value} = $v;
+				    last;
+				  };
 	    }
 	    $line = "";
 	}
@@ -625,9 +649,133 @@ my %globals;
     }
 }
 
+sub rex {
+ local *opcode=shift;
+ my ($dst,$src,$rex)=@_;
+
+   $rex|=0x04 if($dst>=8);
+   $rex|=0x01 if($src>=8);
+   push @opcode,($rex|0x40) if ($rex);
+}
+
+# older gas and ml64 don't handle SSE>2 instructions
+my %regrm = (	"%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3,
+		"%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7	);
+
+my $movq = sub {	# elderly gas can't handle inter-register movq
+  my $arg = shift;
+  my @opcode=(0x66);
+    if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) {
+	my ($src,$dst)=($1,$2);
+	if ($dst !~ /[0-9]+/)	{ $dst = $regrm{"%e$dst"}; }
+	rex(\@opcode,$src,$dst,0x8);
+	push @opcode,0x0f,0x7e;
+	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
+	@opcode;
+    } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) {
+	my ($src,$dst)=($2,$1);
+	if ($dst !~ /[0-9]+/)	{ $dst = $regrm{"%e$dst"}; }
+	rex(\@opcode,$src,$dst,0x8);
+	push @opcode,0x0f,0x6e;
+	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $pextrd = sub {
+    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) {
+      my @opcode=(0x66);
+	$imm=$1;
+	$src=$2;
+	$dst=$3;
+	if ($dst =~ /%r([0-9]+)d/)	{ $dst = $1; }
+	elsif ($dst =~ /%e/)		{ $dst = $regrm{$dst}; }
+	rex(\@opcode,$src,$dst);
+	push @opcode,0x0f,0x3a,0x16;
+	push @opcode,0xc0|(($src&7)<<3)|($dst&7);	# ModR/M
+	push @opcode,$imm;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $pinsrd = sub {
+    if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	$imm=$1;
+	$src=$2;
+	$dst=$3;
+	if ($src =~ /%r([0-9]+)/)	{ $src = $1; }
+	elsif ($src =~ /%e/)		{ $src = $regrm{$src}; }
+	rex(\@opcode,$dst,$src);
+	push @opcode,0x0f,0x3a,0x22;
+	push @opcode,0xc0|(($dst&7)<<3)|($src&7);	# ModR/M
+	push @opcode,$imm;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $pshufb = sub {
+    if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	rex(\@opcode,$2,$1);
+	push @opcode,0x0f,0x38,0x00;
+	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $palignr = sub {
+    if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	rex(\@opcode,$3,$2);
+	push @opcode,0x0f,0x3a,0x0f;
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
+	push @opcode,$1;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $pclmulqdq = sub {
+    if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+      my @opcode=(0x66);
+	rex(\@opcode,$3,$2);
+	push @opcode,0x0f,0x3a,0x44;
+	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
+	my $c=$1;
+	push @opcode,$c=~/^0/?oct($c):$c;
+	@opcode;
+    } else {
+	();
+    }
+};
+
+my $rdrand = sub {
+    if (shift =~ /%[er](\w+)/) {
+      my @opcode=();
+      my $dst=$1;
+	if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; }
+	rex(\@opcode,0,$1,8);
+	push @opcode,0x0f,0xc7,0xf0|($dst&7);
+	@opcode;
+    } else {
+	();
+    }
+};
+
 if ($nasm) {
     print <<___;
 default	rel
+%define XMMWORD
 ___
 } elsif ($masm) {
     print <<___;
@@ -644,14 +792,22 @@ while($line=<>) {
 
     undef $label;
     undef $opcode;
-    undef $sz;
     undef @args;
 
     if ($label=label->re(\$line))	{ print $label->out(); }
 
     if (directive->re(\$line)) {
 	printf "%s",directive->out();
-    } elsif ($opcode=opcode->re(\$line)) { ARGUMENT: while (1) {
+    } elsif ($opcode=opcode->re(\$line)) {
+	my $asm = eval("\$".$opcode->mnemonic());
+	undef @bytes;
+	
+	if ((ref($asm) eq 'CODE') && scalar(@bytes=&$asm($line))) {
+	    print $gas?".byte\t":"DB\t",join(',',@bytes),"\n";
+	    next;
+	}
+
+	ARGUMENT: while (1) {
 	my $arg;
 
 	if ($arg=register->re(\$line))	{ opcode->size($arg->size()); }
@@ -667,19 +823,26 @@ while($line=<>) {
 	$line =~ s/^,\s*//;
 	} # ARGUMENT:
 
-	$sz=opcode->size();
-
 	if ($#args>=0) {
 	    my $insn;
+	    my $sz=opcode->size();
+
 	    if ($gas) {
 		$insn = $opcode->out($#args>=1?$args[$#args]->size():$sz);
+		@args = map($_->out($sz),@args);
+		printf "\t%s\t%s",$insn,join(",",@args);
 	    } else {
 		$insn = $opcode->out();
-		$insn .= $sz if (map($_->out() =~ /x?mm/,@args));
+		foreach (@args) {
+		    my $arg = $_->out();
+		    # $insn.=$sz compensates for movq, pinsrw, ...
+		    if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; }
+		    if ($arg =~ /^mm[0-9]+$/)  { $insn.=$sz; $sz="q" if(!$sz); last; }
+		}
 		@args = reverse(@args);
 		undef $sz if ($nasm && $opcode->mnemonic() eq "lea");
+		printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
 	    }
-	    printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args));
 	} else {
 	    printf "\t%s",$opcode->out();
 	}
diff --git a/src/lib/libcrypto/perlasm/x86asm.pl b/src/lib/libcrypto/perlasm/x86asm.pl
index 28080caaa6..eb543db2f6 100644
--- a/src/lib/libcrypto/perlasm/x86asm.pl
+++ b/src/lib/libcrypto/perlasm/x86asm.pl
@@ -80,6 +80,57 @@ sub ::movq
     {	&::generic("movq",@_);			}
 }
 
+# SSE>2 instructions
+my %regrm = (	"eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3,
+		"esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7	);
+sub ::pextrd
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/)
+    {	&::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm);	}
+    else
+    {	&::generic("pextrd",@_);		}
+}
+
+sub ::pinsrd
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/)
+    {	&::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm);	}
+    else
+    {	&::generic("pinsrd",@_);		}
+}
+
+sub ::pshufb
+{ my($dst,$src)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2);	}
+    else
+    {	&::generic("pshufb",@_);		}
+}
+
+sub ::palignr
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm);	}
+    else
+    {	&::generic("palignr",@_);		}
+}
+
+sub ::pclmulqdq
+{ my($dst,$src,$imm)=@_;
+    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
+    {	&::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm);	}
+    else
+    {	&::generic("pclmulqdq",@_);		}
+}
+
+sub ::rdrand
+{ my ($dst)=@_;
+    if ($dst =~ /(e[a-dsd][ixp])/)
+    {	&::data_byte(0x0f,0xc7,0xf0|$regrm{$dst});	}
+    else
+    {	&::generic("rdrand",@_);	}
+}
+
 # label management
 $lbdecor="L";		# local label decoration, set by package
 $label="000";
@@ -167,7 +218,7 @@ sub ::asm_init
     $filename=$fn;
     $i386=$cpu;
 
-    $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=0;
+    $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=$android=0;
     if    (($type eq "elf"))
     {	$elf=1;			require "x86gas.pl";	}
     elsif (($type eq "a\.out"))
@@ -184,6 +235,8 @@ sub ::asm_init
     {	$win32=1;		require "x86masm.pl";	}
     elsif (($type eq "macosx"))
     {	$aout=1; $macosx=1;	require "x86gas.pl";	}
+    elsif (($type eq "android"))
+    {	$elf=1; $android=1;	require "x86gas.pl";	}
     else
     {	print STDERR <<"EOF";
 Pick one target type from
diff --git a/src/lib/libcrypto/perlasm/x86gas.pl b/src/lib/libcrypto/perlasm/x86gas.pl
index 6eab727fd4..682a3a3163 100644
--- a/src/lib/libcrypto/perlasm/x86gas.pl
+++ b/src/lib/libcrypto/perlasm/x86gas.pl
@@ -45,9 +45,8 @@ sub ::generic
     undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o);
 
     if ($#_==0)				{ &::emit($opcode);		}
-    elsif ($opcode =~ m/^j/o && $#_==1)	{ &::emit($opcode,@arg);	}
-    elsif ($opcode eq "call" && $#_==1)	{ &::emit($opcode,@arg);	}
-    elsif ($opcode =~ m/^set/&& $#_==1)	{ &::emit($opcode,@arg);	}
+    elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o)
+					{ &::emit($opcode,@arg);	}
     else				{ &::emit($opcode.$suffix,@arg);}
 
   1;
@@ -91,6 +90,7 @@ sub ::DWP
 }
 sub ::QWP	{ &::DWP(@_);	}
 sub ::BP	{ &::DWP(@_);	}
+sub ::WP	{ &::DWP(@_);	}
 sub ::BC	{ @_;		}
 sub ::DWC	{ @_;		}
 
@@ -149,22 +149,24 @@ sub ::public_label
 {   push(@out,".globl\t".&::LABEL($_[0],$nmdecor.$_[0])."\n");   }
 
 sub ::file_end
-{   if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
-	my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,4";
-	if ($::elf)	{ push (@out,"$tmp,4\n"); }
-	else		{ push (@out,"$tmp\n"); }
-    }
-    if ($::macosx)
+{   if ($::macosx)
     {	if (%non_lazy_ptr)
     	{   push(@out,".section __IMPORT,__pointers,non_lazy_symbol_pointers\n");
 	    foreach $i (keys %non_lazy_ptr)
 	    {	push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n");   }
 	}
     }
+    if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) {
+	my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,8";
+	if ($::macosx)	{ push (@out,"$tmp,2\n"); }
+	elsif ($::elf)	{ push (@out,"$tmp,4\n"); }
+	else		{ push (@out,"$tmp\n"); }
+    }
     push(@out,$initseg) if ($initseg);
 }
 
 sub ::data_byte	{   push(@out,".byte\t".join(',',@_)."\n");   }
+sub ::data_short{   push(@out,".value\t".join(',',@_)."\n");  }
 sub ::data_word {   push(@out,".long\t".join(',',@_)."\n");   }
 
 sub ::align
@@ -180,7 +182,7 @@ sub ::align
 sub ::picmeup
 { my($dst,$sym,$base,$reflabel)=@_;
 
-    if ($::pic && ($::elf || $::aout))
+    if (($::pic && ($::elf || $::aout)) || $::macosx)
     {	if (!defined($base))
 	{   &::call(&::label("PIC_me_up"));
 	    &::set_label("PIC_me_up");
@@ -206,13 +208,17 @@ sub ::picmeup
 sub ::initseg
 { my $f=$nmdecor.shift;
 
-    if ($::elf)
+    if ($::android)
+    {	$initseg.=<<___;
+.section	.init_array
+.align	4
+.long	$f
+___
+    }
+    elsif ($::elf)
     {	$initseg.=<<___;
 .section	.init
 	call	$f
-	jmp	.Linitalign
-.align	$align
-.Linitalign:
 ___
     }
     elsif ($::coff)
diff --git a/src/lib/libcrypto/pkcs12/p12_decr.c b/src/lib/libcrypto/pkcs12/p12_decr.c
index ba77dbbe32..9d3557e8d7 100644
--- a/src/lib/libcrypto/pkcs12/p12_decr.c
+++ b/src/lib/libcrypto/pkcs12/p12_decr.c
@@ -89,7 +89,14 @@ unsigned char * PKCS12_pbe_crypt(X509_ALGOR *algor, const char *pass,
 		goto err;
 	}
 
-	EVP_CipherUpdate(&ctx, out, &i, in, inlen);
+	if (!EVP_CipherUpdate(&ctx, out, &i, in, inlen))
+		{
+		OPENSSL_free(out);
+		out = NULL;
+		PKCS12err(PKCS12_F_PKCS12_PBE_CRYPT,ERR_R_EVP_LIB);
+		goto err;
+		}
+
 	outlen = i;
 	if(!EVP_CipherFinal_ex(&ctx, out + i, &i)) {
 		OPENSSL_free(out);
diff --git a/src/lib/libcrypto/pkcs12/p12_key.c b/src/lib/libcrypto/pkcs12/p12_key.c
index 424203f648..c55c7b60b3 100644
--- a/src/lib/libcrypto/pkcs12/p12_key.c
+++ b/src/lib/libcrypto/pkcs12/p12_key.c
@@ -152,14 +152,16 @@ int PKCS12_key_gen_uni(unsigned char *pass, int passlen, unsigned char *salt,
 	for (i = 0; i < Slen; i++) *p++ = salt[i % saltlen];
 	for (i = 0; i < Plen; i++) *p++ = pass[i % passlen];
 	for (;;) {
-		EVP_DigestInit_ex(&ctx, md_type, NULL);
-		EVP_DigestUpdate(&ctx, D, v);
-		EVP_DigestUpdate(&ctx, I, Ilen);
-		EVP_DigestFinal_ex(&ctx, Ai, NULL);
+		if (!EVP_DigestInit_ex(&ctx, md_type, NULL)
+			|| !EVP_DigestUpdate(&ctx, D, v)
+			|| !EVP_DigestUpdate(&ctx, I, Ilen)
+			|| !EVP_DigestFinal_ex(&ctx, Ai, NULL))
+			goto err;
 		for (j = 1; j < iter; j++) {
-			EVP_DigestInit_ex(&ctx, md_type, NULL);
-			EVP_DigestUpdate(&ctx, Ai, u);
-			EVP_DigestFinal_ex(&ctx, Ai, NULL);
+			if (!EVP_DigestInit_ex(&ctx, md_type, NULL)
+				|| !EVP_DigestUpdate(&ctx, Ai, u)
+				|| !EVP_DigestFinal_ex(&ctx, Ai, NULL))
+			goto err;
 		}
 		memcpy (out, Ai, min (n, u));
 		if (u >= n) {
diff --git a/src/lib/libcrypto/pkcs12/p12_kiss.c b/src/lib/libcrypto/pkcs12/p12_kiss.c
index 292cc3ed4a..206b1b0b18 100644
--- a/src/lib/libcrypto/pkcs12/p12_kiss.c
+++ b/src/lib/libcrypto/pkcs12/p12_kiss.c
@@ -167,7 +167,7 @@ int PKCS12_parse(PKCS12 *p12, const char *pass, EVP_PKEY **pkey, X509 **cert,
 	if (cert && *cert)
 		X509_free(*cert);
 	if (x)
-		X509_free(*cert);
+		X509_free(x);
 	if (ocerts)
 		sk_X509_pop_free(ocerts, X509_free);
 	return 0;
diff --git a/src/lib/libcrypto/pkcs12/p12_mutl.c b/src/lib/libcrypto/pkcs12/p12_mutl.c
index 9ab740d51f..96de1bd11e 100644
--- a/src/lib/libcrypto/pkcs12/p12_mutl.c
+++ b/src/lib/libcrypto/pkcs12/p12_mutl.c
@@ -97,10 +97,14 @@ int PKCS12_gen_mac(PKCS12 *p12, const char *pass, int passlen,
 		return 0;
 	}
 	HMAC_CTX_init(&hmac);
-	HMAC_Init_ex(&hmac, key, md_size, md_type, NULL);
-    	HMAC_Update(&hmac, p12->authsafes->d.data->data,
-					 p12->authsafes->d.data->length);
-    	HMAC_Final(&hmac, mac, maclen);
+	if (!HMAC_Init_ex(&hmac, key, md_size, md_type, NULL)
+    		|| !HMAC_Update(&hmac, p12->authsafes->d.data->data,
+					 p12->authsafes->d.data->length)
+    		|| !HMAC_Final(&hmac, mac, maclen))
+		{
+    		HMAC_CTX_cleanup(&hmac);
+		return 0;
+		}
     	HMAC_CTX_cleanup(&hmac);
 	return 1;
 }
diff --git a/src/lib/libcrypto/pkcs7/pk7_doit.c b/src/lib/libcrypto/pkcs7/pk7_doit.c
index 3bf1a367bb..77fda3b82a 100644
--- a/src/lib/libcrypto/pkcs7/pk7_doit.c
+++ b/src/lib/libcrypto/pkcs7/pk7_doit.c
@@ -204,11 +204,11 @@ static int pkcs7_decrypt_rinfo(unsigned char **pek, int *peklen,
 	unsigned char *ek = NULL;
 	size_t eklen;
 
-	int ret = 0;
+	int ret = -1;
 
 	pctx = EVP_PKEY_CTX_new(pkey, NULL);
 	if (!pctx)
-		return 0;
+		return -1;
 
 	if (EVP_PKEY_decrypt_init(pctx) <= 0)
 		goto err;
@@ -235,12 +235,19 @@ static int pkcs7_decrypt_rinfo(unsigned char **pek, int *peklen,
 	if (EVP_PKEY_decrypt(pctx, ek, &eklen,
 				ri->enc_key->data, ri->enc_key->length) <= 0)
 		{
+		ret = 0;
 		PKCS7err(PKCS7_F_PKCS7_DECRYPT_RINFO, ERR_R_EVP_LIB);
 		goto err;
 		}
 
 	ret = 1;
 
+	if (*pek)
+		{
+		OPENSSL_cleanse(*pek, *peklen);
+		OPENSSL_free(*pek);
+		}
+
 	*pek = ek;
 	*peklen = eklen;
 
@@ -423,6 +430,8 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
 	STACK_OF(X509_ALGOR) *md_sk=NULL;
 	STACK_OF(PKCS7_RECIP_INFO) *rsk=NULL;
 	PKCS7_RECIP_INFO *ri=NULL;
+       unsigned char *ek = NULL, *tkey = NULL;
+       int eklen = 0, tkeylen = 0;
 
 	i=OBJ_obj2nid(p7->type);
 	p7->state=PKCS7_S_HEADER;
@@ -500,8 +509,6 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
 		int max;
 		X509_OBJECT ret;
 #endif
-		unsigned char *ek = NULL;
-		int eklen;
 
 		if ((etmp=BIO_new(BIO_f_cipher())) == NULL)
 			{
@@ -534,29 +541,28 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
 			}
 
 		/* If we haven't got a certificate try each ri in turn */
-
 		if (pcert == NULL)
 			{
+			/* Always attempt to decrypt all rinfo even
+			 * after sucess as a defence against MMA timing
+			 * attacks.
+			 */
 			for (i=0; i<sk_PKCS7_RECIP_INFO_num(rsk); i++)
 				{
 				ri=sk_PKCS7_RECIP_INFO_value(rsk,i);
+				
 				if (pkcs7_decrypt_rinfo(&ek, &eklen,
-							ri, pkey) > 0)
-					break;
+							ri, pkey) < 0)
+					goto err;
 				ERR_clear_error();
-				ri = NULL;
-				}
-			if (ri == NULL)
-				{
-				PKCS7err(PKCS7_F_PKCS7_DATADECODE,
-				      PKCS7_R_NO_RECIPIENT_MATCHES_KEY);
-				goto err;
 				}
 			}
 		else
 			{
-			if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey) <= 0)
+			/* Only exit on fatal errors, not decrypt failure */
+			if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey) < 0)
 				goto err;
+			ERR_clear_error();
 			}
 
 		evp_ctx=NULL;
@@ -565,6 +571,19 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
 			goto err;
 		if (EVP_CIPHER_asn1_to_param(evp_ctx,enc_alg->parameter) < 0)
 			goto err;
+		/* Generate random key as MMA defence */
+		tkeylen = EVP_CIPHER_CTX_key_length(evp_ctx);
+		tkey = OPENSSL_malloc(tkeylen);
+		if (!tkey)
+			goto err;
+		if (EVP_CIPHER_CTX_rand_key(evp_ctx, tkey) <= 0)
+			goto err;
+		if (ek == NULL)
+			{
+			ek = tkey;
+			eklen = tkeylen;
+			tkey = NULL;
+			}
 
 		if (eklen != EVP_CIPHER_CTX_key_length(evp_ctx)) {
 			/* Some S/MIME clients don't use the same key
@@ -573,11 +592,16 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
 			 */
 			if(!EVP_CIPHER_CTX_set_key_length(evp_ctx, eklen))
 				{
-				PKCS7err(PKCS7_F_PKCS7_DATADECODE,
-					PKCS7_R_DECRYPTED_KEY_IS_WRONG_LENGTH);
-				goto err;
+				/* Use random key as MMA defence */
+				OPENSSL_cleanse(ek, eklen);
+				OPENSSL_free(ek);
+				ek = tkey;
+				eklen = tkeylen;
+				tkey = NULL;
 				}
 		} 
+		/* Clear errors so we don't leak information useful in MMA */
+		ERR_clear_error();
 		if (EVP_CipherInit_ex(evp_ctx,NULL,NULL,ek,NULL,0) <= 0)
 			goto err;
 
@@ -585,6 +609,13 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
 			{
 			OPENSSL_cleanse(ek,eklen);
 			OPENSSL_free(ek);
+                       ek = NULL;
+			}
+		if (tkey)
+			{
+			OPENSSL_cleanse(tkey,tkeylen);
+			OPENSSL_free(tkey);
+                       tkey = NULL;
 			}
 
 		if (out == NULL)
@@ -627,6 +658,16 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert)
 	if (0)
 		{
 err:
+               if (ek)
+                       {
+                       OPENSSL_cleanse(ek,eklen);
+                       OPENSSL_free(ek);
+                       }
+               if (tkey)
+                       {
+                       OPENSSL_cleanse(tkey,tkeylen);
+                       OPENSSL_free(tkey);
+                       }
 		if (out != NULL) BIO_free_all(out);
 		if (btmp != NULL) BIO_free_all(btmp);
 		if (etmp != NULL) BIO_free_all(etmp);
@@ -676,7 +717,11 @@ static int do_pkcs7_signed_attrib(PKCS7_SIGNER_INFO *si, EVP_MD_CTX *mctx)
 		}
 
 	/* Add digest */
-	EVP_DigestFinal_ex(mctx, md_data,&md_len);
+	if (!EVP_DigestFinal_ex(mctx, md_data,&md_len))
+		{
+		PKCS7err(PKCS7_F_DO_PKCS7_SIGNED_ATTRIB, ERR_R_EVP_LIB);
+		return 0;
+		}
 	if (!PKCS7_add1_attrib_digest(si, md_data, md_len))
 		{
 		PKCS7err(PKCS7_F_DO_PKCS7_SIGNED_ATTRIB, ERR_R_MALLOC_FAILURE);
@@ -784,7 +829,8 @@ int PKCS7_dataFinal(PKCS7 *p7, BIO *bio)
 
 			/* We now have the EVP_MD_CTX, lets do the
 			 * signing. */
-			EVP_MD_CTX_copy_ex(&ctx_tmp,mdc);
+			if (!EVP_MD_CTX_copy_ex(&ctx_tmp,mdc))
+				goto err;
 
 			sk=si->auth_attr;
 
@@ -822,7 +868,8 @@ int PKCS7_dataFinal(PKCS7 *p7, BIO *bio)
 		if (!PKCS7_find_digest(&mdc, bio,
 				OBJ_obj2nid(p7->d.digest->md->algorithm)))
 			goto err;
-		EVP_DigestFinal_ex(mdc,md_data,&md_len);
+		if (!EVP_DigestFinal_ex(mdc,md_data,&md_len))
+			goto err;
 		M_ASN1_OCTET_STRING_set(p7->d.digest->digest, md_data, md_len);
 		}
 
@@ -1015,7 +1062,8 @@ int PKCS7_signatureVerify(BIO *bio, PKCS7 *p7, PKCS7_SIGNER_INFO *si,
 
 	/* mdc is the digest ctx that we want, unless there are attributes,
 	 * in which case the digest is the signed attributes */
-	EVP_MD_CTX_copy_ex(&mdc_tmp,mdc);
+	if (!EVP_MD_CTX_copy_ex(&mdc_tmp,mdc))
+		goto err;
 
 	sk=si->auth_attr;
 	if ((sk != NULL) && (sk_X509_ATTRIBUTE_num(sk) != 0))
@@ -1025,7 +1073,8 @@ int PKCS7_signatureVerify(BIO *bio, PKCS7 *p7, PKCS7_SIGNER_INFO *si,
 		int alen;
 		ASN1_OCTET_STRING *message_digest;
 
-		EVP_DigestFinal_ex(&mdc_tmp,md_dat,&md_len);
+		if (!EVP_DigestFinal_ex(&mdc_tmp,md_dat,&md_len))
+			goto err;
 		message_digest=PKCS7_digest_from_attributes(sk);
 		if (!message_digest)
 			{
@@ -1050,7 +1099,8 @@ for (ii=0; ii<md_len; ii++) printf("%02X",md_dat[ii]); printf(" calc\n");
 			goto err;
 			}
 
-		EVP_VerifyInit_ex(&mdc_tmp,EVP_get_digestbynid(md_type), NULL);
+		if (!EVP_VerifyInit_ex(&mdc_tmp,EVP_get_digestbynid(md_type), NULL))
+			goto err;
 
 		alen = ASN1_item_i2d((ASN1_VALUE *)sk, &abuf,
 						ASN1_ITEM_rptr(PKCS7_ATTR_VERIFY));
@@ -1060,7 +1110,8 @@ for (ii=0; ii<md_len; ii++) printf("%02X",md_dat[ii]); printf(" calc\n");
 			ret = -1;
 			goto err;
 			}
-		EVP_VerifyUpdate(&mdc_tmp, abuf, alen);
+		if (!EVP_VerifyUpdate(&mdc_tmp, abuf, alen))
+			goto err;
 
 		OPENSSL_free(abuf);
 		}
diff --git a/src/lib/libcrypto/pkcs7/pk7_smime.c b/src/lib/libcrypto/pkcs7/pk7_smime.c
index 86742d0dcd..a5104f8d05 100644
--- a/src/lib/libcrypto/pkcs7/pk7_smime.c
+++ b/src/lib/libcrypto/pkcs7/pk7_smime.c
@@ -573,15 +573,34 @@ int PKCS7_decrypt(PKCS7 *p7, EVP_PKEY *pkey, X509 *cert, BIO *data, int flags)
 			return 0;
 		}
 		ret = SMIME_text(bread, data);
+		if (ret > 0 && BIO_method_type(tmpmem) == BIO_TYPE_CIPHER)
+			{
+			if (!BIO_get_cipher_status(tmpmem))
+				ret = 0;
+			}
 		BIO_free_all(bread);
 		return ret;
 	} else {
 		for(;;) {
 			i = BIO_read(tmpmem, buf, sizeof(buf));
-			if(i <= 0) break;
-			BIO_write(data, buf, i);
+			if(i <= 0)
+				{
+				ret = 1;
+				if (BIO_method_type(tmpmem) == BIO_TYPE_CIPHER)
+					{
+					if (!BIO_get_cipher_status(tmpmem))
+						ret = 0;
+					}
+					
+				break;
+				}
+			if (BIO_write(data, buf, i) != i)
+				{
+				ret = 0;
+				break;
+				}
 		}
 		BIO_free_all(tmpmem);
-		return 1;
+		return ret;
 	}
 }
diff --git a/src/lib/libcrypto/ppccap.c b/src/lib/libcrypto/ppccap.c
new file mode 100644
index 0000000000..ab89ccaa12
--- /dev/null
+++ b/src/lib/libcrypto/ppccap.c
@@ -0,0 +1,115 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <crypto.h>
+#include <openssl/bn.h>
+
+#define PPC_FPU64	(1<<0)
+#define PPC_ALTIVEC	(1<<1)
+
+static int OPENSSL_ppccap_P = 0;
+
+static sigset_t all_masked;
+
+#ifdef OPENSSL_BN_ASM_MONT
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num)
+	{
+	int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
+	int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
+
+	if (sizeof(size_t)==4)
+		{
+#if (defined(__APPLE__) && defined(__MACH__))
+		if (num>=8 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64))
+			return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num);
+#else
+		/* boundary of 32 was experimentally determined on
+		   Linux 2.6.22, might have to be adjusted on AIX... */
+		if (num>=32 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64))
+			{
+			sigset_t oset;
+			int ret;
+
+			sigprocmask(SIG_SETMASK,&all_masked,&oset);
+			ret=bn_mul_mont_fpu64(rp,ap,bp,np,n0,num);
+			sigprocmask(SIG_SETMASK,&oset,NULL);
+
+			return ret;
+			}
+#endif
+		}
+	else if ((OPENSSL_ppccap_P&PPC_FPU64))
+		/* this is a "must" on POWER6, but run-time detection
+		 * is not implemented yet... */
+		return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num);
+
+	return bn_mul_mont_int(rp,ap,bp,np,n0,num);
+	}
+#endif
+
+static sigjmp_buf ill_jmp;
+static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
+
+void OPENSSL_ppc64_probe(void);
+
+void OPENSSL_cpuid_setup(void)
+	{
+	char *e;
+	struct sigaction	ill_oact,ill_act;
+	sigset_t		oset;
+	static int trigger=0;
+
+	if (trigger) return;
+	trigger=1;
+ 
+	sigfillset(&all_masked);
+	sigdelset(&all_masked,SIGILL);
+	sigdelset(&all_masked,SIGTRAP);
+#ifdef SIGEMT
+	sigdelset(&all_masked,SIGEMT);
+#endif
+	sigdelset(&all_masked,SIGFPE);
+	sigdelset(&all_masked,SIGBUS);
+	sigdelset(&all_masked,SIGSEGV);
+
+	if ((e=getenv("OPENSSL_ppccap")))
+		{
+		OPENSSL_ppccap_P=strtoul(e,NULL,0);
+		return;
+		}
+
+	OPENSSL_ppccap_P = 0;
+
+	memset(&ill_act,0,sizeof(ill_act));
+	ill_act.sa_handler = ill_handler;
+	ill_act.sa_mask    = all_masked;
+
+	sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
+	sigaction(SIGILL,&ill_act,&ill_oact);
+
+	if (sizeof(size_t)==4)
+		{
+		if (sigsetjmp(ill_jmp,1) == 0)
+			{
+			OPENSSL_ppc64_probe();
+			OPENSSL_ppccap_P |= PPC_FPU64;
+			}
+		}
+	else
+		{
+		/*
+		 * Wanted code detecting POWER6 CPU and setting PPC_FPU64
+		 */
+		}
+
+	if (sigsetjmp(ill_jmp,1) == 0)
+		{
+		OPENSSL_altivec_probe();
+		OPENSSL_ppccap_P |= PPC_ALTIVEC;
+		}
+
+	sigaction (SIGILL,&ill_oact,NULL);
+	sigprocmask(SIG_SETMASK,&oset,NULL);
+	}
diff --git a/src/lib/libcrypto/ppccpuid.pl b/src/lib/libcrypto/ppccpuid.pl
index 369e1d0df9..4ba736a1d1 100755
--- a/src/lib/libcrypto/ppccpuid.pl
+++ b/src/lib/libcrypto/ppccpuid.pl
@@ -23,36 +23,67 @@ $code=<<___;
 .machine	"any"
 .text
 
-.globl	.OPENSSL_cpuid_setup
+.globl	.OPENSSL_ppc64_probe
 .align	4
-.OPENSSL_cpuid_setup:
+.OPENSSL_ppc64_probe:
+	fcfid	f1,f1
+	extrdi	r0,r0,32,0
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+
+.globl	.OPENSSL_altivec_probe
+.align	4
+.OPENSSL_altivec_probe:
+	.long	0x10000484	# vor	v0,v0,v0
+	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .globl	.OPENSSL_wipe_cpu
 .align	4
 .OPENSSL_wipe_cpu:
 	xor	r0,r0,r0
+	fmr	f0,f31
+	fmr	f1,f31
+	fmr	f2,f31
 	mr	r3,r1
+	fmr	f3,f31
 	xor	r4,r4,r4
+	fmr	f4,f31
 	xor	r5,r5,r5
+	fmr	f5,f31
 	xor	r6,r6,r6
+	fmr	f6,f31
 	xor	r7,r7,r7
+	fmr	f7,f31
 	xor	r8,r8,r8
+	fmr	f8,f31
 	xor	r9,r9,r9
+	fmr	f9,f31
 	xor	r10,r10,r10
+	fmr	f10,f31
 	xor	r11,r11,r11
+	fmr	f11,f31
 	xor	r12,r12,r12
+	fmr	f12,f31
+	fmr	f13,f31
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .globl	.OPENSSL_atomic_add
 .align	4
 .OPENSSL_atomic_add:
-Loop:	lwarx	r5,0,r3
+Ladd:	lwarx	r5,0,r3
 	add	r0,r4,r5
 	stwcx.	r0,0,r3
-	bne-	Loop
+	bne-	Ladd
 	$SIGNX	r3,r0
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,2,0
+	.long	0
 
 .globl	.OPENSSL_rdtsc
 .align	4
@@ -60,6 +91,8 @@ Loop:	lwarx	r5,0,r3
 	mftb	r3
 	mftbu	r4
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 
 .globl	.OPENSSL_cleanse
 .align	4
@@ -72,7 +105,7 @@ Loop:	lwarx	r5,0,r3
 Little:	mtctr	r4
 	stb	r0,0(r3)
 	addi	r3,r3,1
-	bdnz-	\$-8
+	bdnz	\$-8
 	blr
 Lot:	andi.	r5,r3,3
 	beq	Laligned
@@ -85,10 +118,13 @@ Laligned:
 	mtctr	r5
 	stw	r0,0(r3)
 	addi	r3,r3,4
-	bdnz-	\$-8
+	bdnz	\$-8
 	andi.	r4,r4,3
 	bne	Little
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,2,0
+	.long	0
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/src/lib/libcrypto/rand/rand.h b/src/lib/libcrypto/rand/rand.h
index ac6c021763..dc8fcf94c5 100644
--- a/src/lib/libcrypto/rand/rand.h
+++ b/src/lib/libcrypto/rand/rand.h
@@ -119,6 +119,11 @@ int RAND_event(UINT, WPARAM, LPARAM);
 
 #endif
 
+#ifdef OPENSSL_FIPS
+void RAND_set_fips_drbg_type(int type, int flags);
+int RAND_init_fips(void);
+#endif
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -129,9 +134,13 @@ void ERR_load_RAND_strings(void);
 
 /* Function codes. */
 #define RAND_F_RAND_GET_RAND_METHOD			 101
+#define RAND_F_RAND_INIT_FIPS				 102
 #define RAND_F_SSLEAY_RAND_BYTES			 100
 
 /* Reason codes. */
+#define RAND_R_ERROR_INITIALISING_DRBG			 102
+#define RAND_R_ERROR_INSTANTIATING_DRBG			 103
+#define RAND_R_NO_FIPS_RANDOM_METHOD_SET		 101
 #define RAND_R_PRNG_NOT_SEEDED				 100
 
 #ifdef  __cplusplus
diff --git a/src/lib/libcrypto/rand/rand_err.c b/src/lib/libcrypto/rand/rand_err.c
index 03cda4dd92..b8586c8f4a 100644
--- a/src/lib/libcrypto/rand/rand_err.c
+++ b/src/lib/libcrypto/rand/rand_err.c
@@ -1,6 +1,6 @@
 /* crypto/rand/rand_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2006 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -71,12 +71,16 @@
 static ERR_STRING_DATA RAND_str_functs[]=
 	{
 {ERR_FUNC(RAND_F_RAND_GET_RAND_METHOD),	"RAND_get_rand_method"},
+{ERR_FUNC(RAND_F_RAND_INIT_FIPS),	"RAND_init_fips"},
 {ERR_FUNC(RAND_F_SSLEAY_RAND_BYTES),	"SSLEAY_RAND_BYTES"},
 {0,NULL}
 	};
 
 static ERR_STRING_DATA RAND_str_reasons[]=
 	{
+{ERR_REASON(RAND_R_ERROR_INITIALISING_DRBG),"error initialising drbg"},
+{ERR_REASON(RAND_R_ERROR_INSTANTIATING_DRBG),"error instantiating drbg"},
+{ERR_REASON(RAND_R_NO_FIPS_RANDOM_METHOD_SET),"no fips random method set"},
 {ERR_REASON(RAND_R_PRNG_NOT_SEEDED)      ,"PRNG not seeded"},
 {0,NULL}
 	};
diff --git a/src/lib/libcrypto/rand/rand_lib.c b/src/lib/libcrypto/rand/rand_lib.c
index 513e338985..daf1dab973 100644
--- a/src/lib/libcrypto/rand/rand_lib.c
+++ b/src/lib/libcrypto/rand/rand_lib.c
@@ -60,10 +60,16 @@
 #include <time.h>
 #include "cryptlib.h"
 #include <openssl/rand.h>
+
 #ifndef OPENSSL_NO_ENGINE
 #include <openssl/engine.h>
 #endif
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#include <openssl/fips_rand.h>
+#endif
+
 #ifndef OPENSSL_NO_ENGINE
 /* non-NULL if default_RAND_meth is ENGINE-provided */
 static ENGINE *funct_ref =NULL;
@@ -174,3 +180,116 @@ int RAND_status(void)
 		return meth->status();
 	return 0;
 	}
+
+#ifdef OPENSSL_FIPS
+
+/* FIPS DRBG initialisation code. This sets up the DRBG for use by the
+ * rest of OpenSSL. 
+ */
+
+/* Entropy gatherer: use standard OpenSSL PRNG to seed (this will gather
+ * entropy internally through RAND_poll().
+ */
+
+static size_t drbg_get_entropy(DRBG_CTX *ctx, unsigned char **pout,
+                                int entropy, size_t min_len, size_t max_len)
+        {
+	/* Round up request to multiple of block size */
+	min_len = ((min_len + 19) / 20) * 20;
+	*pout = OPENSSL_malloc(min_len);
+	if (!*pout)
+		return 0;
+	if (RAND_SSLeay()->bytes(*pout, min_len) <= 0)
+		{
+		OPENSSL_free(*pout);
+		*pout = NULL;
+		return 0;
+		}
+        return min_len;
+        }
+
+static void drbg_free_entropy(DRBG_CTX *ctx, unsigned char *out, size_t olen)
+	{
+	OPENSSL_cleanse(out, olen);
+	OPENSSL_free(out);
+	}
+
+/* Set "additional input" when generating random data. This uses the
+ * current PID, a time value and a counter.
+ */
+
+static size_t drbg_get_adin(DRBG_CTX *ctx, unsigned char **pout)
+    	{
+	/* Use of static variables is OK as this happens under a lock */
+	static unsigned char buf[16];
+	static unsigned long counter;
+	FIPS_get_timevec(buf, &counter);
+	*pout = buf;
+	return sizeof(buf);
+	}
+
+/* RAND_add() and RAND_seed() pass through to OpenSSL PRNG so it is 
+ * correctly seeded by RAND_poll().
+ */
+
+static int drbg_rand_add(DRBG_CTX *ctx, const void *in, int inlen,
+				double entropy)
+	{
+	RAND_SSLeay()->add(in, inlen, entropy);
+	return 1;
+	}
+
+static int drbg_rand_seed(DRBG_CTX *ctx, const void *in, int inlen)
+	{
+	RAND_SSLeay()->seed(in, inlen);
+	return 1;
+	}
+
+#ifndef OPENSSL_DRBG_DEFAULT_TYPE
+#define OPENSSL_DRBG_DEFAULT_TYPE	NID_aes_256_ctr
+#endif
+#ifndef OPENSSL_DRBG_DEFAULT_FLAGS
+#define OPENSSL_DRBG_DEFAULT_FLAGS	DRBG_FLAG_CTR_USE_DF
+#endif 
+
+static int fips_drbg_type = OPENSSL_DRBG_DEFAULT_TYPE;
+static int fips_drbg_flags = OPENSSL_DRBG_DEFAULT_FLAGS;
+
+void RAND_set_fips_drbg_type(int type, int flags)
+	{
+	fips_drbg_type = type;
+	fips_drbg_flags = flags;
+	}
+
+int RAND_init_fips(void)
+	{
+	DRBG_CTX *dctx;
+	size_t plen;
+	unsigned char pers[32], *p;
+	dctx = FIPS_get_default_drbg();
+        if (FIPS_drbg_init(dctx, fips_drbg_type, fips_drbg_flags) <= 0)
+		{
+		RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_ERROR_INITIALISING_DRBG);
+		return 0;
+		}
+		
+        FIPS_drbg_set_callbacks(dctx,
+				drbg_get_entropy, drbg_free_entropy, 20,
+				drbg_get_entropy, drbg_free_entropy);
+	FIPS_drbg_set_rand_callbacks(dctx, drbg_get_adin, 0,
+					drbg_rand_seed, drbg_rand_add);
+	/* Personalisation string: a string followed by date time vector */
+	strcpy((char *)pers, "OpenSSL DRBG2.0");
+	plen = drbg_get_adin(dctx, &p);
+	memcpy(pers + 16, p, plen);
+
+        if (FIPS_drbg_instantiate(dctx, pers, sizeof(pers)) <= 0)
+		{
+		RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_ERROR_INSTANTIATING_DRBG);
+		return 0;
+		}
+        FIPS_rand_set_method(FIPS_drbg_method());
+	return 1;
+	}
+
+#endif
diff --git a/src/lib/libcrypto/rand/randfile.c b/src/lib/libcrypto/rand/randfile.c
index bc7d9c5804..030e07f418 100644
--- a/src/lib/libcrypto/rand/randfile.c
+++ b/src/lib/libcrypto/rand/randfile.c
@@ -137,7 +137,7 @@ int RAND_load_file(const char *file, long bytes)
 	in=fopen(file,"rb");
 #endif
 	if (in == NULL) goto err;
-#if defined(S_IFBLK) && defined(S_IFCHR) && !defined(OPNESSL_NO_POSIX_IO)
+#if defined(S_IFBLK) && defined(S_IFCHR) && !defined(OPENSSL_NO_POSIX_IO)
 	if (sb.st_mode & (S_IFBLK | S_IFCHR)) {
 	  /* this file is a device. we don't want read an infinite number
 	   * of bytes from a random device, nor do we want to use buffered
diff --git a/src/lib/libcrypto/rc2/rc2.h b/src/lib/libcrypto/rc2/rc2.h
index 34c8362317..e542ec94ff 100644
--- a/src/lib/libcrypto/rc2/rc2.h
+++ b/src/lib/libcrypto/rc2/rc2.h
@@ -79,7 +79,9 @@ typedef struct rc2_key_st
 	RC2_INT data[64];
 	} RC2_KEY;
 
- 
+#ifdef OPENSSL_FIPS 
+void private_RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
+#endif
 void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits);
 void RC2_ecb_encrypt(const unsigned char *in,unsigned char *out,RC2_KEY *key,
 		     int enc);
diff --git a/src/lib/libcrypto/rc2/rc2_skey.c b/src/lib/libcrypto/rc2/rc2_skey.c
index 0150b0e035..6668ac011f 100644
--- a/src/lib/libcrypto/rc2/rc2_skey.c
+++ b/src/lib/libcrypto/rc2/rc2_skey.c
@@ -56,6 +56,7 @@
  * [including the GNU Public Licence.]
  */
 
+#include <openssl/crypto.h>
 #include <openssl/rc2.h>
 #include "rc2_locl.h"
 
@@ -95,6 +96,13 @@ static const unsigned char key_table[256]={
  * the same as specifying 1024 for the 'bits' parameter.  Bsafe uses
  * a version where the bits parameter is the same as len*8 */
 void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data, int bits)
+#ifdef OPENSSL_FIPS
+	{
+	fips_cipher_abort(RC2);
+	private_RC2_set_key(key, len, data, bits);
+	}
+void private_RC2_set_key(RC2_KEY *key, int len, const unsigned char *data, int bits)
+#endif
 	{
 	int i,j;
 	unsigned char *k;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl
index 38a44a70ef..5c9ac6ad28 100644
--- a/src/lib/libcrypto/rc4/asm/rc4-586.pl
+++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl
@@ -28,6 +28,34 @@
 #
 #					<appro@fy.chalmers.se>
 
+# May 2011
+#
+# Optimize for Core2 and Westmere [and incidentally Opteron]. Current
+# performance in cycles per processed byte (less is better) and
+# improvement relative to previous version of this module is:
+#
+# Pentium	10.2			# original numbers
+# Pentium III	7.8(*)
+# Intel P4	7.5
+#
+# Opteron	6.1/+20%		# new MMX numbers
+# Core2		5.3/+67%(**)
+# Westmere	5.1/+94%(**)
+# Sandy Bridge	5.0/+8%
+# Atom		12.6/+6%
+#
+# (*)	PIII can actually deliver 6.6 cycles per byte with MMX code,
+#	but this specific code performs poorly on Core2. And vice
+#	versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs
+#	poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU
+#	[anymore], I chose to discard PIII-specific code path and opt
+#	for original IALU-only code, which is why MMX/SSE code path
+#	is guarded by SSE2 bit (see below), not MMX/SSE.
+# (**)	Performance vs. block size on Core2 and Westmere had a maximum
+#	at ... 64 bytes block size. And it was quite a maximum, 40-60%
+#	in comparison to largest 8KB block size. Above improvement
+#	coefficients are for the largest block size.
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
@@ -62,6 +90,68 @@ sub RC4_loop {
 	&$func	($out,&DWP(0,$dat,$ty,4));
 }
 
+if ($alt=0) {
+  # >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron,
+  # but ~40% slower on Core2 and Westmere... Attempt to add movz
+  # brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet
+  # on Core2 with movz it's almost 20% slower than below alternative
+  # code... Yes, it's a total mess...
+  my @XX=($xx,$out);
+  $RC4_loop_mmx = sub {		# SSE actually...
+    my $i=shift;
+    my $j=$i<=0?0:$i>>1;
+    my $mm=$i<=0?"mm0":"mm".($i&1);
+
+	&add	(&LB($yy),&LB($tx));
+	&lea	(@XX[1],&DWP(1,@XX[0]));
+	&pxor	("mm2","mm0")				if ($i==0);
+	&psllq	("mm1",8)				if ($i==0);
+	&and	(@XX[1],0xff);
+	&pxor	("mm0","mm0")				if ($i<=0);
+	&mov	($ty,&DWP(0,$dat,$yy,4));
+	&mov	(&DWP(0,$dat,$yy,4),$tx);
+	&pxor	("mm1","mm2")				if ($i==0);
+	&mov	(&DWP(0,$dat,$XX[0],4),$ty);
+	&add	(&LB($ty),&LB($tx));
+	&movd	(@XX[0],"mm7")				if ($i==0);
+	&mov	($tx,&DWP(0,$dat,@XX[1],4));
+	&pxor	("mm1","mm1")				if ($i==1);
+	&movq	("mm2",&QWP(0,$inp))			if ($i==1);
+	&movq	(&QWP(-8,(@XX[0],$inp)),"mm1")		if ($i==0);
+	&pinsrw	($mm,&DWP(0,$dat,$ty,4),$j);
+
+	push	(@XX,shift(@XX))			if ($i>=0);
+  }
+} else {
+  # Using pinsrw here improves performane on Intel CPUs by 2-3%, but
+  # brings down AMD by 7%...
+  $RC4_loop_mmx = sub {
+    my $i=shift;
+
+	&add	(&LB($yy),&LB($tx));
+	&psllq	("mm1",8*(($i-1)&7))			if (abs($i)!=1);
+	&mov	($ty,&DWP(0,$dat,$yy,4));
+	&mov	(&DWP(0,$dat,$yy,4),$tx);
+	&mov	(&DWP(0,$dat,$xx,4),$ty);
+	&inc	($xx);
+	&add	($ty,$tx);
+	&movz	($xx,&LB($xx));				# (*)
+	&movz	($ty,&LB($ty));				# (*)
+	&pxor	("mm2",$i==1?"mm0":"mm1")		if ($i>=0);
+	&movq	("mm0",&QWP(0,$inp))			if ($i<=0);
+	&movq	(&QWP(-8,($out,$inp)),"mm2")		if ($i==0);
+	&mov	($tx,&DWP(0,$dat,$xx,4));
+	&movd	($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));
+
+	# (*)	This is the key to Core2 and Westmere performance.
+	#	Whithout movz out-of-order execution logic confuses
+	#	itself and fails to reorder loads and stores. Problem
+	#	appears to be fixed in Sandy Bridge...
+  }
+}
+
+&external_label("OPENSSL_ia32cap_P");
+
 # void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out);
 &function_begin("RC4");
 	&mov	($dat,&wparam(0));	# load key schedule pointer
@@ -94,11 +184,56 @@ sub RC4_loop {
 	&and	($ty,-4);		# how many 4-byte chunks?
 	&jz	(&label("loop1"));
 
+	&test	($ty,-8);
+	&mov	(&wparam(3),$out);	# $out as accumulator in these loops
+	&jz	(&label("go4loop4"));
+
+	&picmeup($out,"OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,$out),26);	# check SSE2 bit [could have been MMX]
+	&jnc	(&label("go4loop4"));
+
+	&mov	($out,&wparam(3))	if (!$alt);
+	&movd	("mm7",&wparam(3))	if ($alt);
+	&and	($ty,-8);
+	&lea	($ty,&DWP(-8,$inp,$ty));
+	&mov	(&DWP(-4,$dat),$ty);	# save input+(len/8)*8-8
+
+	&$RC4_loop_mmx(-1);
+	&jmp(&label("loop_mmx_enter"));
+
+	&set_label("loop_mmx",16);
+		&$RC4_loop_mmx(0);
+	&set_label("loop_mmx_enter");
+		for 	($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
+		&mov	($ty,$yy);
+		&xor	($yy,$yy);		# this is second key to Core2
+		&mov	(&LB($yy),&LB($ty));	# and Westmere performance...
+		&cmp	($inp,&DWP(-4,$dat));
+		&lea	($inp,&DWP(8,$inp));
+	&jb	(&label("loop_mmx"));
+
+    if ($alt) {
+	&movd	($out,"mm7");
+	&pxor	("mm2","mm0");
+	&psllq	("mm1",8);
+	&pxor	("mm1","mm2");
+	&movq	(&QWP(-8,$out,$inp),"mm1");
+    } else {
+	&psllq	("mm1",56);
+	&pxor	("mm2","mm1");
+	&movq	(&QWP(-8,$out,$inp),"mm2");
+    }
+	&emms	();
+
+	&cmp	($inp,&wparam(1));	# compare to input+len
+	&je	(&label("done"));
+	&jmp	(&label("loop1"));
+
+&set_label("go4loop4",16);
 	&lea	($ty,&DWP(-4,$inp,$ty));
 	&mov	(&wparam(2),$ty);	# save input+(len/4)*4-4
-	&mov	(&wparam(3),$out);	# $out as accumulator in this loop
 
-	&set_label("loop4",16);
+	&set_label("loop4");
 		for ($i=0;$i<4;$i++) { RC4_loop($i); }
 		&ror	($out,8);
 		&xor	($out,&DWP(0,$inp));
@@ -151,7 +286,7 @@ sub RC4_loop {
 
 &set_label("done");
 	&dec	(&LB($xx));
-	&mov	(&BP(-4,$dat),&LB($yy));	# save key->y
+	&mov	(&DWP(-4,$dat),$yy);		# save key->y
 	&mov	(&BP(-8,$dat),&LB($xx));	# save key->x
 &set_label("abort");
 &function_end("RC4");
@@ -164,10 +299,8 @@ $idi="ebp";
 $ido="ecx";
 $idx="edx";
 
-&external_label("OPENSSL_ia32cap_P");
-
 # void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
-&function_begin("RC4_set_key");
+&function_begin("private_RC4_set_key");
 	&mov	($out,&wparam(0));		# load key
 	&mov	($idi,&wparam(1));		# load len
 	&mov	($inp,&wparam(2));		# load data
@@ -245,7 +378,7 @@ $idx="edx";
 	&xor	("eax","eax");
 	&mov	(&DWP(-8,$out),"eax");		# key->x=0;
 	&mov	(&DWP(-4,$out),"eax");		# key->y=0;
-&function_end("RC4_set_key");
+&function_end("private_RC4_set_key");
 
 # const char *RC4_options(void);
 &function_begin_B("RC4_options");
@@ -254,14 +387,21 @@ $idx="edx";
 	&blindpop("eax");
 	&lea	("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
 	&picmeup("edx","OPENSSL_ia32cap_P");
-	&bt	(&DWP(0,"edx"),20);
-	&jnc	(&label("skip"));
-	  &add	("eax",12);
-	&set_label("skip");
+	&mov	("edx",&DWP(0,"edx"));
+	&bt	("edx",20);
+	&jc	(&label("1xchar"));
+	&bt	("edx",26);
+	&jnc	(&label("ret"));
+	&add	("eax",25);
+	&ret	();
+&set_label("1xchar");
+	&add	("eax",12);
+&set_label("ret");
 	&ret	();
 &set_label("opts",64);
 &asciz	("rc4(4x,int)");
 &asciz	("rc4(1x,char)");
+&asciz	("rc4(8x,mmx)");
 &asciz	("RC4 for x86, CRYPTOGAMS by <appro\@openssl.org>");
 &align	(64);
 &function_end_B("RC4_options");
diff --git a/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl
new file mode 100644
index 0000000000..7f684092d4
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl
@@ -0,0 +1,631 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# June 2011
+#
+# This is RC4+MD5 "stitch" implementation. The idea, as spelled in
+# http://download.intel.com/design/intarch/papers/323686.pdf, is that
+# since both algorithms exhibit instruction-level parallelism, ILP,
+# below theoretical maximum, interleaving them would allow to utilize
+# processor resources better and achieve better performance. RC4
+# instruction sequence is virtually identical to rc4-x86_64.pl, which
+# is heavily based on submission by Maxim Perminov, Maxim Locktyukhin
+# and Jim Guilford of Intel. MD5 is fresh implementation aiming to
+# minimize register usage, which was used as "main thread" with RC4
+# weaved into it, one RC4 round per one MD5 round. In addition to the
+# stiched subroutine the script can generate standalone replacement
+# md5_block_asm_data_order and RC4. Below are performance numbers in
+# cycles per processed byte, less is better, for these the standalone
+# subroutines, sum of them, and stitched one:
+#
+#		RC4	MD5	RC4+MD5	stitch	gain
+# Opteron	6.5(*)	5.4	11.9	7.0	+70%(*)
+# Core2		6.5	5.8	12.3	7.7	+60%
+# Westmere	4.3	5.2	9.5	7.0	+36%
+# Sandy Bridge	4.2	5.5	9.7	6.8	+43%
+# Atom		9.3	6.5	15.8	11.1	+42%
+#
+# (*)	rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
+#	is +53%...
+
+my ($rc4,$md5)=(1,1);	# what to generate?
+my $D="#" if (!$md5);	# if set to "#", MD5 is stitched into RC4(),
+			# but its result is discarded. Idea here is
+			# to be able to use 'openssl speed rc4' for
+			# benchmarking the stitched subroutine... 
+
+my $flavour = shift;
+my $output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs);
+
+if ($rc4 && !$md5) {
+  ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx");
+  $func="RC4";				$nargs=4;
+} elsif ($md5 && !$rc4) {
+  ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx");
+  $func="md5_block_asm_data_order";	$nargs=3;
+} else {
+  ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+  $func="rc4_md5_enc";			$nargs=6;
+  # void rc4_md5_enc(
+  #		RC4_KEY *key,		#
+  #		const void *in0,	# RC4 input
+  #		void *out,		# RC4 output
+  #		MD5_CTX *ctx,		#
+  #		const void *inp,	# MD5 input
+  #		size_t len);		# number of 64-byte blocks
+}
+
+my @K=(	0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
+	0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
+	0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
+	0x6b901122,0xfd987193,0xa679438e,0x49b40821,
+
+	0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
+	0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
+	0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
+	0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
+
+	0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
+	0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
+	0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
+	0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
+
+	0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
+	0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
+	0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
+	0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391	);
+
+my @V=("%r8d","%r9d","%r10d","%r11d");	# MD5 registers
+my $tmp="%r12d";
+
+my @XX=("%rbp","%rsi");			# RC4 registers
+my @TX=("%rax","%rbx");
+my $YY="%rcx";
+my $TY="%rdx";
+
+my $MOD=32;				# 16, 32 or 64
+
+$code.=<<___;
+.text
+.align 16
+
+.globl	$func
+.type	$func,\@function,$nargs
+$func:
+	cmp	\$0,$len
+	je	.Labort
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	sub	\$40,%rsp
+.Lbody:
+___
+if ($rc4) {
+$code.=<<___;
+$D#md5#	mov	$ctx,%r11		# reassign arguments
+	mov	$len,%r12
+	mov	$in0,%r13
+	mov	$out,%r14
+$D#md5#	mov	$inp,%r15
+___
+    $ctx="%r11"	if ($md5);		# reassign arguments
+    $len="%r12";
+    $in0="%r13";
+    $out="%r14";
+    $inp="%r15"	if ($md5);
+    $inp=$in0	if (!$md5);
+$code.=<<___;
+	xor	$XX[0],$XX[0]
+	xor	$YY,$YY
+
+	lea	8($dat),$dat
+	mov	-8($dat),$XX[0]#b
+	mov	-4($dat),$YY#b
+
+	inc	$XX[0]#b
+	sub	$in0,$out
+	movl	($dat,$XX[0],4),$TX[0]#d
+___
+$code.=<<___ if (!$md5);
+	xor	$TX[1],$TX[1]
+	test	\$-128,$len
+	jz	.Loop1
+	sub	$XX[0],$TX[1]
+	and	\$`$MOD-1`,$TX[1]
+	jz	.Loop${MOD}_is_hot
+	sub	$TX[1],$len
+.Loop${MOD}_warmup:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($in0),$TY#b
+	movb	$TY#b,($out,$in0)
+	lea	1($in0),$in0
+	dec	$TX[1]
+	jnz	.Loop${MOD}_warmup
+
+	mov	$YY,$TX[1]
+	xor	$YY,$YY
+	mov	$TX[1]#b,$YY#b
+
+.Loop${MOD}_is_hot:
+	mov	$len,32(%rsp)		# save original $len
+	shr	\$6,$len		# number of 64-byte blocks
+___
+  if ($D && !$md5) {			# stitch in dummy MD5
+    $md5=1;
+    $ctx="%r11";
+    $inp="%r15";
+    $code.=<<___;
+	mov	%rsp,$ctx
+	mov	$in0,$inp
+___
+  }
+}
+$code.=<<___;
+#rc4#	add	$TX[0]#b,$YY#b
+#rc4#	lea	($dat,$XX[0],4),$XX[1]
+	shl	\$6,$len
+	add	$inp,$len		# pointer to the end of input
+	mov	$len,16(%rsp)
+
+#md5#	mov	$ctx,24(%rsp)		# save pointer to MD5_CTX
+#md5#	mov	0*4($ctx),$V[0]		# load current hash value from MD5_CTX
+#md5#	mov	1*4($ctx),$V[1]
+#md5#	mov	2*4($ctx),$V[2]
+#md5#	mov	3*4($ctx),$V[3]
+	jmp	.Loop
+
+.align	16
+.Loop:
+#md5#	mov	$V[0],0*4(%rsp)		# put aside current hash value
+#md5#	mov	$V[1],1*4(%rsp)
+#md5#	mov	$V[2],2*4(%rsp)
+#md5#	mov	$V[3],$tmp		# forward reference
+#md5#	mov	$V[3],3*4(%rsp)
+___
+
+sub R0 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot0=(7,12,17,22);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	($in0),%xmm2\n"		if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$c,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	and	$b,$tmp
+#md5#	add	4*`$j`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#md5#	xor	$d,$tmp
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#md5#	add	$tmp,$a
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot0[$j%4],$a
+#md5#	mov	`$j==15?"$b":"$c"`,$tmp		# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm2
+	pxor	%xmm1,%xmm2
+___
+}
+sub R1 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot1=(5,9,14,20);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	16($in0),%xmm3\n"	if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$b,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	and	$d,$tmp
+#md5#	add	4*`((1+5*$j)%16)`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#md5#	xor	$c,$tmp
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#md5#	add	$tmp,$a
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot1[$j%4],$a
+#md5#	mov	`$j==15?"$c":"$b"`,$tmp		# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm3
+	pxor	%xmm1,%xmm3
+___
+}
+sub R2 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot2=(4,11,16,23);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	32($in0),%xmm4\n"	if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$c,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	xor	$b,$tmp
+#md5#	add	4*`((5+3*$j)%16)`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#md5#	add	$tmp,$a
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot2[$j%4],$a
+#md5#	mov	`$j==15?"\\\$-1":"$c"`,$tmp	# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1);
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm4
+	pxor	%xmm1,%xmm4
+___
+}
+sub R3 {
+  my ($i,$a,$b,$c,$d)=@_;
+  my @rot3=(6,10,15,21);
+  my $j=$i%16;
+  my $k=$i%$MOD;
+  my $xmm="%xmm".($j&1);
+    $code.="	movdqu	48($in0),%xmm5\n"	if ($rc4 && $j==15);
+    $code.="	add	\$$MOD,$XX[0]#b\n"	if ($rc4 && $j==15 && $k==$MOD-1);
+    $code.="	pxor	$xmm,$xmm\n"		if ($rc4 && $j<=1);
+    $code.=<<___;
+#rc4#	movl	($dat,$YY,4),$TY#d
+#md5#	xor	$d,$tmp
+#rc4#	movl	$TX[0]#d,($dat,$YY,4)
+#md5#	or	$b,$tmp
+#md5#	add	4*`((7*$j)%16)`($inp),$a
+#rc4#	add	$TY#b,$TX[0]#b
+#rc4#	movl	`4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d
+#md5#	add	\$$K[$i],$a
+#rc4#	movz	$TX[0]#b,$TX[0]#d
+#md5#	xor	$c,$tmp
+#rc4#	movl	$TY#d,4*$k($XX[1])
+#md5#	add	$tmp,$a
+#rc4#	add	$TX[1]#b,$YY#b
+#md5#	rol	\$$rot3[$j%4],$a
+#md5#	mov	\$-1,$tmp			# forward reference
+#rc4#	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n
+#md5#	add	$b,$a
+___
+    $code.=<<___ if ($rc4 && $j==15);
+	mov	$XX[0],$XX[1]
+	xor	$XX[0],$XX[0]			# keyword to partial register
+	mov	$XX[1]#b,$XX[0]#b
+	mov	$YY,$XX[1]
+	xor	$YY,$YY				# keyword to partial register
+	mov	$XX[1]#b,$YY#b
+	lea	($dat,$XX[0],4),$XX[1]
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm5
+	pxor	%xmm1,%xmm5
+___
+}
+
+my $i=0;
+for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); }
+
+$code.=<<___;
+#md5#	add	0*4(%rsp),$V[0]		# accumulate hash value
+#md5#	add	1*4(%rsp),$V[1]
+#md5#	add	2*4(%rsp),$V[2]
+#md5#	add	3*4(%rsp),$V[3]
+
+#rc4#	movdqu	%xmm2,($out,$in0)	# write RC4 output
+#rc4#	movdqu	%xmm3,16($out,$in0)
+#rc4#	movdqu	%xmm4,32($out,$in0)
+#rc4#	movdqu	%xmm5,48($out,$in0)
+#md5#	lea	64($inp),$inp
+#rc4#	lea	64($in0),$in0
+	cmp	16(%rsp),$inp		# are we done?
+	jb	.Loop
+
+#md5#	mov	24(%rsp),$len		# restore pointer to MD5_CTX
+#rc4#	sub	$TX[0]#b,$YY#b		# correct $YY
+#md5#	mov	$V[0],0*4($len)		# write MD5_CTX
+#md5#	mov	$V[1],1*4($len)
+#md5#	mov	$V[2],2*4($len)
+#md5#	mov	$V[3],3*4($len)
+___
+$code.=<<___ if ($rc4 && (!$md5 || $D));
+	mov	32(%rsp),$len		# restore original $len
+	and	\$63,$len		# remaining bytes
+	jnz	.Loop1
+	jmp	.Ldone
+	
+.align	16
+.Loop1:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($in0),$TY#b
+	movb	$TY#b,($out,$in0)
+	lea	1($in0),$in0
+	dec	$len
+	jnz	.Loop1
+
+.Ldone:
+___
+$code.=<<___;
+#rc4#	sub	\$1,$XX[0]#b
+#rc4#	movl	$XX[0]#d,-8($dat)
+#rc4#	movl	$YY#d,-4($dat)
+
+	mov	40(%rsp),%r15
+	mov	48(%rsp),%r14
+	mov	56(%rsp),%r13
+	mov	64(%rsp),%r12
+	mov	72(%rsp),%rbp
+	mov	80(%rsp),%rbx
+	lea	88(%rsp),%rsp
+.Lepilogue:
+.Labort:
+	ret
+.size $func,.-$func
+___
+
+if ($rc4 && $D) {	# sole purpose of this section is to provide
+			# option to use the generated module as drop-in
+			# replacement for rc4-x86_64.pl for debugging
+			# and testing purposes...
+my ($idx,$ido)=("%r8","%r9");
+my ($dat,$len,$inp)=("%rdi","%rsi","%rdx");
+
+$code.=<<___;
+.globl	RC4_set_key
+.type	RC4_set_key,\@function,3
+.align	16
+RC4_set_key:
+	lea	8($dat),$dat
+	lea	($inp,$len),$inp
+	neg	$len
+	mov	$len,%rcx
+	xor	%eax,%eax
+	xor	$ido,$ido
+	xor	%r10,%r10
+	xor	%r11,%r11
+	jmp	.Lw1stloop
+
+.align	16
+.Lw1stloop:
+	mov	%eax,($dat,%rax,4)
+	add	\$1,%al
+	jnc	.Lw1stloop
+
+	xor	$ido,$ido
+	xor	$idx,$idx
+.align	16
+.Lw2ndloop:
+	mov	($dat,$ido,4),%r10d
+	add	($inp,$len,1),$idx#b
+	add	%r10b,$idx#b
+	add	\$1,$len
+	mov	($dat,$idx,4),%r11d
+	cmovz	%rcx,$len
+	mov	%r10d,($dat,$idx,4)
+	mov	%r11d,($dat,$ido,4)
+	add	\$1,$ido#b
+	jnc	.Lw2ndloop
+
+	xor	%eax,%eax
+	mov	%eax,-8($dat)
+	mov	%eax,-4($dat)
+	ret
+.size	RC4_set_key,.-RC4_set_key
+
+.globl	RC4_options
+.type	RC4_options,\@abi-omnipotent
+.align	16
+RC4_options:
+	lea	.Lopts(%rip),%rax
+	ret
+.align	64
+.Lopts:
+.asciz	"rc4(64x,int)"
+.align	64
+.size	RC4_options,.-RC4_options
+___
+}
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lbody(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<.Lbody
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
+	jae	.Lin_prologue
+
+	mov	40(%rax),%r15
+	mov	48(%rax),%r14
+	mov	56(%rax),%r13
+	mov	64(%rax),%r12
+	mov	72(%rax),%rbp
+	mov	80(%rax),%rbx
+	lea	88(%rax),%rax
+
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R12
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_$func
+	.rva	.LSEH_end_$func
+	.rva	.LSEH_info_$func
+
+.section	.xdata
+.align	8
+.LSEH_info_$func:
+	.byte	9,0,0,0
+	.rva	se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+    if ($reg =~ /%r[0-9]+/)     { $reg .= $conv; }
+    elsif ($conv eq "b")        { $reg =~ s/%[er]([^x]+)x?/%$1l/;       }
+    elsif ($conv eq "w")        { $reg =~ s/%[er](.+)/%$1/;             }
+    elsif ($conv eq "d")        { $reg =~ s/%[er](.+)/%e$1/;            }
+    return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/pinsrw\s+\$0,/movd	/gm;
+
+$code =~ s/#md5#//gm	if ($md5);
+$code =~ s/#rc4#//gm	if ($rc4);
+
+print $code;
+
+close STDOUT;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-parisc.pl b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl
new file mode 100644
index 0000000000..9165067080
--- /dev/null
+++ b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl
@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# RC4 for PA-RISC.
+
+# June 2009.
+#
+# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
+# For reference, [4x] unrolled loop is >40% faster than folded one.
+# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
+# is believed to be not sufficient to justify the effort...
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+} else {
+	$LEVEL		="1.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+}
+
+$FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
+				#                [+ argument transfer]
+$SZ=1;				# defaults to RC4_CHAR
+if (open CONF,"<${dir}../../opensslconf.h") {
+    while(<CONF>) {
+	if (m/#\s*define\s+RC4_INT\s+(.*)/) {
+	    $SZ = ($1=~/char$/) ? 1 : 4;
+	    last;
+	}
+    }
+    close CONF;
+}
+
+if ($SZ==1) {	# RC4_CHAR
+    $LD="ldb";
+    $LDX="ldbx";
+    $MKX="addl";
+    $ST="stb";
+} else {	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
+    $LD="ldw";
+    $LDX="ldwx,s";
+    $MKX="sh2addl";
+    $ST="stw";
+}
+
+$key="%r26";
+$len="%r25";
+$inp="%r24";
+$out="%r23";
+
+@XX=("%r19","%r20");
+@TX=("%r21","%r22");
+$YY="%r28";
+$TY="%r29";
+
+$acc="%r1";
+$ix="%r2";
+$iy="%r3";
+$dat0="%r4";
+$dat1="%r5";
+$rem="%r6";
+$mask="%r31";
+
+sub unrolledloopbody {
+for ($i=0;$i<4;$i++) {
+$code.=<<___;
+	ldo	1($XX[0]),$XX[1]
+	`sprintf("$LDX	%$TY(%$key),%$dat1") if ($i>0)`	
+	and	$mask,$XX[1],$XX[1]
+	$LDX	$YY($key),$TY
+	$MKX	$YY,$key,$ix
+	$LDX	$XX[1]($key),$TX[1]
+	$MKX	$XX[0],$key,$iy
+	$ST	$TX[0],0($ix)
+	comclr,<> $XX[1],$YY,%r0	; conditional
+	copy	$TX[0],$TX[1]		; move
+	`sprintf("%sdep	%$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
+	$ST	$TY,0($iy)
+	addl	$TX[0],$TY,$TY
+	addl	$TX[1],$YY,$YY
+	and	$mask,$TY,$TY
+	and	$mask,$YY,$YY
+___
+push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
+} }
+
+sub foldedloop {
+my ($label,$count)=@_;
+$code.=<<___;
+$label
+	$MKX	$YY,$key,$iy
+	$LDX	$YY($key),$TY
+	$MKX	$XX[0],$key,$ix
+	$ST	$TX[0],0($iy)
+	ldo	1($XX[0]),$XX[0]
+	$ST	$TY,0($ix)
+	addl	$TX[0],$TY,$TY
+	ldbx	$inp($out),$dat1
+	and	$mask,$TY,$TY
+	and	$mask,$XX[0],$XX[0]
+	$LDX	$TY($key),$acc
+	$LDX	$XX[0]($key),$TX[0]
+	ldo	1($out),$out
+	xor	$dat1,$acc,$acc
+	addl	$TX[0],$YY,$YY
+	stb	$acc,-1($out)
+	addib,<> -1,$count,$label	; $count is always small
+	and	$mask,$YY,$YY
+___
+}
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
+RC4
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+
+	cmpib,*= 0,$len,L\$abort
+	sub	$inp,$out,$inp		; distance between $inp and $out
+
+	$LD	`0*$SZ`($key),$XX[0]
+	$LD	`1*$SZ`($key),$YY
+	ldo	`2*$SZ`($key),$key
+
+	ldi	0xff,$mask
+	ldi	3,$dat0		
+
+	ldo	1($XX[0]),$XX[0]	; warm up loop
+	and	$mask,$XX[0],$XX[0]
+	$LDX	$XX[0]($key),$TX[0]
+	addl	$TX[0],$YY,$YY
+	cmpib,*>>= 6,$len,L\$oop1	; is $len large enough to bother?
+	and	$mask,$YY,$YY
+
+	and,<>	$out,$dat0,$rem		; is $out aligned?
+	b	L\$alignedout
+	subi	4,$rem,$rem
+	sub	$len,$rem,$len
+___
+&foldedloop("L\$alignout",$rem);	# process till $out is aligned
+
+$code.=<<___;
+L\$alignedout				; $len is at least 4 here
+	and,<>	$inp,$dat0,$acc		; is $inp aligned?
+	b	L\$oop4
+	sub	$inp,$acc,$rem		; align $inp
+
+	sh3addl	$acc,%r0,$acc
+	subi	32,$acc,$acc
+	mtctl	$acc,%cr11		; load %sar with vshd align factor
+	ldwx	$rem($out),$dat0
+	ldo	4($rem),$rem
+L\$oop4misalignedinp
+___
+&unrolledloopbody();
+$code.=<<___;
+	$LDX	$TY($key),$ix
+	ldwx	$rem($out),$dat1
+	ldo	-4($len),$len
+	or	$ix,$acc,$acc		; last piece, no need to dep
+	vshd	$dat0,$dat1,$iy		; align data
+	copy	$dat1,$dat0
+	xor	$iy,$acc,$acc
+	stw	$acc,0($out)
+	cmpib,*<< 3,$len,L\$oop4misalignedinp
+	ldo	4($out),$out
+	cmpib,*= 0,$len,L\$done
+	nop
+	b	L\$oop1
+	nop
+
+	.ALIGN	8
+L\$oop4
+___
+&unrolledloopbody();
+$code.=<<___;
+	$LDX	$TY($key),$ix
+	ldwx	$inp($out),$dat0
+	ldo	-4($len),$len
+	or	$ix,$acc,$acc		; last piece, no need to dep
+	xor	$dat0,$acc,$acc
+	stw	$acc,0($out)
+	cmpib,*<< 3,$len,L\$oop4
+	ldo	4($out),$out
+	cmpib,*= 0,$len,L\$done
+	nop
+___
+&foldedloop("L\$oop1",$len);
+$code.=<<___;
+L\$done
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2
+	ldo	-1($XX[0]),$XX[0]	; chill out loop
+	sub	$YY,$TX[0],$YY
+	and	$mask,$XX[0],$XX[0]
+	and	$mask,$YY,$YY
+	$ST	$XX[0],`-2*$SZ`($key)
+	$ST	$YY,`-1*$SZ`($key)
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+L\$abort
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+___
+
+$code.=<<___;
+
+	.EXPORT	private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+	.ALIGN	8
+private_RC4_set_key
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	$ST	%r0,`0*$SZ`($key)
+	$ST	%r0,`1*$SZ`($key)
+	ldo	`2*$SZ`($key),$key
+	copy	%r0,@XX[0]
+L\$1st
+	$ST	@XX[0],0($key)
+	ldo	1(@XX[0]),@XX[0]
+	bb,>=	@XX[0],`31-8`,L\$1st	; @XX[0]<256
+	ldo	$SZ($key),$key
+
+	ldo	`-256*$SZ`($key),$key	; rewind $key
+	addl	$len,$inp,$inp		; $inp to point at the end
+	sub	%r0,$len,%r23		; inverse index
+	copy	%r0,@XX[0]
+	copy	%r0,@XX[1]
+	ldi	0xff,$mask
+
+L\$2nd
+	$LDX	@XX[0]($key),@TX[0]
+	ldbx	%r23($inp),@TX[1]
+	addi,nuv 1,%r23,%r23		; increment and conditional
+	sub	%r0,$len,%r23		; inverse index
+	addl	@TX[0],@XX[1],@XX[1]
+	addl	@TX[1],@XX[1],@XX[1]
+	and	$mask,@XX[1],@XX[1]
+	$MKX	@XX[0],$key,$TY
+	$LDX	@XX[1]($key),@TX[1]
+	$MKX	@XX[1],$key,$YY
+	ldo	1(@XX[0]),@XX[0]
+	$ST	@TX[0],0($YY)
+	bb,>=	@XX[0],`31-8`,L\$2nd	; @XX[0]<256
+	$ST	@TX[1],0($TY)
+
+	bv,n	(%r2)
+	.EXIT
+	nop
+	.PROCEND
+
+	.EXPORT	RC4_options,ENTRY
+	.ALIGN	8
+RC4_options
+	.PROC
+	.CALLINFO	NO_CALLS
+	.ENTRY
+	blr	%r0,%r28
+	ldi	3,%r1
+L\$pic
+	andcm	%r28,%r1,%r28
+	bv	(%r2)
+	.EXIT
+	ldo	L\$opts-L\$pic(%r28),%r28
+	.PROCEND
+	.ALIGN	8
+L\$opts
+	.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
+	.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
+
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl
index 96681fa05e..7528ece13c 100644
--- a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl
+++ b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl
@@ -13,6 +13,29 @@
 # "cluster" Address Generation Interlocks, so that one pipeline stall
 # resolves several dependencies.
 
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 50% better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
 $rp="%r14";
 $sp="%r15";
 $code=<<___;
@@ -39,7 +62,12 @@ $code.=<<___;
 .type	RC4,\@function
 .align	64
 RC4:
-	stmg	%r6,%r11,48($sp)
+	stm${g}	%r6,%r11,6*$SIZE_T($sp)
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+	llgfr	$len,$len
+___
+$code.=<<___;
 	llgc	$XX[0],0($key)
 	llgc	$YY,1($key)
 	la	$XX[0],1($XX[0])
@@ -90,7 +118,7 @@ $code.=<<___;
 	xgr	$acc,$TX[1]
 	stg	$acc,0($out)
 	la	$out,8($out)
-	brct	$cnt,.Loop8
+	brctg	$cnt,.Loop8
 
 .Lshort:
 	lghi	$acc,7
@@ -122,7 +150,7 @@ $code.=<<___;
 	ahi	$XX[0],-1
 	stc	$XX[0],0($key)
 	stc	$YY,1($key)
-	lmg	%r6,%r11,48($sp)
+	lm${g}	%r6,%r11,6*$SIZE_T($sp)
 	br	$rp
 .size	RC4,.-RC4
 .string	"RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
@@ -143,11 +171,11 @@ $ikey="%r7";
 $iinp="%r8";
 
 $code.=<<___;
-.globl	RC4_set_key
-.type	RC4_set_key,\@function
+.globl	private_RC4_set_key
+.type	private_RC4_set_key,\@function
 .align	64
-RC4_set_key:
-	stmg	%r6,%r8,48($sp)
+private_RC4_set_key:
+	stm${g}	%r6,%r8,6*$SIZE_T($sp)
 	lhi	$cnt,256
 	la	$idx,0(%r0)
 	sth	$idx,0($key)
@@ -180,9 +208,9 @@ RC4_set_key:
 	la	$iinp,0(%r0)
 	j	.L2ndloop
 .Ldone:
-	lmg	%r6,%r8,48($sp)
+	lm${g}	%r6,%r8,6*$SIZE_T($sp)
 	br	$rp
-.size	RC4_set_key,.-RC4_set_key
+.size	private_RC4_set_key,.-private_RC4_set_key
 
 ___
 }
@@ -203,3 +231,4 @@ RC4_options:
 ___
 
 print $code;
+close STDOUT;	# force flush
diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
index 677be5fe25..d6eac205e9 100755
--- a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
+++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl
@@ -7,6 +7,8 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
+# July 2004
+#
 # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
 # "hand-coded assembler"] doesn't stand for the whole improvement
 # coefficient. It turned out that eliminating RC4_CHAR from config
@@ -19,6 +21,8 @@
 # to operate on partial registers, it turned out to be the best bet.
 # At least for AMD... How IA32E would perform remains to be seen...
 
+# November 2004
+#
 # As was shown by Marc Bevand reordering of couple of load operations
 # results in even higher performance gain of 3.3x:-) At least on
 # Opteron... For reference, 1x in this case is RC4_CHAR C-code
@@ -26,6 +30,8 @@
 # Latter means that if you want to *estimate* what to expect from
 # *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz.
 
+# November 2004
+#
 # Intel P4 EM64T core was found to run the AMD64 code really slow...
 # The only way to achieve comparable performance on P4 was to keep
 # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
@@ -33,10 +39,14 @@
 # on either AMD and Intel platforms, I implement both cases. See
 # rc4_skey.c for further details...
 
+# April 2005
+#
 # P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing 
 # those with add/sub results in 50% performance improvement of folded
 # loop...
 
+# May 2005
+#
 # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
 # performance by >30% [unlike P4 32-bit case that is]. But this is
 # provided that loads are reordered even more aggressively! Both code
@@ -50,6 +60,8 @@
 # is not implemented, then this final RC4_CHAR code-path should be
 # preferred, as it provides better *all-round* performance].
 
+# March 2007
+#
 # Intel Core2 was observed to perform poorly on both code paths:-( It
 # apparently suffers from some kind of partial register stall, which
 # occurs in 64-bit mode only [as virtually identical 32-bit loop was
@@ -58,6 +70,37 @@
 # fit for Core2 and therefore the code was modified to skip cloop8 on
 # this CPU.
 
+# May 2010
+#
+# Intel Westmere was observed to perform suboptimally. Adding yet
+# another movzb to cloop1 improved performance by almost 50%! Core2
+# performance is improved too, but nominally...
+
+# May 2011
+#
+# The only code path that was not modified is P4-specific one. Non-P4
+# Intel code path optimization is heavily based on submission by Maxim
+# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
+# some of the ideas even in attempt to optmize the original RC4_INT
+# code path... Current performance in cycles per processed byte (less
+# is better) and improvement coefficients relative to previous
+# version of this module are:
+#
+# Opteron	5.3/+0%(*)
+# P4		6.5
+# Core2		6.2/+15%(**)
+# Westmere	4.2/+60%
+# Sandy Bridge	4.2/+120%
+# Atom		9.3/+80%
+#
+# (*)	But corresponding loop has less instructions, which should have
+#	positive effect on upcoming Bulldozer, which has one less ALU.
+#	For reference, Intel code runs at 6.8 cpb rate on Opteron.
+# (**)	Note that Core2 result is ~15% lower than corresponding result
+#	for 32-bit code, meaning that it's possible to improve it,
+#	but more than likely at the cost of the others (see rc4-586.pl
+#	to get the idea)...
+
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -76,13 +119,10 @@ $len="%rsi";	    # arg2
 $inp="%rdx";	    # arg3
 $out="%rcx";	    # arg4
 
-@XX=("%r8","%r10");
-@TX=("%r9","%r11");
-$YY="%r12";
-$TY="%r13";
-
+{
 $code=<<___;
 .text
+.extern	OPENSSL_ia32cap_P
 
 .globl	RC4
 .type	RC4,\@function,4
@@ -95,48 +135,173 @@ RC4:	or	$len,$len
 	push	%r12
 	push	%r13
 .Lprologue:
+	mov	$len,%r11
+	mov	$inp,%r12
+	mov	$out,%r13
+___
+my $len="%r11";		# reassign input arguments
+my $inp="%r12";
+my $out="%r13";
 
-	add	\$8,$dat
-	movl	-8($dat),$XX[0]#d
-	movl	-4($dat),$YY#d
+my @XX=("%r10","%rsi");
+my @TX=("%rax","%rbx");
+my $YY="%rcx";
+my $TY="%rdx";
+
+$code.=<<___;
+	xor	$XX[0],$XX[0]
+	xor	$YY,$YY
+
+	lea	8($dat),$dat
+	mov	-8($dat),$XX[0]#b
+	mov	-4($dat),$YY#b
 	cmpl	\$-1,256($dat)
 	je	.LRC4_CHAR
+	mov	OPENSSL_ia32cap_P(%rip),%r8d
+	xor	$TX[1],$TX[1]
 	inc	$XX[0]#b
+	sub	$XX[0],$TX[1]
+	sub	$inp,$out
 	movl	($dat,$XX[0],4),$TX[0]#d
-	test	\$-8,$len
+	test	\$-16,$len
 	jz	.Lloop1
-	jmp	.Lloop8
+	bt	\$30,%r8d	# Intel CPU?
+	jc	.Lintel
+	and	\$7,$TX[1]
+	lea	1($XX[0]),$XX[1]
+	jz	.Loop8
+	sub	$TX[1],$len
+.Loop8_warmup:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($inp),$TY#b
+	movb	$TY#b,($out,$inp)
+	lea	1($inp),$inp
+	dec	$TX[1]
+	jnz	.Loop8_warmup
+
+	lea	1($XX[0]),$XX[1]
+	jmp	.Loop8
 .align	16
-.Lloop8:
+.Loop8:
 ___
 for ($i=0;$i<8;$i++) {
+$code.=<<___ if ($i==7);
+	add	\$8,$XX[1]#b
+___
 $code.=<<___;
 	add	$TX[0]#b,$YY#b
-	mov	$XX[0],$XX[1]
 	movl	($dat,$YY,4),$TY#d
-	ror	\$8,%rax			# ror is redundant when $i=0
-	inc	$XX[1]#b
-	movl	($dat,$XX[1],4),$TX[1]#d
-	cmp	$XX[1],$YY
 	movl	$TX[0]#d,($dat,$YY,4)
-	cmove	$TX[0],$TX[1]
-	movl	$TY#d,($dat,$XX[0],4)
+	movl	`4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d
+	ror	\$8,%r8				# ror is redundant when $i=0
+	movl	$TY#d,4*$i($dat,$XX[0],4)
 	add	$TX[0]#b,$TY#b
-	movb	($dat,$TY,4),%al
+	movb	($dat,$TY,4),%r8b
 ___
-push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
+push(@TX,shift(@TX)); #push(@XX,shift(@XX));	# "rotate" registers
 }
 $code.=<<___;
-	ror	\$8,%rax
+	add	\$8,$XX[0]#b
+	ror	\$8,%r8
 	sub	\$8,$len
 
-	xor	($inp),%rax
-	add	\$8,$inp
-	mov	%rax,($out)
-	add	\$8,$out
+	xor	($inp),%r8
+	mov	%r8,($out,$inp)
+	lea	8($inp),$inp
 
 	test	\$-8,$len
-	jnz	.Lloop8
+	jnz	.Loop8
+	cmp	\$0,$len
+	jne	.Lloop1
+	jmp	.Lexit
+
+.align	16
+.Lintel:
+	test	\$-32,$len
+	jz	.Lloop1
+	and	\$15,$TX[1]
+	jz	.Loop16_is_hot
+	sub	$TX[1],$len
+.Loop16_warmup:
+	add	$TX[0]#b,$YY#b
+	movl	($dat,$YY,4),$TY#d
+	movl	$TX[0]#d,($dat,$YY,4)
+	movl	$TY#d,($dat,$XX[0],4)
+	add	$TY#b,$TX[0]#b
+	inc	$XX[0]#b
+	movl	($dat,$TX[0],4),$TY#d
+	movl	($dat,$XX[0],4),$TX[0]#d
+	xorb	($inp),$TY#b
+	movb	$TY#b,($out,$inp)
+	lea	1($inp),$inp
+	dec	$TX[1]
+	jnz	.Loop16_warmup
+
+	mov	$YY,$TX[1]
+	xor	$YY,$YY
+	mov	$TX[1]#b,$YY#b
+
+.Loop16_is_hot:
+	lea	($dat,$XX[0],4),$XX[1]
+___
+sub RC4_loop {
+  my $i=shift;
+  my $j=$i<0?0:$i;
+  my $xmm="%xmm".($j&1);
+
+    $code.="	add	\$16,$XX[0]#b\n"		if ($i==15);
+    $code.="	movdqu	($inp),%xmm2\n"			if ($i==15);
+    $code.="	add	$TX[0]#b,$YY#b\n"		if ($i<=0);
+    $code.="	movl	($dat,$YY,4),$TY#d\n";
+    $code.="	pxor	%xmm0,%xmm2\n"			if ($i==0);
+    $code.="	psllq	\$8,%xmm1\n"			if ($i==0);
+    $code.="	pxor	$xmm,$xmm\n"			if ($i<=1);
+    $code.="	movl	$TX[0]#d,($dat,$YY,4)\n";
+    $code.="	add	$TY#b,$TX[0]#b\n";
+    $code.="	movl	`4*($j+1)`($XX[1]),$TX[1]#d\n"	if ($i<15);
+    $code.="	movz	$TX[0]#b,$TX[0]#d\n";
+    $code.="	movl	$TY#d,4*$j($XX[1])\n";
+    $code.="	pxor	%xmm1,%xmm2\n"			if ($i==0);
+    $code.="	lea	($dat,$XX[0],4),$XX[1]\n"	if ($i==15);
+    $code.="	add	$TX[1]#b,$YY#b\n"		if ($i<15);
+    $code.="	pinsrw	\$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n";
+    $code.="	movdqu	%xmm2,($out,$inp)\n"		if ($i==0);
+    $code.="	lea	16($inp),$inp\n"		if ($i==0);
+    $code.="	movl	($XX[1]),$TX[1]#d\n"		if ($i==15);
+}
+	RC4_loop(-1);
+$code.=<<___;
+	jmp	.Loop16_enter
+.align	16
+.Loop16:
+___
+
+for ($i=0;$i<16;$i++) {
+    $code.=".Loop16_enter:\n"		if ($i==1);
+	RC4_loop($i);
+	push(@TX,shift(@TX)); 		# "rotate" registers
+}
+$code.=<<___;
+	mov	$YY,$TX[1]
+	xor	$YY,$YY			# keyword to partial register
+	sub	\$16,$len
+	mov	$TX[1]#b,$YY#b
+	test	\$-16,$len
+	jnz	.Loop16
+
+	psllq	\$8,%xmm1
+	pxor	%xmm0,%xmm2
+	pxor	%xmm1,%xmm2
+	movdqu	%xmm2,($out,$inp)
+	lea	16($inp),$inp
+
 	cmp	\$0,$len
 	jne	.Lloop1
 	jmp	.Lexit
@@ -152,9 +317,8 @@ $code.=<<___;
 	movl	($dat,$TX[0],4),$TY#d
 	movl	($dat,$XX[0],4),$TX[0]#d
 	xorb	($inp),$TY#b
-	inc	$inp
-	movb	$TY#b,($out)
-	inc	$out
+	movb	$TY#b,($out,$inp)
+	lea	1($inp),$inp
 	dec	$len
 	jnz	.Lloop1
 	jmp	.Lexit
@@ -165,13 +329,11 @@ $code.=<<___;
 	movzb	($dat,$XX[0]),$TX[0]#d
 	test	\$-8,$len
 	jz	.Lcloop1
-	cmpl	\$0,260($dat)
-	jnz	.Lcloop1
 	jmp	.Lcloop8
 .align	16
 .Lcloop8:
-	mov	($inp),%eax
-	mov	4($inp),%ebx
+	mov	($inp),%r8d
+	mov	4($inp),%r9d
 ___
 # unroll 2x4-wise, because 64-bit rotates kill Intel P4...
 for ($i=0;$i<4;$i++) {
@@ -188,8 +350,8 @@ $code.=<<___;
 	mov	$TX[0],$TX[1]
 .Lcmov$i:
 	add	$TX[0]#b,$TY#b
-	xor	($dat,$TY),%al
-	ror	\$8,%eax
+	xor	($dat,$TY),%r8b
+	ror	\$8,%r8d
 ___
 push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
 }
@@ -207,16 +369,16 @@ $code.=<<___;
 	mov	$TX[0],$TX[1]
 .Lcmov$i:
 	add	$TX[0]#b,$TY#b
-	xor	($dat,$TY),%bl
-	ror	\$8,%ebx
+	xor	($dat,$TY),%r9b
+	ror	\$8,%r9d
 ___
 push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
 }
 $code.=<<___;
 	lea	-8($len),$len
-	mov	%eax,($out)
+	mov	%r8d,($out)
 	lea	8($inp),$inp
-	mov	%ebx,4($out)
+	mov	%r9d,4($out)
 	lea	8($out),$out
 
 	test	\$-8,$len
@@ -229,6 +391,7 @@ $code.=<<___;
 .align	16
 .Lcloop1:
 	add	$TX[0]#b,$YY#b
+	movzb	$YY#b,$YY#d
 	movzb	($dat,$YY),$TY#d
 	movb	$TX[0]#b,($dat,$YY)
 	movb	$TY#b,($dat,$XX[0])
@@ -260,16 +423,16 @@ $code.=<<___;
 	ret
 .size	RC4,.-RC4
 ___
+}
 
 $idx="%r8";
 $ido="%r9";
 
 $code.=<<___;
-.extern	OPENSSL_ia32cap_P
-.globl	RC4_set_key
-.type	RC4_set_key,\@function,3
+.globl	private_RC4_set_key
+.type	private_RC4_set_key,\@function,3
 .align	16
-RC4_set_key:
+private_RC4_set_key:
 	lea	8($dat),$dat
 	lea	($inp,$len),$inp
 	neg	$len
@@ -280,12 +443,9 @@ RC4_set_key:
 	xor	%r11,%r11
 
 	mov	OPENSSL_ia32cap_P(%rip),$idx#d
-	bt	\$20,$idx#d
-	jnc	.Lw1stloop
-	bt	\$30,$idx#d
-	setc	$ido#b
-	mov	$ido#d,260($dat)
-	jmp	.Lc1stloop
+	bt	\$20,$idx#d	# RC4_CHAR?
+	jc	.Lc1stloop
+	jmp	.Lw1stloop
 
 .align	16
 .Lw1stloop:
@@ -339,7 +499,7 @@ RC4_set_key:
 	mov	%eax,-8($dat)
 	mov	%eax,-4($dat)
 	ret
-.size	RC4_set_key,.-RC4_set_key
+.size	private_RC4_set_key,.-private_RC4_set_key
 
 .globl	RC4_options
 .type	RC4_options,\@abi-omnipotent
@@ -348,18 +508,20 @@ RC4_options:
 	lea	.Lopts(%rip),%rax
 	mov	OPENSSL_ia32cap_P(%rip),%edx
 	bt	\$20,%edx
-	jnc	.Ldone
-	add	\$12,%rax
+	jc	.L8xchar
 	bt	\$30,%edx
 	jnc	.Ldone
-	add	\$13,%rax
+	add	\$25,%rax
+	ret
+.L8xchar:
+	add	\$12,%rax
 .Ldone:
 	ret
 .align	64
 .Lopts:
 .asciz	"rc4(8x,int)"
 .asciz	"rc4(8x,char)"
-.asciz	"rc4(1x,char)"
+.asciz	"rc4(16x,int)"
 .asciz	"RC4 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 .align	64
 .size	RC4_options,.-RC4_options
@@ -482,22 +644,32 @@ key_se_handler:
 	.rva	.LSEH_end_RC4
 	.rva	.LSEH_info_RC4
 
-	.rva	.LSEH_begin_RC4_set_key
-	.rva	.LSEH_end_RC4_set_key
-	.rva	.LSEH_info_RC4_set_key
+	.rva	.LSEH_begin_private_RC4_set_key
+	.rva	.LSEH_end_private_RC4_set_key
+	.rva	.LSEH_info_private_RC4_set_key
 
 .section	.xdata
 .align	8
 .LSEH_info_RC4:
 	.byte	9,0,0,0
 	.rva	stream_se_handler
-.LSEH_info_RC4_set_key:
+.LSEH_info_private_RC4_set_key:
 	.byte	9,0,0,0
 	.rva	key_se_handler
 ___
 }
 
-$code =~ s/#([bwd])/$1/gm;
+sub reg_part {
+my ($reg,$conv)=@_;
+    if ($reg =~ /%r[0-9]+/)	{ $reg .= $conv; }
+    elsif ($conv eq "b")	{ $reg =~ s/%[er]([^x]+)x?/%$1l/;	}
+    elsif ($conv eq "w")	{ $reg =~ s/%[er](.+)/%$1/;		}
+    elsif ($conv eq "d")	{ $reg =~ s/%[er](.+)/%e$1/;		}
+    return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
 
 print $code;
 
diff --git a/src/lib/libcrypto/rc4/rc4.h b/src/lib/libcrypto/rc4/rc4.h
index 29d1acccf5..88ceb46bc5 100644
--- a/src/lib/libcrypto/rc4/rc4.h
+++ b/src/lib/libcrypto/rc4/rc4.h
@@ -79,6 +79,7 @@ typedef struct rc4_key_st
  
 const char *RC4_options(void);
 void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
+void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data);
 void RC4(RC4_KEY *key, size_t len, const unsigned char *indata,
 		unsigned char *outdata);
 
diff --git a/src/lib/libcrypto/rc4/rc4_skey.c b/src/lib/libcrypto/rc4/rc4_skey.c
index b22c40b0bd..fda27636e7 100644
--- a/src/lib/libcrypto/rc4/rc4_skey.c
+++ b/src/lib/libcrypto/rc4/rc4_skey.c
@@ -85,7 +85,7 @@ const char *RC4_options(void)
  * Date: Wed, 14 Sep 1994 06:35:31 GMT
  */
 
-void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
+void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
 	{
         register RC4_INT tmp;
         register int id1,id2;
@@ -104,40 +104,6 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
 		d[(n)]=d[id2]; \
 		d[id2]=tmp; }
 
-#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM)
-# if	defined(__i386)   || defined(__i386__)   || defined(_M_IX86) || \
-	defined(__INTEL__) || \
-	defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64)
-	if (sizeof(RC4_INT) > 1) {
-		/*
-		 * Unlike all other x86 [and x86_64] implementations,
-		 * Intel P4 core [including EM64T] was found to perform
-		 * poorly with wider RC4_INT. Performance improvement
-		 * for IA-32 hand-coded assembler turned out to be 2.8x
-		 * if re-coded for RC4_CHAR! It's however inappropriate
-		 * to just switch to RC4_CHAR for x86[_64], as non-P4
-		 * implementations suffer from significant performance
-		 * losses then, e.g. PIII exhibits >2x deterioration,
-		 * and so does Opteron. In order to assure optimal
-		 * all-round performance, let us [try to] detect P4 at
-		 * run-time by checking upon HTT bit in CPU capability
-		 * vector and set up compressed key schedule, which is
-		 * recognized by correspondingly updated assembler
-		 * module...
-		 *				<appro@fy.chalmers.se>
-		 */
-		if (OPENSSL_ia32cap_P & (1<<28)) {
-			unsigned char *cp=(unsigned char *)d;
-
-			for (i=0;i<256;i++) cp[i]=i;
-			for (i=0;i<256;i++) SK_LOOP(cp,i);
-			/* mark schedule as compressed! */
-			d[256/sizeof(RC4_INT)]=-1;
-			return;
-		}
-	}
-# endif
-#endif
 	for (i=0; i < 256; i++) d[i]=i;
 	for (i=0; i < 256; i+=4)
 		{
diff --git a/src/lib/libcrypto/ripemd/ripemd.h b/src/lib/libcrypto/ripemd/ripemd.h
index 5942eb6180..189bd8c90e 100644
--- a/src/lib/libcrypto/ripemd/ripemd.h
+++ b/src/lib/libcrypto/ripemd/ripemd.h
@@ -91,6 +91,9 @@ typedef struct RIPEMD160state_st
 	unsigned int   num;
 	} RIPEMD160_CTX;
 
+#ifdef OPENSSL_FIPS
+int private_RIPEMD160_Init(RIPEMD160_CTX *c);
+#endif
 int RIPEMD160_Init(RIPEMD160_CTX *c);
 int RIPEMD160_Update(RIPEMD160_CTX *c, const void *data, size_t len);
 int RIPEMD160_Final(unsigned char *md, RIPEMD160_CTX *c);
diff --git a/src/lib/libcrypto/ripemd/rmd_dgst.c b/src/lib/libcrypto/ripemd/rmd_dgst.c
index 59b017f8c0..63f0d983f7 100644
--- a/src/lib/libcrypto/ripemd/rmd_dgst.c
+++ b/src/lib/libcrypto/ripemd/rmd_dgst.c
@@ -59,6 +59,7 @@
 #include <stdio.h>
 #include "rmd_locl.h"
 #include <openssl/opensslv.h>
+#include <openssl/crypto.h>
 
 const char RMD160_version[]="RIPE-MD160" OPENSSL_VERSION_PTEXT;
 
@@ -69,7 +70,7 @@ const char RMD160_version[]="RIPE-MD160" OPENSSL_VERSION_PTEXT;
      void ripemd160_block(RIPEMD160_CTX *c, unsigned long *p,size_t num);
 #  endif
 
-int RIPEMD160_Init(RIPEMD160_CTX *c)
+fips_md_init(RIPEMD160)
 	{
 	memset (c,0,sizeof(*c));
 	c->A=RIPEMD160_A;
diff --git a/src/lib/libcrypto/rsa/rsa.h b/src/lib/libcrypto/rsa/rsa.h
index cf74343657..4814a2fc15 100644
--- a/src/lib/libcrypto/rsa/rsa.h
+++ b/src/lib/libcrypto/rsa/rsa.h
@@ -222,12 +222,22 @@ struct rsa_st
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, EVP_PKEY_CTRL_RSA_PADDING, \
 				pad, NULL)
 
+#define EVP_PKEY_CTX_get_rsa_padding(ctx, ppad) \
+	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, \
+				EVP_PKEY_CTRL_GET_RSA_PADDING, 0, ppad)
+
 #define EVP_PKEY_CTX_set_rsa_pss_saltlen(ctx, len) \
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
 				(EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \
 				EVP_PKEY_CTRL_RSA_PSS_SALTLEN, \
 				len, NULL)
 
+#define EVP_PKEY_CTX_get_rsa_pss_saltlen(ctx, plen) \
+	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \
+				(EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \
+				EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN, \
+				0, plen)
+
 #define EVP_PKEY_CTX_set_rsa_keygen_bits(ctx, bits) \
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \
 				EVP_PKEY_CTRL_RSA_KEYGEN_BITS, bits, NULL)
@@ -236,11 +246,24 @@ struct rsa_st
 	EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \
 				EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP, 0, pubexp)
 
+#define	 EVP_PKEY_CTX_set_rsa_mgf1_md(ctx, md)	\
+		EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG,  \
+				EVP_PKEY_CTRL_RSA_MGF1_MD, 0, (void *)md)
+
+#define	 EVP_PKEY_CTX_get_rsa_mgf1_md(ctx, pmd)	\
+		EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG,  \
+				EVP_PKEY_CTRL_GET_RSA_MGF1_MD, 0, (void *)pmd)
+
 #define EVP_PKEY_CTRL_RSA_PADDING	(EVP_PKEY_ALG_CTRL + 1)
 #define EVP_PKEY_CTRL_RSA_PSS_SALTLEN	(EVP_PKEY_ALG_CTRL + 2)
 
 #define EVP_PKEY_CTRL_RSA_KEYGEN_BITS	(EVP_PKEY_ALG_CTRL + 3)
 #define EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP	(EVP_PKEY_ALG_CTRL + 4)
+#define EVP_PKEY_CTRL_RSA_MGF1_MD	(EVP_PKEY_ALG_CTRL + 5)
+
+#define EVP_PKEY_CTRL_GET_RSA_PADDING		(EVP_PKEY_ALG_CTRL + 6)
+#define EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN	(EVP_PKEY_ALG_CTRL + 7)
+#define EVP_PKEY_CTRL_GET_RSA_MGF1_MD		(EVP_PKEY_ALG_CTRL + 8)
 
 #define RSA_PKCS1_PADDING	1
 #define RSA_SSLV23_PADDING	2
@@ -300,6 +323,16 @@ const RSA_METHOD *RSA_null_method(void);
 DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPublicKey)
 DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPrivateKey)
 
+typedef struct rsa_pss_params_st
+	{
+	X509_ALGOR *hashAlgorithm;
+	X509_ALGOR *maskGenAlgorithm;
+	ASN1_INTEGER *saltLength;
+	ASN1_INTEGER *trailerField;
+	} RSA_PSS_PARAMS;
+
+DECLARE_ASN1_FUNCTIONS(RSA_PSS_PARAMS)
+
 #ifndef OPENSSL_NO_FP_API
 int	RSA_print_fp(FILE *fp, const RSA *r,int offset);
 #endif
@@ -380,6 +413,14 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
 			const unsigned char *mHash,
 			const EVP_MD *Hash, int sLen);
 
+int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash,
+			const EVP_MD *Hash, const EVP_MD *mgf1Hash, 
+			const unsigned char *EM, int sLen);
+
+int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM,
+			const unsigned char *mHash,
+			const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen);
+
 int RSA_get_ex_new_index(long argl, void *argp, CRYPTO_EX_new *new_func,
 	CRYPTO_EX_dup *dup_func, CRYPTO_EX_free *free_func);
 int RSA_set_ex_data(RSA *r,int idx,void *arg);
@@ -388,6 +429,25 @@ void *RSA_get_ex_data(const RSA *r, int idx);
 RSA *RSAPublicKey_dup(RSA *rsa);
 RSA *RSAPrivateKey_dup(RSA *rsa);
 
+/* If this flag is set the RSA method is FIPS compliant and can be used
+ * in FIPS mode. This is set in the validated module method. If an
+ * application sets this flag in its own methods it is its responsibility
+ * to ensure the result is compliant.
+ */
+
+#define RSA_FLAG_FIPS_METHOD			0x0400
+
+/* If this flag is set the operations normally disabled in FIPS mode are
+ * permitted it is then the applications responsibility to ensure that the
+ * usage is compliant.
+ */
+
+#define RSA_FLAG_NON_FIPS_ALLOW			0x0400
+/* Application has decided PRNG is good enough to generate a key: don't
+ * check.
+ */
+#define RSA_FLAG_CHECKED			0x0800
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -405,6 +465,7 @@ void ERR_load_RSA_strings(void);
 #define RSA_F_PKEY_RSA_CTRL				 143
 #define RSA_F_PKEY_RSA_CTRL_STR				 144
 #define RSA_F_PKEY_RSA_SIGN				 142
+#define RSA_F_PKEY_RSA_VERIFY				 154
 #define RSA_F_PKEY_RSA_VERIFYRECOVER			 141
 #define RSA_F_RSA_BUILTIN_KEYGEN			 129
 #define RSA_F_RSA_CHECK_KEY				 123
@@ -413,6 +474,8 @@ void ERR_load_RSA_strings(void);
 #define RSA_F_RSA_EAY_PUBLIC_DECRYPT			 103
 #define RSA_F_RSA_EAY_PUBLIC_ENCRYPT			 104
 #define RSA_F_RSA_GENERATE_KEY				 105
+#define RSA_F_RSA_GENERATE_KEY_EX			 155
+#define RSA_F_RSA_ITEM_VERIFY				 156
 #define RSA_F_RSA_MEMORY_LOCK				 130
 #define RSA_F_RSA_NEW_METHOD				 106
 #define RSA_F_RSA_NULL					 124
@@ -424,6 +487,7 @@ void ERR_load_RSA_strings(void);
 #define RSA_F_RSA_PADDING_ADD_NONE			 107
 #define RSA_F_RSA_PADDING_ADD_PKCS1_OAEP		 121
 #define RSA_F_RSA_PADDING_ADD_PKCS1_PSS			 125
+#define RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1		 148
 #define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1		 108
 #define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_2		 109
 #define RSA_F_RSA_PADDING_ADD_SSLV23			 110
@@ -436,8 +500,12 @@ void ERR_load_RSA_strings(void);
 #define RSA_F_RSA_PADDING_CHECK_X931			 128
 #define RSA_F_RSA_PRINT					 115
 #define RSA_F_RSA_PRINT_FP				 116
+#define RSA_F_RSA_PRIVATE_DECRYPT			 150
+#define RSA_F_RSA_PRIVATE_ENCRYPT			 151
 #define RSA_F_RSA_PRIV_DECODE				 137
 #define RSA_F_RSA_PRIV_ENCODE				 138
+#define RSA_F_RSA_PUBLIC_DECRYPT			 152
+#define RSA_F_RSA_PUBLIC_ENCRYPT			 153
 #define RSA_F_RSA_PUB_DECODE				 139
 #define RSA_F_RSA_SETUP_BLINDING			 136
 #define RSA_F_RSA_SIGN					 117
@@ -445,6 +513,7 @@ void ERR_load_RSA_strings(void);
 #define RSA_F_RSA_VERIFY				 119
 #define RSA_F_RSA_VERIFY_ASN1_OCTET_STRING		 120
 #define RSA_F_RSA_VERIFY_PKCS1_PSS			 126
+#define RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1			 149
 
 /* Reason codes. */
 #define RSA_R_ALGORITHM_MISMATCH			 100
@@ -470,19 +539,24 @@ void ERR_load_RSA_strings(void);
 #define RSA_R_INVALID_HEADER				 137
 #define RSA_R_INVALID_KEYBITS				 145
 #define RSA_R_INVALID_MESSAGE_LENGTH			 131
+#define RSA_R_INVALID_MGF1_MD				 156
 #define RSA_R_INVALID_PADDING				 138
 #define RSA_R_INVALID_PADDING_MODE			 141
+#define RSA_R_INVALID_PSS_PARAMETERS			 149
 #define RSA_R_INVALID_PSS_SALTLEN			 146
+#define RSA_R_INVALID_SALT_LENGTH			 150
 #define RSA_R_INVALID_TRAILER				 139
 #define RSA_R_INVALID_X931_DIGEST			 142
 #define RSA_R_IQMP_NOT_INVERSE_OF_Q			 126
 #define RSA_R_KEY_SIZE_TOO_SMALL			 120
 #define RSA_R_LAST_OCTET_INVALID			 134
 #define RSA_R_MODULUS_TOO_LARGE				 105
+#define RSA_R_NON_FIPS_RSA_METHOD			 157
 #define RSA_R_NO_PUBLIC_EXPONENT			 140
 #define RSA_R_NULL_BEFORE_BLOCK_MISSING			 113
 #define RSA_R_N_DOES_NOT_EQUAL_P_Q			 127
 #define RSA_R_OAEP_DECODING_ERROR			 121
+#define RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE	 158
 #define RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE	 148
 #define RSA_R_PADDING_CHECK_FAILED			 114
 #define RSA_R_P_NOT_PRIME				 128
@@ -493,7 +567,12 @@ void ERR_load_RSA_strings(void);
 #define RSA_R_SSLV3_ROLLBACK_ATTACK			 115
 #define RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD 116
 #define RSA_R_UNKNOWN_ALGORITHM_TYPE			 117
+#define RSA_R_UNKNOWN_MASK_DIGEST			 151
 #define RSA_R_UNKNOWN_PADDING_TYPE			 118
+#define RSA_R_UNKNOWN_PSS_DIGEST			 152
+#define RSA_R_UNSUPPORTED_MASK_ALGORITHM		 153
+#define RSA_R_UNSUPPORTED_MASK_PARAMETER		 154
+#define RSA_R_UNSUPPORTED_SIGNATURE_TYPE		 155
 #define RSA_R_VALUE_MISSING				 147
 #define RSA_R_WRONG_SIGNATURE_LENGTH			 119
 
diff --git a/src/lib/libcrypto/rsa/rsa_ameth.c b/src/lib/libcrypto/rsa/rsa_ameth.c
index 8c3209885e..2460910ab2 100644
--- a/src/lib/libcrypto/rsa/rsa_ameth.c
+++ b/src/lib/libcrypto/rsa/rsa_ameth.c
@@ -265,6 +265,147 @@ static int rsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent,
 	return do_rsa_print(bp, pkey->pkey.rsa, indent, 1);
 	}
 
+static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg,
+					X509_ALGOR **pmaskHash)
+	{
+	const unsigned char *p;
+	int plen;
+	RSA_PSS_PARAMS *pss;
+
+	*pmaskHash = NULL;
+
+	if (!alg->parameter || alg->parameter->type != V_ASN1_SEQUENCE)
+		return NULL;
+	p = alg->parameter->value.sequence->data;
+	plen = alg->parameter->value.sequence->length;
+	pss = d2i_RSA_PSS_PARAMS(NULL, &p, plen);
+
+	if (!pss)
+		return NULL;
+	
+	if (pss->maskGenAlgorithm)
+		{
+		ASN1_TYPE *param = pss->maskGenAlgorithm->parameter;
+		if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) == NID_mgf1
+			&& param->type == V_ASN1_SEQUENCE)
+			{
+			p = param->value.sequence->data;
+			plen = param->value.sequence->length;
+			*pmaskHash = d2i_X509_ALGOR(NULL, &p, plen);
+			}
+		}
+
+	return pss;
+	}
+
+static int rsa_pss_param_print(BIO *bp, RSA_PSS_PARAMS *pss, 
+				X509_ALGOR *maskHash, int indent)
+	{
+	int rv = 0;
+	if (!pss)
+		{
+		if (BIO_puts(bp, " (INVALID PSS PARAMETERS)\n") <= 0)
+			return 0;
+		return 1;
+		}
+	if (BIO_puts(bp, "\n") <= 0)
+		goto err;
+	if (!BIO_indent(bp, indent, 128))
+		goto err;
+	if (BIO_puts(bp, "Hash Algorithm: ") <= 0)
+		goto err;
+
+	if (pss->hashAlgorithm)
+		{
+		if (i2a_ASN1_OBJECT(bp, pss->hashAlgorithm->algorithm) <= 0)
+			goto err;
+		}
+	else if (BIO_puts(bp, "sha1 (default)") <= 0)
+		goto err;
+
+	if (BIO_puts(bp, "\n") <= 0)
+		goto err;
+
+	if (!BIO_indent(bp, indent, 128))
+		goto err;
+
+	if (BIO_puts(bp, "Mask Algorithm: ") <= 0)
+			goto err;
+	if (pss->maskGenAlgorithm)
+		{
+		if (i2a_ASN1_OBJECT(bp, pss->maskGenAlgorithm->algorithm) <= 0)
+			goto err;
+		if (BIO_puts(bp, " with ") <= 0)
+			goto err;
+		if (maskHash)
+			{
+			if (i2a_ASN1_OBJECT(bp, maskHash->algorithm) <= 0)
+			goto err;
+			}
+		else if (BIO_puts(bp, "INVALID") <= 0)
+			goto err;
+		}
+	else if (BIO_puts(bp, "mgf1 with sha1 (default)") <= 0)
+		goto err;
+	BIO_puts(bp, "\n");
+
+	if (!BIO_indent(bp, indent, 128))
+		goto err;
+	if (BIO_puts(bp, "Salt Length: ") <= 0)
+			goto err;
+	if (pss->saltLength)
+		{
+		if (i2a_ASN1_INTEGER(bp, pss->saltLength) <= 0)
+			goto err;
+		}
+	else if (BIO_puts(bp, "20 (default)") <= 0)
+		goto err;
+	BIO_puts(bp, "\n");
+
+	if (!BIO_indent(bp, indent, 128))
+		goto err;
+	if (BIO_puts(bp, "Trailer Field: ") <= 0)
+			goto err;
+	if (pss->trailerField)
+		{
+		if (i2a_ASN1_INTEGER(bp, pss->trailerField) <= 0)
+			goto err;
+		}
+	else if (BIO_puts(bp, "0xbc (default)") <= 0)
+		goto err;
+	BIO_puts(bp, "\n");
+	
+	rv = 1;
+
+	err:
+	return rv;
+
+	}
+
+static int rsa_sig_print(BIO *bp, const X509_ALGOR *sigalg,
+					const ASN1_STRING *sig,
+					int indent, ASN1_PCTX *pctx)
+	{
+	if (OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss)
+		{
+		int rv;
+		RSA_PSS_PARAMS *pss;
+		X509_ALGOR *maskHash;
+		pss = rsa_pss_decode(sigalg, &maskHash);
+		rv = rsa_pss_param_print(bp, pss, maskHash, indent);
+		if (pss)
+			RSA_PSS_PARAMS_free(pss);
+		if (maskHash)
+			X509_ALGOR_free(maskHash);
+		if (!rv)
+			return 0;
+		}
+	else if (!sig && BIO_puts(bp, "\n") <= 0)
+		return 0;
+	if (sig)
+		return X509_signature_dump(bp, sig, indent);
+	return 1;
+	}
 
 static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
 	{
@@ -310,6 +451,211 @@ static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2)
 
 	}
 
+/* Customised RSA item verification routine. This is called 
+ * when a signature is encountered requiring special handling. We 
+ * currently only handle PSS.
+ */
+
+
+static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+			X509_ALGOR *sigalg, ASN1_BIT_STRING *sig,
+			EVP_PKEY *pkey)
+	{
+	int rv = -1;
+	int saltlen;
+	const EVP_MD *mgf1md = NULL, *md = NULL;
+	RSA_PSS_PARAMS *pss;
+	X509_ALGOR *maskHash;
+	EVP_PKEY_CTX *pkctx;
+	/* Sanity check: make sure it is PSS */
+	if (OBJ_obj2nid(sigalg->algorithm) != NID_rsassaPss)
+		{
+		RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_SIGNATURE_TYPE);
+		return -1;
+		}
+	/* Decode PSS parameters */
+	pss = rsa_pss_decode(sigalg, &maskHash);
+
+	if (pss == NULL)
+		{
+		RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_PSS_PARAMETERS);
+		goto err;
+		}
+	/* Check mask and lookup mask hash algorithm */
+	if (pss->maskGenAlgorithm)
+		{
+		if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) != NID_mgf1)
+			{
+			RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_ALGORITHM);
+			goto err;
+			}
+		if (!maskHash)
+			{
+			RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_PARAMETER);
+			goto err;
+			}
+		mgf1md = EVP_get_digestbyobj(maskHash->algorithm);
+		if (mgf1md == NULL)
+			{
+			RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_MASK_DIGEST);
+			goto err;
+			}
+		}
+	else
+		mgf1md = EVP_sha1();
+
+	if (pss->hashAlgorithm)
+		{
+		md = EVP_get_digestbyobj(pss->hashAlgorithm->algorithm);
+		if (md == NULL)
+			{
+			RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_PSS_DIGEST);
+			goto err;
+			}
+		}
+	else
+		md = EVP_sha1();
+
+	if (pss->saltLength)
+		{
+		saltlen = ASN1_INTEGER_get(pss->saltLength);
+
+		/* Could perform more salt length sanity checks but the main
+		 * RSA routines will trap other invalid values anyway.
+		 */
+		if (saltlen < 0)
+			{
+			RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_SALT_LENGTH);
+			goto err;
+			}
+		}
+	else
+		saltlen = 20;
+
+	/* low-level routines support only trailer field 0xbc (value 1)
+	 * and PKCS#1 says we should reject any other value anyway.
+	 */
+	if (pss->trailerField && ASN1_INTEGER_get(pss->trailerField) != 1)
+		{
+		RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_TRAILER);
+		goto err;
+		}
+
+	/* We have all parameters now set up context */
+
+	if (!EVP_DigestVerifyInit(ctx, &pkctx, md, NULL, pkey))
+		goto err;
+
+	if (EVP_PKEY_CTX_set_rsa_padding(pkctx, RSA_PKCS1_PSS_PADDING) <= 0)
+		goto err;
+
+	if (EVP_PKEY_CTX_set_rsa_pss_saltlen(pkctx, saltlen) <= 0)
+		goto err;
+
+	if (EVP_PKEY_CTX_set_rsa_mgf1_md(pkctx, mgf1md) <= 0)
+		goto err;
+	/* Carry on */
+	rv = 2;
+
+	err:
+	RSA_PSS_PARAMS_free(pss);
+	if (maskHash)
+		X509_ALGOR_free(maskHash);
+	return rv;
+	}
+
+static int rsa_item_sign(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn,
+				X509_ALGOR *alg1, X509_ALGOR *alg2, 
+				ASN1_BIT_STRING *sig)
+	{
+	int pad_mode;
+	EVP_PKEY_CTX *pkctx = ctx->pctx;
+	if (EVP_PKEY_CTX_get_rsa_padding(pkctx, &pad_mode) <= 0)
+		return 0;
+	if (pad_mode == RSA_PKCS1_PADDING)
+		return 2;
+	if (pad_mode == RSA_PKCS1_PSS_PADDING)
+		{
+		const EVP_MD *sigmd, *mgf1md;
+		RSA_PSS_PARAMS *pss = NULL;
+		X509_ALGOR *mgf1alg = NULL;
+		ASN1_STRING *os1 = NULL, *os2 = NULL;
+		EVP_PKEY *pk = EVP_PKEY_CTX_get0_pkey(pkctx);
+		int saltlen, rv = 0;
+		sigmd = EVP_MD_CTX_md(ctx);
+		if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0)
+			goto err;
+		if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pkctx, &saltlen))
+			goto err;
+		if (saltlen == -1)
+			saltlen = EVP_MD_size(sigmd);
+		else if (saltlen == -2)
+			{
+			saltlen = EVP_PKEY_size(pk) - EVP_MD_size(sigmd) - 2;
+			if (((EVP_PKEY_bits(pk) - 1) & 0x7) == 0)
+				saltlen--;
+			}
+		pss = RSA_PSS_PARAMS_new();
+		if (!pss)
+			goto err;
+		if (saltlen != 20)
+			{
+			pss->saltLength = ASN1_INTEGER_new();
+			if (!pss->saltLength)
+				goto err;
+			if (!ASN1_INTEGER_set(pss->saltLength, saltlen))
+				goto err;
+			}
+		if (EVP_MD_type(sigmd) != NID_sha1)
+			{
+			pss->hashAlgorithm = X509_ALGOR_new();
+			if (!pss->hashAlgorithm)
+				goto err;
+			X509_ALGOR_set_md(pss->hashAlgorithm, sigmd);
+			}
+		if (EVP_MD_type(mgf1md) != NID_sha1)
+			{
+			ASN1_STRING *stmp = NULL;
+			/* need to embed algorithm ID inside another */
+			mgf1alg = X509_ALGOR_new();
+			X509_ALGOR_set_md(mgf1alg, mgf1md);
+			if (!ASN1_item_pack(mgf1alg, ASN1_ITEM_rptr(X509_ALGOR),
+									&stmp))
+					goto err;
+			pss->maskGenAlgorithm = X509_ALGOR_new();
+			if (!pss->maskGenAlgorithm)
+				goto err;
+			X509_ALGOR_set0(pss->maskGenAlgorithm,
+					OBJ_nid2obj(NID_mgf1),
+					V_ASN1_SEQUENCE, stmp);
+			}
+		/* Finally create string with pss parameter encoding. */
+		if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os1))
+			goto err;
+		if (alg2)
+			{
+			os2 = ASN1_STRING_dup(os1);
+			if (!os2)
+				goto err;
+			X509_ALGOR_set0(alg2, OBJ_nid2obj(NID_rsassaPss),
+						V_ASN1_SEQUENCE, os2);
+			}
+		X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_rsassaPss),
+					V_ASN1_SEQUENCE, os1);
+		os1 = os2 = NULL;
+		rv = 3;
+		err:
+		if (mgf1alg)
+			X509_ALGOR_free(mgf1alg);
+		if (pss)
+			RSA_PSS_PARAMS_free(pss);
+		if (os1)
+			ASN1_STRING_free(os1);
+		return rv;
+		
+		}
+	return 2;
+	}
 
 const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = 
 	{
@@ -335,10 +681,13 @@ const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] =
 
 		0,0,0,0,0,0,
 
+		rsa_sig_print,
 		int_rsa_free,
 		rsa_pkey_ctrl,
 		old_rsa_priv_decode,
-		old_rsa_priv_encode
+		old_rsa_priv_encode,
+		rsa_item_verify,
+		rsa_item_sign
 		},
 
 		{
diff --git a/src/lib/libcrypto/rsa/rsa_asn1.c b/src/lib/libcrypto/rsa/rsa_asn1.c
index 4efca8cdc8..6ed5de3db4 100644
--- a/src/lib/libcrypto/rsa/rsa_asn1.c
+++ b/src/lib/libcrypto/rsa/rsa_asn1.c
@@ -60,6 +60,7 @@
 #include "cryptlib.h"
 #include <openssl/bn.h>
 #include <openssl/rsa.h>
+#include <openssl/x509.h>
 #include <openssl/asn1t.h>
 
 /* Override the default free and new methods */
@@ -96,6 +97,15 @@ ASN1_SEQUENCE_cb(RSAPublicKey, rsa_cb) = {
 	ASN1_SIMPLE(RSA, e, BIGNUM),
 } ASN1_SEQUENCE_END_cb(RSA, RSAPublicKey)
 
+ASN1_SEQUENCE(RSA_PSS_PARAMS) = {
+	ASN1_EXP_OPT(RSA_PSS_PARAMS, hashAlgorithm, X509_ALGOR,0),
+	ASN1_EXP_OPT(RSA_PSS_PARAMS, maskGenAlgorithm, X509_ALGOR,1),
+	ASN1_EXP_OPT(RSA_PSS_PARAMS, saltLength, ASN1_INTEGER,2),
+	ASN1_EXP_OPT(RSA_PSS_PARAMS, trailerField, ASN1_INTEGER,3)
+} ASN1_SEQUENCE_END(RSA_PSS_PARAMS)
+
+IMPLEMENT_ASN1_FUNCTIONS(RSA_PSS_PARAMS)
+
 IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPrivateKey, RSAPrivateKey)
 
 IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPublicKey, RSAPublicKey)
diff --git a/src/lib/libcrypto/rsa/rsa_crpt.c b/src/lib/libcrypto/rsa/rsa_crpt.c
new file mode 100644
index 0000000000..d3e44785dc
--- /dev/null
+++ b/src/lib/libcrypto/rsa/rsa_crpt.c
@@ -0,0 +1,257 @@
+/* crypto/rsa/rsa_lib.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
+#include <stdio.h>
+#include <openssl/crypto.h>
+#include "cryptlib.h"
+#include <openssl/lhash.h>
+#include <openssl/bn.h>
+#include <openssl/rsa.h>
+#include <openssl/rand.h>
+#ifndef OPENSSL_NO_ENGINE
+#include <openssl/engine.h>
+#endif
+
+int RSA_size(const RSA *r)
+	{
+	return(BN_num_bytes(r->n));
+	}
+
+int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to,
+	     RSA *rsa, int padding)
+	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_PUBLIC_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+		return -1;
+		}
+#endif
+	return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding));
+	}
+
+int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to,
+	     RSA *rsa, int padding)
+	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_PRIVATE_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+		return -1;
+		}
+#endif
+	return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding));
+	}
+
+int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to,
+	     RSA *rsa, int padding)
+	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_PRIVATE_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+		return -1;
+		}
+#endif
+	return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding));
+	}
+
+int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to,
+	     RSA *rsa, int padding)
+	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_PUBLIC_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD);
+		return -1;
+		}
+#endif
+	return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding));
+	}
+
+int RSA_flags(const RSA *r)
+	{
+	return((r == NULL)?0:r->meth->flags);
+	}
+
+void RSA_blinding_off(RSA *rsa)
+	{
+	if (rsa->blinding != NULL)
+		{
+		BN_BLINDING_free(rsa->blinding);
+		rsa->blinding=NULL;
+		}
+	rsa->flags &= ~RSA_FLAG_BLINDING;
+	rsa->flags |= RSA_FLAG_NO_BLINDING;
+	}
+
+int RSA_blinding_on(RSA *rsa, BN_CTX *ctx)
+	{
+	int ret=0;
+
+	if (rsa->blinding != NULL)
+		RSA_blinding_off(rsa);
+
+	rsa->blinding = RSA_setup_blinding(rsa, ctx);
+	if (rsa->blinding == NULL)
+		goto err;
+
+	rsa->flags |= RSA_FLAG_BLINDING;
+	rsa->flags &= ~RSA_FLAG_NO_BLINDING;
+	ret=1;
+err:
+	return(ret);
+	}
+
+static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p,
+	const BIGNUM *q, BN_CTX *ctx)
+{
+	BIGNUM *ret = NULL, *r0, *r1, *r2;
+
+	if (d == NULL || p == NULL || q == NULL)
+		return NULL;
+
+	BN_CTX_start(ctx);
+	r0 = BN_CTX_get(ctx);
+	r1 = BN_CTX_get(ctx);
+	r2 = BN_CTX_get(ctx);
+	if (r2 == NULL)
+		goto err;
+
+	if (!BN_sub(r1, p, BN_value_one())) goto err;
+	if (!BN_sub(r2, q, BN_value_one())) goto err;
+	if (!BN_mul(r0, r1, r2, ctx)) goto err;
+
+	ret = BN_mod_inverse(NULL, d, r0, ctx);
+err:
+	BN_CTX_end(ctx);
+	return ret;
+}
+
+BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx)
+{
+	BIGNUM local_n;
+	BIGNUM *e,*n;
+	BN_CTX *ctx;
+	BN_BLINDING *ret = NULL;
+
+	if (in_ctx == NULL)
+		{
+		if ((ctx = BN_CTX_new()) == NULL) return 0;
+		}
+	else
+		ctx = in_ctx;
+
+	BN_CTX_start(ctx);
+	e  = BN_CTX_get(ctx);
+	if (e == NULL)
+		{
+		RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE);
+		goto err;
+		}
+
+	if (rsa->e == NULL)
+		{
+		e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx);
+		if (e == NULL)
+			{
+			RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT);
+			goto err;
+			}
+		}
+	else
+		e = rsa->e;
+
+	
+	if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL)
+		{
+		/* if PRNG is not properly seeded, resort to secret
+		 * exponent as unpredictable seed */
+		RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0);
+		}
+
+	if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME))
+		{
+		/* Set BN_FLG_CONSTTIME flag */
+		n = &local_n;
+		BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME);
+		}
+	else
+		n = rsa->n;
+
+	ret = BN_BLINDING_create_param(NULL, e, n, ctx,
+			rsa->meth->bn_mod_exp, rsa->_method_mod_n);
+	if (ret == NULL)
+		{
+		RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB);
+		goto err;
+		}
+	CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret));
+err:
+	BN_CTX_end(ctx);
+	if (in_ctx == NULL)
+		BN_CTX_free(ctx);
+	if(rsa->e == NULL)
+		BN_free(e);
+
+	return ret;
+}
diff --git a/src/lib/libcrypto/rsa/rsa_err.c b/src/lib/libcrypto/rsa/rsa_err.c
index cf9f1106b0..46e0bf9980 100644
--- a/src/lib/libcrypto/rsa/rsa_err.c
+++ b/src/lib/libcrypto/rsa/rsa_err.c
@@ -1,6 +1,6 @@
 /* crypto/rsa/rsa_err.c */
 /* ====================================================================
- * Copyright (c) 1999-2008 The OpenSSL Project.  All rights reserved.
+ * Copyright (c) 1999-2011 The OpenSSL Project.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -78,6 +78,7 @@ static ERR_STRING_DATA RSA_str_functs[]=
 {ERR_FUNC(RSA_F_PKEY_RSA_CTRL),	"PKEY_RSA_CTRL"},
 {ERR_FUNC(RSA_F_PKEY_RSA_CTRL_STR),	"PKEY_RSA_CTRL_STR"},
 {ERR_FUNC(RSA_F_PKEY_RSA_SIGN),	"PKEY_RSA_SIGN"},
+{ERR_FUNC(RSA_F_PKEY_RSA_VERIFY),	"PKEY_RSA_VERIFY"},
 {ERR_FUNC(RSA_F_PKEY_RSA_VERIFYRECOVER),	"PKEY_RSA_VERIFYRECOVER"},
 {ERR_FUNC(RSA_F_RSA_BUILTIN_KEYGEN),	"RSA_BUILTIN_KEYGEN"},
 {ERR_FUNC(RSA_F_RSA_CHECK_KEY),	"RSA_check_key"},
@@ -86,6 +87,8 @@ static ERR_STRING_DATA RSA_str_functs[]=
 {ERR_FUNC(RSA_F_RSA_EAY_PUBLIC_DECRYPT),	"RSA_EAY_PUBLIC_DECRYPT"},
 {ERR_FUNC(RSA_F_RSA_EAY_PUBLIC_ENCRYPT),	"RSA_EAY_PUBLIC_ENCRYPT"},
 {ERR_FUNC(RSA_F_RSA_GENERATE_KEY),	"RSA_generate_key"},
+{ERR_FUNC(RSA_F_RSA_GENERATE_KEY_EX),	"RSA_generate_key_ex"},
+{ERR_FUNC(RSA_F_RSA_ITEM_VERIFY),	"RSA_ITEM_VERIFY"},
 {ERR_FUNC(RSA_F_RSA_MEMORY_LOCK),	"RSA_memory_lock"},
 {ERR_FUNC(RSA_F_RSA_NEW_METHOD),	"RSA_new_method"},
 {ERR_FUNC(RSA_F_RSA_NULL),	"RSA_NULL"},
@@ -97,6 +100,7 @@ static ERR_STRING_DATA RSA_str_functs[]=
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_NONE),	"RSA_padding_add_none"},
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP),	"RSA_padding_add_PKCS1_OAEP"},
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS),	"RSA_padding_add_PKCS1_PSS"},
+{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1),	"RSA_padding_add_PKCS1_PSS_mgf1"},
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1),	"RSA_padding_add_PKCS1_type_1"},
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_2),	"RSA_padding_add_PKCS1_type_2"},
 {ERR_FUNC(RSA_F_RSA_PADDING_ADD_SSLV23),	"RSA_padding_add_SSLv23"},
@@ -109,8 +113,12 @@ static ERR_STRING_DATA RSA_str_functs[]=
 {ERR_FUNC(RSA_F_RSA_PADDING_CHECK_X931),	"RSA_padding_check_X931"},
 {ERR_FUNC(RSA_F_RSA_PRINT),	"RSA_print"},
 {ERR_FUNC(RSA_F_RSA_PRINT_FP),	"RSA_print_fp"},
+{ERR_FUNC(RSA_F_RSA_PRIVATE_DECRYPT),	"RSA_private_decrypt"},
+{ERR_FUNC(RSA_F_RSA_PRIVATE_ENCRYPT),	"RSA_private_encrypt"},
 {ERR_FUNC(RSA_F_RSA_PRIV_DECODE),	"RSA_PRIV_DECODE"},
 {ERR_FUNC(RSA_F_RSA_PRIV_ENCODE),	"RSA_PRIV_ENCODE"},
+{ERR_FUNC(RSA_F_RSA_PUBLIC_DECRYPT),	"RSA_public_decrypt"},
+{ERR_FUNC(RSA_F_RSA_PUBLIC_ENCRYPT),	"RSA_public_encrypt"},
 {ERR_FUNC(RSA_F_RSA_PUB_DECODE),	"RSA_PUB_DECODE"},
 {ERR_FUNC(RSA_F_RSA_SETUP_BLINDING),	"RSA_setup_blinding"},
 {ERR_FUNC(RSA_F_RSA_SIGN),	"RSA_sign"},
@@ -118,6 +126,7 @@ static ERR_STRING_DATA RSA_str_functs[]=
 {ERR_FUNC(RSA_F_RSA_VERIFY),	"RSA_verify"},
 {ERR_FUNC(RSA_F_RSA_VERIFY_ASN1_OCTET_STRING),	"RSA_verify_ASN1_OCTET_STRING"},
 {ERR_FUNC(RSA_F_RSA_VERIFY_PKCS1_PSS),	"RSA_verify_PKCS1_PSS"},
+{ERR_FUNC(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1),	"RSA_verify_PKCS1_PSS_mgf1"},
 {0,NULL}
 	};
 
@@ -146,19 +155,24 @@ static ERR_STRING_DATA RSA_str_reasons[]=
 {ERR_REASON(RSA_R_INVALID_HEADER)        ,"invalid header"},
 {ERR_REASON(RSA_R_INVALID_KEYBITS)       ,"invalid keybits"},
 {ERR_REASON(RSA_R_INVALID_MESSAGE_LENGTH),"invalid message length"},
+{ERR_REASON(RSA_R_INVALID_MGF1_MD)       ,"invalid mgf1 md"},
 {ERR_REASON(RSA_R_INVALID_PADDING)       ,"invalid padding"},
 {ERR_REASON(RSA_R_INVALID_PADDING_MODE)  ,"invalid padding mode"},
+{ERR_REASON(RSA_R_INVALID_PSS_PARAMETERS),"invalid pss parameters"},
 {ERR_REASON(RSA_R_INVALID_PSS_SALTLEN)   ,"invalid pss saltlen"},
+{ERR_REASON(RSA_R_INVALID_SALT_LENGTH)   ,"invalid salt length"},
 {ERR_REASON(RSA_R_INVALID_TRAILER)       ,"invalid trailer"},
 {ERR_REASON(RSA_R_INVALID_X931_DIGEST)   ,"invalid x931 digest"},
 {ERR_REASON(RSA_R_IQMP_NOT_INVERSE_OF_Q) ,"iqmp not inverse of q"},
 {ERR_REASON(RSA_R_KEY_SIZE_TOO_SMALL)    ,"key size too small"},
 {ERR_REASON(RSA_R_LAST_OCTET_INVALID)    ,"last octet invalid"},
 {ERR_REASON(RSA_R_MODULUS_TOO_LARGE)     ,"modulus too large"},
+{ERR_REASON(RSA_R_NON_FIPS_RSA_METHOD)   ,"non fips rsa method"},
 {ERR_REASON(RSA_R_NO_PUBLIC_EXPONENT)    ,"no public exponent"},
 {ERR_REASON(RSA_R_NULL_BEFORE_BLOCK_MISSING),"null before block missing"},
 {ERR_REASON(RSA_R_N_DOES_NOT_EQUAL_P_Q)  ,"n does not equal p q"},
 {ERR_REASON(RSA_R_OAEP_DECODING_ERROR)   ,"oaep decoding error"},
+{ERR_REASON(RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE),"operation not allowed in fips mode"},
 {ERR_REASON(RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE),"operation not supported for this keytype"},
 {ERR_REASON(RSA_R_PADDING_CHECK_FAILED)  ,"padding check failed"},
 {ERR_REASON(RSA_R_P_NOT_PRIME)           ,"p not prime"},
@@ -169,7 +183,12 @@ static ERR_STRING_DATA RSA_str_reasons[]=
 {ERR_REASON(RSA_R_SSLV3_ROLLBACK_ATTACK) ,"sslv3 rollback attack"},
 {ERR_REASON(RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD),"the asn1 object identifier is not known for this md"},
 {ERR_REASON(RSA_R_UNKNOWN_ALGORITHM_TYPE),"unknown algorithm type"},
+{ERR_REASON(RSA_R_UNKNOWN_MASK_DIGEST)   ,"unknown mask digest"},
 {ERR_REASON(RSA_R_UNKNOWN_PADDING_TYPE)  ,"unknown padding type"},
+{ERR_REASON(RSA_R_UNKNOWN_PSS_DIGEST)    ,"unknown pss digest"},
+{ERR_REASON(RSA_R_UNSUPPORTED_MASK_ALGORITHM),"unsupported mask algorithm"},
+{ERR_REASON(RSA_R_UNSUPPORTED_MASK_PARAMETER),"unsupported mask parameter"},
+{ERR_REASON(RSA_R_UNSUPPORTED_SIGNATURE_TYPE),"unsupported signature type"},
 {ERR_REASON(RSA_R_VALUE_MISSING)         ,"value missing"},
 {ERR_REASON(RSA_R_WRONG_SIGNATURE_LENGTH),"wrong signature length"},
 {0,NULL}
diff --git a/src/lib/libcrypto/rsa/rsa_gen.c b/src/lib/libcrypto/rsa/rsa_gen.c
index 767f7ab682..42290cce66 100644
--- a/src/lib/libcrypto/rsa/rsa_gen.c
+++ b/src/lib/libcrypto/rsa/rsa_gen.c
@@ -67,6 +67,9 @@
 #include "cryptlib.h"
 #include <openssl/bn.h>
 #include <openssl/rsa.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 
 static int rsa_builtin_keygen(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb);
 
@@ -77,8 +80,20 @@ static int rsa_builtin_keygen(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb)
  * now just because key-generation is part of RSA_METHOD. */
 int RSA_generate_key_ex(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb)
 	{
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_GENERATE_KEY_EX, RSA_R_NON_FIPS_RSA_METHOD);
+		return 0;
+		}
+#endif
 	if(rsa->meth->rsa_keygen)
 		return rsa->meth->rsa_keygen(rsa, bits, e_value, cb);
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode())
+		return FIPS_rsa_generate_key_ex(rsa, bits, e_value, cb);
+#endif
 	return rsa_builtin_keygen(rsa, bits, e_value, cb);
 	}
 
diff --git a/src/lib/libcrypto/rsa/rsa_lib.c b/src/lib/libcrypto/rsa/rsa_lib.c
index de45088d76..c95ceafc82 100644
--- a/src/lib/libcrypto/rsa/rsa_lib.c
+++ b/src/lib/libcrypto/rsa/rsa_lib.c
@@ -67,6 +67,10 @@
 #include <openssl/engine.h>
 #endif
 
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
+
 const char RSA_version[]="RSA" OPENSSL_VERSION_PTEXT;
 
 static const RSA_METHOD *default_RSA_meth=NULL;
@@ -87,11 +91,14 @@ const RSA_METHOD *RSA_get_default_method(void)
 	{
 	if (default_RSA_meth == NULL)
 		{
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return FIPS_rsa_pkcs1_ssleay();
+		else
+			return RSA_PKCS1_SSLeay();
+#else
 #ifdef RSA_NULL
 		default_RSA_meth=RSA_null_method();
-#else
-#if 0 /* was: #ifdef RSAref */
-		default_RSA_meth=RSA_PKCS1_RSAref();
 #else
 		default_RSA_meth=RSA_PKCS1_SSLeay();
 #endif
@@ -181,7 +188,7 @@ RSA *RSA_new_method(ENGINE *engine)
 	ret->blinding=NULL;
 	ret->mt_blinding=NULL;
 	ret->bignum_data=NULL;
-	ret->flags=ret->meth->flags;
+	ret->flags=ret->meth->flags & ~RSA_FLAG_NON_FIPS_ALLOW;
 	if (!CRYPTO_new_ex_data(CRYPTO_EX_INDEX_RSA, ret, &ret->ex_data))
 		{
 #ifndef OPENSSL_NO_ENGINE
@@ -280,163 +287,6 @@ void *RSA_get_ex_data(const RSA *r, int idx)
 	return(CRYPTO_get_ex_data(&r->ex_data,idx));
 	}
 
-int RSA_size(const RSA *r)
-	{
-	return(BN_num_bytes(r->n));
-	}
-
-int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to,
-	     RSA *rsa, int padding)
-	{
-	return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding));
-	}
-
-int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to,
-	     RSA *rsa, int padding)
-	{
-	return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding));
-	}
-
-int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to,
-	     RSA *rsa, int padding)
-	{
-	return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding));
-	}
-
-int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to,
-	     RSA *rsa, int padding)
-	{
-	return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding));
-	}
-
-int RSA_flags(const RSA *r)
-	{
-	return((r == NULL)?0:r->meth->flags);
-	}
-
-void RSA_blinding_off(RSA *rsa)
-	{
-	if (rsa->blinding != NULL)
-		{
-		BN_BLINDING_free(rsa->blinding);
-		rsa->blinding=NULL;
-		}
-	rsa->flags &= ~RSA_FLAG_BLINDING;
-	rsa->flags |= RSA_FLAG_NO_BLINDING;
-	}
-
-int RSA_blinding_on(RSA *rsa, BN_CTX *ctx)
-	{
-	int ret=0;
-
-	if (rsa->blinding != NULL)
-		RSA_blinding_off(rsa);
-
-	rsa->blinding = RSA_setup_blinding(rsa, ctx);
-	if (rsa->blinding == NULL)
-		goto err;
-
-	rsa->flags |= RSA_FLAG_BLINDING;
-	rsa->flags &= ~RSA_FLAG_NO_BLINDING;
-	ret=1;
-err:
-	return(ret);
-	}
-
-static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p,
-	const BIGNUM *q, BN_CTX *ctx)
-{
-	BIGNUM *ret = NULL, *r0, *r1, *r2;
-
-	if (d == NULL || p == NULL || q == NULL)
-		return NULL;
-
-	BN_CTX_start(ctx);
-	r0 = BN_CTX_get(ctx);
-	r1 = BN_CTX_get(ctx);
-	r2 = BN_CTX_get(ctx);
-	if (r2 == NULL)
-		goto err;
-
-	if (!BN_sub(r1, p, BN_value_one())) goto err;
-	if (!BN_sub(r2, q, BN_value_one())) goto err;
-	if (!BN_mul(r0, r1, r2, ctx)) goto err;
-
-	ret = BN_mod_inverse(NULL, d, r0, ctx);
-err:
-	BN_CTX_end(ctx);
-	return ret;
-}
-
-BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx)
-{
-	BIGNUM local_n;
-	BIGNUM *e,*n;
-	BN_CTX *ctx;
-	BN_BLINDING *ret = NULL;
-
-	if (in_ctx == NULL)
-		{
-		if ((ctx = BN_CTX_new()) == NULL) return 0;
-		}
-	else
-		ctx = in_ctx;
-
-	BN_CTX_start(ctx);
-	e  = BN_CTX_get(ctx);
-	if (e == NULL)
-		{
-		RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE);
-		goto err;
-		}
-
-	if (rsa->e == NULL)
-		{
-		e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx);
-		if (e == NULL)
-			{
-			RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT);
-			goto err;
-			}
-		}
-	else
-		e = rsa->e;
-
-	
-	if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL)
-		{
-		/* if PRNG is not properly seeded, resort to secret
-		 * exponent as unpredictable seed */
-		RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0);
-		}
-
-	if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME))
-		{
-		/* Set BN_FLG_CONSTTIME flag */
-		n = &local_n;
-		BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME);
-		}
-	else
-		n = rsa->n;
-
-	ret = BN_BLINDING_create_param(NULL, e, n, ctx,
-			rsa->meth->bn_mod_exp, rsa->_method_mod_n);
-	if (ret == NULL)
-		{
-		RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB);
-		goto err;
-		}
-	CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret));
-err:
-	BN_CTX_end(ctx);
-	if (in_ctx == NULL)
-		BN_CTX_free(ctx);
-	if(rsa->e == NULL)
-		BN_free(e);
-
-	return ret;
-}
-
 int RSA_memory_lock(RSA *r)
 	{
 	int i,j,k,off;
diff --git a/src/lib/libcrypto/rsa/rsa_oaep.c b/src/lib/libcrypto/rsa/rsa_oaep.c
index 18d307ea9e..553d212ebe 100644
--- a/src/lib/libcrypto/rsa/rsa_oaep.c
+++ b/src/lib/libcrypto/rsa/rsa_oaep.c
@@ -56,7 +56,8 @@ int RSA_padding_add_PKCS1_OAEP(unsigned char *to, int tlen,
 	seed = to + 1;
 	db = to + SHA_DIGEST_LENGTH + 1;
 
-	EVP_Digest((void *)param, plen, db, NULL, EVP_sha1(), NULL);
+	if (!EVP_Digest((void *)param, plen, db, NULL, EVP_sha1(), NULL))
+		return 0;
 	memset(db + SHA_DIGEST_LENGTH, 0,
 		emlen - flen - 2 * SHA_DIGEST_LENGTH - 1);
 	db[emlen - flen - SHA_DIGEST_LENGTH - 1] = 0x01;
@@ -145,7 +146,8 @@ int RSA_padding_check_PKCS1_OAEP(unsigned char *to, int tlen,
 	for (i = 0; i < dblen; i++)
 		db[i] ^= maskeddb[i];
 
-	EVP_Digest((void *)param, plen, phash, NULL, EVP_sha1(), NULL);
+	if (!EVP_Digest((void *)param, plen, phash, NULL, EVP_sha1(), NULL))
+		return -1;
 
 	if (memcmp(db, phash, SHA_DIGEST_LENGTH) != 0 || bad)
 		goto decoding_err;
diff --git a/src/lib/libcrypto/rsa/rsa_pmeth.c b/src/lib/libcrypto/rsa/rsa_pmeth.c
index c6892ecd09..5b2ecf56ad 100644
--- a/src/lib/libcrypto/rsa/rsa_pmeth.c
+++ b/src/lib/libcrypto/rsa/rsa_pmeth.c
@@ -63,6 +63,12 @@
 #include <openssl/rsa.h>
 #include <openssl/bn.h>
 #include <openssl/evp.h>
+#ifndef OPENSSL_NO_CMS
+#include <openssl/cms.h>
+#endif
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 #include "evp_locl.h"
 #include "rsa_locl.h"
 
@@ -79,6 +85,8 @@ typedef struct
 	int pad_mode;
 	/* message digest */
 	const EVP_MD *md;
+	/* message digest for MGF1 */
+	const EVP_MD *mgf1md;
 	/* PSS/OAEP salt length */
 	int saltlen;
 	/* Temp buffer */
@@ -95,6 +103,7 @@ static int pkey_rsa_init(EVP_PKEY_CTX *ctx)
 	rctx->pub_exp = NULL;
 	rctx->pad_mode = RSA_PKCS1_PADDING;
 	rctx->md = NULL;
+	rctx->mgf1md = NULL;
 	rctx->tbuf = NULL;
 
 	rctx->saltlen = -2;
@@ -147,6 +156,31 @@ static void pkey_rsa_cleanup(EVP_PKEY_CTX *ctx)
 		OPENSSL_free(rctx);
 		}
 	}
+#ifdef OPENSSL_FIPS
+/* FIP checker. Return value indicates status of context parameters:
+ * 1  : redirect to FIPS.
+ * 0  : don't redirect to FIPS.
+ * -1 : illegal operation in FIPS mode.
+ */
+
+static int pkey_fips_check_ctx(EVP_PKEY_CTX *ctx)
+	{
+	RSA_PKEY_CTX *rctx = ctx->data;
+	RSA *rsa = ctx->pkey->pkey.rsa;
+	int rv = -1;
+	if (!FIPS_mode())
+		return 0;
+	if (rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)
+		rv = 0;
+	if (!(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) && rv)
+		return -1;
+	if (rctx->md && !(rctx->md->flags & EVP_MD_FLAG_FIPS))
+		return rv;
+	if (rctx->mgf1md && !(rctx->mgf1md->flags & EVP_MD_FLAG_FIPS))
+		return rv;
+	return 1;
+	}
+#endif
 
 static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
 					const unsigned char *tbs, size_t tbslen)
@@ -155,6 +189,15 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
 	RSA_PKEY_CTX *rctx = ctx->data;
 	RSA *rsa = ctx->pkey->pkey.rsa;
 
+#ifdef OPENSSL_FIPS
+	ret = pkey_fips_check_ctx(ctx);
+	if (ret < 0)
+		{
+		RSAerr(RSA_F_PKEY_RSA_SIGN, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE);
+		return -1;
+		}
+#endif
+
 	if (rctx->md)
 		{
 		if (tbslen != (size_t)EVP_MD_size(rctx->md))
@@ -163,7 +206,36 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
 					RSA_R_INVALID_DIGEST_LENGTH);
 			return -1;
 			}
-		if (rctx->pad_mode == RSA_X931_PADDING)
+#ifdef OPENSSL_FIPS
+		if (ret > 0)
+			{
+			unsigned int slen;
+			ret = FIPS_rsa_sign_digest(rsa, tbs, tbslen, rctx->md,
+							rctx->pad_mode,
+							rctx->saltlen,
+							rctx->mgf1md,
+							sig, &slen);
+			if (ret > 0)
+				*siglen = slen;
+			else
+				*siglen = 0;
+			return ret;
+			}
+#endif
+
+		if (EVP_MD_type(rctx->md) == NID_mdc2)
+			{
+			unsigned int sltmp;
+			if (rctx->pad_mode != RSA_PKCS1_PADDING)
+				return -1;
+			ret = RSA_sign_ASN1_OCTET_STRING(NID_mdc2,
+						tbs, tbslen, sig, &sltmp, rsa);
+
+			if (ret <= 0)
+				return ret;
+			ret = sltmp;
+			}
+		else if (rctx->pad_mode == RSA_X931_PADDING)
 			{
 			if (!setup_tbuf(rctx, ctx))
 				return -1;
@@ -186,8 +258,10 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen,
 			{
 			if (!setup_tbuf(rctx, ctx))
 				return -1;
-			if (!RSA_padding_add_PKCS1_PSS(rsa, rctx->tbuf, tbs,
-						rctx->md, rctx->saltlen))
+			if (!RSA_padding_add_PKCS1_PSS_mgf1(rsa,
+						rctx->tbuf, tbs,
+						rctx->md, rctx->mgf1md,
+						rctx->saltlen))
 				return -1;
 			ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf,
 						sig, rsa, RSA_NO_PADDING);
@@ -269,8 +343,30 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx,
 	RSA_PKEY_CTX *rctx = ctx->data;
 	RSA *rsa = ctx->pkey->pkey.rsa;
 	size_t rslen;
+#ifdef OPENSSL_FIPS
+	int rv;
+	rv = pkey_fips_check_ctx(ctx);
+	if (rv < 0)
+		{
+		RSAerr(RSA_F_PKEY_RSA_VERIFY, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE);
+		return -1;
+		}
+#endif
 	if (rctx->md)
 		{
+#ifdef OPENSSL_FIPS
+		if (rv > 0)
+			{
+			return FIPS_rsa_verify_digest(rsa,
+							tbs, tbslen,
+							rctx->md,
+							rctx->pad_mode,
+							rctx->saltlen,
+							rctx->mgf1md,
+							sig, siglen);
+							
+			}
+#endif
 		if (rctx->pad_mode == RSA_PKCS1_PADDING)
 			return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen,
 					sig, siglen, rsa);
@@ -289,7 +385,8 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx,
 							rsa, RSA_NO_PADDING);
 			if (ret <= 0)
 				return 0;
-			ret = RSA_verify_PKCS1_PSS(rsa, tbs, rctx->md,
+			ret = RSA_verify_PKCS1_PSS_mgf1(rsa, tbs,
+						rctx->md, rctx->mgf1md,
 						rctx->tbuf, rctx->saltlen);
 			if (ret <= 0)
 				return 0;
@@ -403,15 +500,25 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
 				RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE);
 		return -2;
 
+		case EVP_PKEY_CTRL_GET_RSA_PADDING:
+		*(int *)p2 = rctx->pad_mode;
+		return 1;
+
 		case EVP_PKEY_CTRL_RSA_PSS_SALTLEN:
-		if (p1 < -2)
-			return -2;
+		case EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN:
 		if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING)
 			{
 			RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PSS_SALTLEN);
 			return -2;
 			}
-		rctx->saltlen = p1;
+		if (type == EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN)
+			*(int *)p2 = rctx->saltlen;
+		else
+			{
+			if (p1 < -2)
+				return -2;
+			rctx->saltlen = p1;
+			}
 		return 1;
 
 		case EVP_PKEY_CTRL_RSA_KEYGEN_BITS:
@@ -435,16 +542,45 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2)
 		rctx->md = p2;
 		return 1;
 
+		case EVP_PKEY_CTRL_RSA_MGF1_MD:
+		case EVP_PKEY_CTRL_GET_RSA_MGF1_MD:
+		if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING)
+			{
+			RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_MGF1_MD);
+			return -2;
+			}
+		if (type == EVP_PKEY_CTRL_GET_RSA_MGF1_MD)
+			{
+			if (rctx->mgf1md)
+				*(const EVP_MD **)p2 = rctx->mgf1md;
+			else
+				*(const EVP_MD **)p2 = rctx->md;
+			}
+		else
+			rctx->mgf1md = p2;
+		return 1;
+
 		case EVP_PKEY_CTRL_DIGESTINIT:
 		case EVP_PKEY_CTRL_PKCS7_ENCRYPT:
 		case EVP_PKEY_CTRL_PKCS7_DECRYPT:
 		case EVP_PKEY_CTRL_PKCS7_SIGN:
+		return 1;
 #ifndef OPENSSL_NO_CMS
-		case EVP_PKEY_CTRL_CMS_ENCRYPT:
 		case EVP_PKEY_CTRL_CMS_DECRYPT:
+		{
+		X509_ALGOR *alg = NULL;
+		ASN1_OBJECT *encalg = NULL;
+		if (p2)
+			CMS_RecipientInfo_ktri_get0_algs(p2, NULL, NULL, &alg);
+		if (alg)
+			X509_ALGOR_get0(&encalg, NULL, NULL, alg);
+		if (encalg && OBJ_obj2nid(encalg) == NID_rsaesOaep)
+			rctx->pad_mode = RSA_PKCS1_OAEP_PADDING;
+		}
+		case EVP_PKEY_CTRL_CMS_ENCRYPT:
 		case EVP_PKEY_CTRL_CMS_SIGN:
-#endif
 		return 1;
+#endif
 		case EVP_PKEY_CTRL_PEER_KEY:
 			RSAerr(RSA_F_PKEY_RSA_CTRL,
 			RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE);
diff --git a/src/lib/libcrypto/rsa/rsa_pss.c b/src/lib/libcrypto/rsa/rsa_pss.c
index ac211e2ffe..5f9f533d0c 100644
--- a/src/lib/libcrypto/rsa/rsa_pss.c
+++ b/src/lib/libcrypto/rsa/rsa_pss.c
@@ -73,6 +73,13 @@ static const unsigned char zeroes[] = {0,0,0,0,0,0,0,0};
 int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
 			const EVP_MD *Hash, const unsigned char *EM, int sLen)
 	{
+	return RSA_verify_PKCS1_PSS_mgf1(rsa, mHash, Hash, NULL, EM, sLen);
+	}
+
+int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash,
+			const EVP_MD *Hash, const EVP_MD *mgf1Hash,
+			const unsigned char *EM, int sLen)
+	{
 	int i;
 	int ret = 0;
 	int hLen, maskedDBLen, MSBits, emLen;
@@ -80,6 +87,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
 	unsigned char *DB = NULL;
 	EVP_MD_CTX ctx;
 	unsigned char H_[EVP_MAX_MD_SIZE];
+	EVP_MD_CTX_init(&ctx);
+
+	if (mgf1Hash == NULL)
+		mgf1Hash = Hash;
 
 	hLen = EVP_MD_size(Hash);
 	if (hLen < 0)
@@ -94,7 +105,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
 	else if (sLen == -2)	sLen = -2;
 	else if (sLen < -2)
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
 		goto err;
 		}
 
@@ -102,7 +113,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
 	emLen = RSA_size(rsa);
 	if (EM[0] & (0xFF << MSBits))
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_FIRST_OCTET_INVALID);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_FIRST_OCTET_INVALID);
 		goto err;
 		}
 	if (MSBits == 0)
@@ -112,12 +123,12 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
 		}
 	if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_DATA_TOO_LARGE);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_DATA_TOO_LARGE);
 		goto err;
 		}
 	if (EM[emLen - 1] != 0xbc)
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_LAST_OCTET_INVALID);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_LAST_OCTET_INVALID);
 		goto err;
 		}
 	maskedDBLen = emLen - hLen - 1;
@@ -125,10 +136,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
 	DB = OPENSSL_malloc(maskedDBLen);
 	if (!DB)
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, ERR_R_MALLOC_FAILURE);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, ERR_R_MALLOC_FAILURE);
 		goto err;
 		}
-	if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, Hash) < 0)
+	if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, mgf1Hash) < 0)
 		goto err;
 	for (i = 0; i < maskedDBLen; i++)
 		DB[i] ^= EM[i];
@@ -137,25 +148,28 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
 	for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ;
 	if (DB[i++] != 0x1)
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_RECOVERY_FAILED);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_RECOVERY_FAILED);
 		goto err;
 		}
 	if (sLen >= 0 && (maskedDBLen - i) != sLen)
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
 		goto err;
 		}
-	EVP_MD_CTX_init(&ctx);
-	EVP_DigestInit_ex(&ctx, Hash, NULL);
-	EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes);
-	EVP_DigestUpdate(&ctx, mHash, hLen);
+	if (!EVP_DigestInit_ex(&ctx, Hash, NULL)
+		|| !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes)
+		|| !EVP_DigestUpdate(&ctx, mHash, hLen))
+		goto err;
 	if (maskedDBLen - i)
-		EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i);
-	EVP_DigestFinal(&ctx, H_, NULL);
-	EVP_MD_CTX_cleanup(&ctx);
+		{
+		if (!EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i))
+			goto err;
+		}
+	if (!EVP_DigestFinal_ex(&ctx, H_, NULL))
+		goto err;
 	if (memcmp(H_, H, hLen))
 		{
-		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_BAD_SIGNATURE);
+		RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_BAD_SIGNATURE);
 		ret = 0;
 		}
 	else 
@@ -164,6 +178,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash,
 	err:
 	if (DB)
 		OPENSSL_free(DB);
+	EVP_MD_CTX_cleanup(&ctx);
 
 	return ret;
 
@@ -173,12 +188,22 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
 			const unsigned char *mHash,
 			const EVP_MD *Hash, int sLen)
 	{
+	return RSA_padding_add_PKCS1_PSS_mgf1(rsa, EM, mHash, Hash, NULL, sLen);
+	}
+
+int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM,
+			const unsigned char *mHash,
+			const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen)
+	{
 	int i;
 	int ret = 0;
 	int hLen, maskedDBLen, MSBits, emLen;
 	unsigned char *H, *salt = NULL, *p;
 	EVP_MD_CTX ctx;
 
+	if (mgf1Hash == NULL)
+		mgf1Hash = Hash;
+
 	hLen = EVP_MD_size(Hash);
 	if (hLen < 0)
 		goto err;
@@ -192,7 +217,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
 	else if (sLen == -2)	sLen = -2;
 	else if (sLen < -2)
 		{
-		RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED);
+		RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED);
 		goto err;
 		}
 
@@ -209,8 +234,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
 		}
 	else if (emLen < (hLen + sLen + 2))
 		{
-		RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS,
-		   RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE);
+		RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE);
 		goto err;
 		}
 	if (sLen > 0)
@@ -218,8 +242,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
 		salt = OPENSSL_malloc(sLen);
 		if (!salt)
 			{
-			RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS,
-		   		ERR_R_MALLOC_FAILURE);
+			RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,ERR_R_MALLOC_FAILURE);
 			goto err;
 			}
 		if (RAND_bytes(salt, sLen) <= 0)
@@ -228,16 +251,18 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM,
 	maskedDBLen = emLen - hLen - 1;
 	H = EM + maskedDBLen;
 	EVP_MD_CTX_init(&ctx);
-	EVP_DigestInit_ex(&ctx, Hash, NULL);
-	EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes);
-	EVP_DigestUpdate(&ctx, mHash, hLen);
-	if (sLen)
-		EVP_DigestUpdate(&ctx, salt, sLen);
-	EVP_DigestFinal(&ctx, H, NULL);
+	if (!EVP_DigestInit_ex(&ctx, Hash, NULL)
+		|| !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes)
+		|| !EVP_DigestUpdate(&ctx, mHash, hLen))
+		goto err;
+	if (sLen && !EVP_DigestUpdate(&ctx, salt, sLen))
+		goto err;
+	if (!EVP_DigestFinal_ex(&ctx, H, NULL))
+		goto err;
 	EVP_MD_CTX_cleanup(&ctx);
 
 	/* Generate dbMask in place then perform XOR on it */
-	if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, Hash))
+	if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, mgf1Hash))
 		goto err;
 
 	p = EM;
diff --git a/src/lib/libcrypto/rsa/rsa_sign.c b/src/lib/libcrypto/rsa/rsa_sign.c
index 0be4ec7fb0..b6f6037ae0 100644
--- a/src/lib/libcrypto/rsa/rsa_sign.c
+++ b/src/lib/libcrypto/rsa/rsa_sign.c
@@ -77,6 +77,14 @@ int RSA_sign(int type, const unsigned char *m, unsigned int m_len,
 	const unsigned char *s = NULL;
 	X509_ALGOR algor;
 	ASN1_OCTET_STRING digest;
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_RSA_SIGN, RSA_R_NON_FIPS_RSA_METHOD);
+		return 0;
+		}
+#endif
 	if((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_sign)
 		{
 		return rsa->meth->rsa_sign(type, m, m_len,
@@ -153,6 +161,15 @@ int int_rsa_verify(int dtype, const unsigned char *m,
 	unsigned char *s;
 	X509_SIG *sig=NULL;
 
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD)
+			&& !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW))
+		{
+		RSAerr(RSA_F_INT_RSA_VERIFY, RSA_R_NON_FIPS_RSA_METHOD);
+		return 0;
+		}
+#endif
+
 	if (siglen != (unsigned int)RSA_size(rsa))
 		{
 		RSAerr(RSA_F_INT_RSA_VERIFY,RSA_R_WRONG_SIGNATURE_LENGTH);
@@ -182,6 +199,22 @@ int int_rsa_verify(int dtype, const unsigned char *m,
 	i=RSA_public_decrypt((int)siglen,sigbuf,s,rsa,RSA_PKCS1_PADDING);
 
 	if (i <= 0) goto err;
+	/* Oddball MDC2 case: signature can be OCTET STRING.
+	 * check for correct tag and length octets.
+	 */
+	if (dtype == NID_mdc2 && i == 18 && s[0] == 0x04 && s[1] == 0x10)
+		{
+		if (rm)
+			{
+			memcpy(rm, s + 2, 16);
+			*prm_len = 16;
+			ret = 1;
+			}
+		else if(memcmp(m, s + 2, 16))
+			RSAerr(RSA_F_INT_RSA_VERIFY,RSA_R_BAD_SIGNATURE);
+		else
+			ret = 1;
+		}
 
 	/* Special case: SSL signature */
 	if(dtype == NID_md5_sha1) {
diff --git a/src/lib/libcrypto/s390xcap.c b/src/lib/libcrypto/s390xcap.c
index ffbe0235f9..f2e94ef47e 100644
--- a/src/lib/libcrypto/s390xcap.c
+++ b/src/lib/libcrypto/s390xcap.c
@@ -4,7 +4,7 @@
 #include <setjmp.h>
 #include <signal.h>
 
-extern unsigned long OPENSSL_s390xcap_P;
+extern unsigned long OPENSSL_s390xcap_P[];
 
 static sigjmp_buf ill_jmp;
 static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
@@ -16,7 +16,9 @@ void OPENSSL_cpuid_setup(void)
 	sigset_t oset;
 	struct sigaction ill_act,oact;
 
-	if (OPENSSL_s390xcap_P) return;
+	if (OPENSSL_s390xcap_P[0]) return;
+
+	OPENSSL_s390xcap_P[0] = 1UL<<(8*sizeof(unsigned long)-1);
 
 	memset(&ill_act,0,sizeof(ill_act));
 	ill_act.sa_handler = ill_handler;
@@ -27,10 +29,8 @@ void OPENSSL_cpuid_setup(void)
 	sigaction (SIGILL,&ill_act,&oact);
 
 	/* protection against missing store-facility-list-extended */
-	if (sigsetjmp(ill_jmp,0) == 0)
-		OPENSSL_s390xcap_P = OPENSSL_s390x_facilities();
-	else
-		OPENSSL_s390xcap_P = 1UL<<63;
+	if (sigsetjmp(ill_jmp,1) == 0)
+		OPENSSL_s390x_facilities();
 
 	sigaction (SIGILL,&oact,NULL);
 	sigprocmask(SIG_SETMASK,&oset,NULL);
diff --git a/src/lib/libcrypto/s390xcpuid.S b/src/lib/libcrypto/s390xcpuid.S
index b053c6a281..06815347e6 100644
--- a/src/lib/libcrypto/s390xcpuid.S
+++ b/src/lib/libcrypto/s390xcpuid.S
@@ -5,10 +5,14 @@
 .align	16
 OPENSSL_s390x_facilities:
 	lghi	%r0,0
-	.long	0xb2b0f010	# stfle	16(%r15)
-	lg	%r2,16(%r15)
-	larl	%r1,OPENSSL_s390xcap_P
-	stg	%r2,0(%r1)
+	larl	%r2,OPENSSL_s390xcap_P
+	stg	%r0,8(%r2)
+	.long	0xb2b02000	# stfle	0(%r2)
+	brc	8,.Ldone
+	lghi	%r0,1
+	.long	0xb2b02000	# stfle 0(%r2)
+.Ldone:
+	lg	%r2,0(%r2)
 	br	%r14
 .size	OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities
 
@@ -58,6 +62,9 @@ OPENSSL_wipe_cpu:
 .type	OPENSSL_cleanse,@function
 .align	16
 OPENSSL_cleanse:
+#if !defined(__s390x__) && !defined(__s390x)
+	llgfr	%r3,%r3
+#endif
 	lghi	%r4,15
 	lghi	%r0,0
 	clgr	%r3,%r4
@@ -89,4 +96,4 @@ OPENSSL_cleanse:
 .section	.init
 	brasl	%r14,OPENSSL_cpuid_setup
 
-.comm	OPENSSL_s390xcap_P,8,8
+.comm	OPENSSL_s390xcap_P,16,8
diff --git a/src/lib/libcrypto/sha/asm/sha1-586.pl b/src/lib/libcrypto/sha/asm/sha1-586.pl
index a1f876281a..1084d227fe 100644
--- a/src/lib/libcrypto/sha/asm/sha1-586.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-586.pl
@@ -12,6 +12,8 @@
 # commentary below], and in 2006 the rest was rewritten in order to
 # gain freedom to liberate licensing terms.
 
+# January, September 2004.
+#
 # It was noted that Intel IA-32 C compiler generates code which
 # performs ~30% *faster* on P4 CPU than original *hand-coded*
 # SHA1 assembler implementation. To address this problem (and
@@ -31,12 +33,92 @@
 # ----------------------------------------------------------------
 #					<appro@fy.chalmers.se>
 
+# August 2009.
+#
+# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as
+# '(c&d) + (b&(c^d))', which allows to accumulate partial results
+# and lighten "pressure" on scratch registers. This resulted in
+# >12% performance improvement on contemporary AMD cores (with no
+# degradation on other CPUs:-). Also, the code was revised to maximize
+# "distance" between instructions producing input to 'lea' instruction
+# and the 'lea' instruction itself, which is essential for Intel Atom
+# core and resulted in ~15% improvement.
+
+# October 2010.
+#
+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
+# is to offload message schedule denoted by Wt in NIST specification,
+# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel,
+# and in SSE2 context was first explored by Dean Gaudet in 2004, see
+# http://arctic.org/~dean/crypto/sha1.html. Since then several things
+# have changed that made it interesting again:
+#
+# a) XMM units became faster and wider;
+# b) instruction set became more versatile;
+# c) an important observation was made by Max Locktykhin, which made
+#    it possible to reduce amount of instructions required to perform
+#    the operation in question, for further details see
+#    http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/.
+
+# April 2011.
+#
+# Add AVX code path, probably most controversial... The thing is that
+# switch to AVX alone improves performance by as little as 4% in
+# comparison to SSSE3 code path. But below result doesn't look like
+# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as
+# pair of �-ops, and it's the additional �-ops, two per round, that
+# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded
+# as single �-op by Sandy Bridge and it's replacing 'ro[rl]' with
+# equivalent 'sh[rl]d' that is responsible for the impressive 5.1
+# cycles per processed byte. But 'sh[rl]d' is not something that used
+# to be fast, nor does it appear to be fast in upcoming Bulldozer
+# [according to its optimization manual]. Which is why AVX code path
+# is guarded by *both* AVX and synthetic bit denoting Intel CPUs.
+# One can argue that it's unfair to AMD, but without 'sh[rl]d' it
+# makes no sense to keep the AVX code path. If somebody feels that
+# strongly, it's probably more appropriate to discuss possibility of
+# using vector rotate XOP on AMD...
+
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+#		x86		SSSE3		AVX
+# Pentium	15.7		-
+# PIII		11.5		-
+# P4		10.6		-
+# AMD K8	7.1		-
+# Core2		7.3		6.1/+20%	-
+# Atom		12.5		9.5(*)/+32%	-
+# Westmere	7.3		5.6/+30%	-
+# Sandy Bridge	8.8		6.2/+40%	5.1(**)/+70%
+#
+# (*)	Loop is 1056 instructions long and expected result is ~8.25.
+#	It remains mystery [to me] why ILP is limited to 1.7.
+#
+# (**)	As per above comment, the result is for AVX *plus* sh[rl]d.
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
 &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");
 
+$xmm=$ymm=0;
+for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+$ymm=1 if ($xmm &&
+		`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+			=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+		$1>=2.19);	# first version supporting AVX
+
+$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && 
+		`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+		$1>=2.03);	# first version supporting AVX
+
+&external_label("OPENSSL_ia32cap_P") if ($xmm);
+
+
 $A="eax";
 $B="ebx";
 $C="ecx";
@@ -47,6 +129,10 @@ $tmp1="ebp";
 
 @V=($A,$B,$C,$D,$E,$T);
 
+$alt=0;	# 1 denotes alternative IALU implementation, which performs
+	# 8% *worse* on P4, same on Westmere and Atom, 2% better on
+	# Sandy Bridge...
+
 sub BODY_00_15
 	{
 	local($n,$a,$b,$c,$d,$e,$f)=@_;
@@ -59,16 +145,18 @@ sub BODY_00_15
 	&rotl($tmp1,5);			# tmp1=ROTATE(a,5)
 	 &xor($f,$d);
 	&add($tmp1,$e);			# tmp1+=e;
-	 &and($f,$b);
-	&mov($e,&swtmp($n%16));		# e becomes volatile and is loaded
+	 &mov($e,&swtmp($n%16));	# e becomes volatile and is loaded
 	 				# with xi, also note that e becomes
 					# f in next round...
-	 &xor($f,$d);			# f holds F_00_19(b,c,d)
+	&and($f,$b);
 	&rotr($b,2);			# b=ROTATE(b,30)
-	 &lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi
+	 &xor($f,$d);			# f holds F_00_19(b,c,d)
+	&lea($tmp1,&DWP(0x5a827999,$tmp1,$e));	# tmp1+=K_00_19+xi
 
-	if ($n==15) { &add($f,$tmp1); }	# f+=tmp1
+	if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round
+		      &add($f,$tmp1); }	# f+=tmp1
 	else        { &add($tmp1,$f); }	# f becomes a in next round
+	&mov($tmp1,$a)			if ($alt && $n==15);
 	}
 
 sub BODY_16_19
@@ -77,22 +165,41 @@ sub BODY_16_19
 
 	&comment("16_19 $n");
 
-	&mov($f,&swtmp($n%16));		# f to hold Xupdate(xi,xa,xb,xc,xd)
-	 &mov($tmp1,$c);		# tmp1 to hold F_00_19(b,c,d)
-	&xor($f,&swtmp(($n+2)%16));
-	 &xor($tmp1,$d);
-	&xor($f,&swtmp(($n+8)%16));
-	 &and($tmp1,$b);		# tmp1 holds F_00_19(b,c,d)
-	&rotr($b,2);			# b=ROTATE(b,30)
+if ($alt) {
+	&xor($c,$d);
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&and($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d), b&=c^d
+	 &xor($f,&swtmp(($n+8)%16));
+	&xor($tmp1,$d);			# tmp1=F_00_19(b,c,d)
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &add($e,$tmp1);		# e+=F_00_19(b,c,d)
+	&xor($c,$d);			# restore $c
+	 &mov($tmp1,$a);		# b in next round
+	&rotr($b,$n==16?2:7);		# b=ROTATE(b,30)
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&rotl($a,5);			# ROTATE(a,5)
+	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$a);			# f+=ROTATE(a,5)
+} else {
+	&mov($tmp1,$c);			# tmp1 to hold F_00_19(b,c,d)
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&xor($tmp1,$d);
+	 &xor($f,&swtmp(($n+8)%16));
+	&and($tmp1,$b);
 	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
 	&rotl($f,1);			# f=ROTATE(f,1)
 	 &xor($tmp1,$d);		# tmp1=F_00_19(b,c,d)
-	&mov(&swtmp($n%16),$f);		# xi=f
-	&lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e
-	 &mov($e,$a);			# e becomes volatile
-	&rotl($e,5);			# e=ROTATE(a,5)
-	 &add($f,$tmp1);		# f+=F_00_19(b,c,d)
-	&add($f,$e);			# f+=ROTATE(a,5)
+	&add($e,$tmp1);			# e+=F_00_19(b,c,d)
+	 &mov($tmp1,$a);
+	&rotr($b,2);			# b=ROTATE(b,30)
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&rotl($tmp1,5);			# ROTATE(a,5)
+	 &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$tmp1);		# f+=ROTATE(a,5)
+}
 	}
 
 sub BODY_20_39
@@ -102,21 +209,41 @@ sub BODY_20_39
 
 	&comment("20_39 $n");
 
+if ($alt) {
+	&xor($tmp1,$c);			# tmp1 to hold F_20_39(b,c,d), b^=c
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
+	 &xor($f,&swtmp(($n+8)%16));
+	&add($e,$tmp1);			# e+=F_20_39(b,c,d)
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &mov($tmp1,$a);		# b in next round
+	&rotr($b,7);			# b=ROTATE(b,30)
+	 &mov(&swtmp($n%16),$f)		if($n<77);# xi=f
+	&rotl($a,5);			# ROTATE(a,5)
+	 &xor($b,$c)			if($n==39);# warm up for BODY_40_59
+	&and($tmp1,$b)			if($n==39);
+	 &lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
+	&mov($e,&swtmp(($n+1)%16))	if($n<79);# pre-fetch f for next round
+	 &add($f,$a);			# f+=ROTATE(a,5)
+	&rotr($a,5)			if ($n==79);
+} else {
 	&mov($tmp1,$b);			# tmp1 to hold F_20_39(b,c,d)
-	 &mov($f,&swtmp($n%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
-	&rotr($b,2);			# b=ROTATE(b,30)
-	 &xor($f,&swtmp(($n+2)%16));
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
 	&xor($tmp1,$c);
 	 &xor($f,&swtmp(($n+8)%16));
 	&xor($tmp1,$d);			# tmp1 holds F_20_39(b,c,d)
 	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
 	&rotl($f,1);			# f=ROTATE(f,1)
-	 &add($tmp1,$e);
-	&mov(&swtmp($n%16),$f);		# xi=f
-	 &mov($e,$a);			# e becomes volatile
-	&rotl($e,5);			# e=ROTATE(a,5)
-	 &lea($f,&DWP($K,$f,$tmp1));	# f+=K_20_39+e
-	&add($f,$e);			# f+=ROTATE(a,5)
+	 &add($e,$tmp1);		# e+=F_20_39(b,c,d)
+	&rotr($b,2);			# b=ROTATE(b,30)
+	 &mov($tmp1,$a);
+	&rotl($tmp1,5);			# ROTATE(a,5)
+	 &mov(&swtmp($n%16),$f) if($n<77);# xi=f
+	&lea($f,&DWP($K,$f,$e));	# f+=e+K_XX_YY
+	 &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round
+	&add($f,$tmp1);			# f+=ROTATE(a,5)
+}
 	}
 
 sub BODY_40_59
@@ -125,41 +252,86 @@ sub BODY_40_59
 
 	&comment("40_59 $n");
 
-	&mov($f,&swtmp($n%16));		# f to hold Xupdate(xi,xa,xb,xc,xd)
-	 &mov($tmp1,&swtmp(($n+2)%16));
-	&xor($f,$tmp1);
-	 &mov($tmp1,&swtmp(($n+8)%16));
-	&xor($f,$tmp1);
-	 &mov($tmp1,&swtmp(($n+13)%16));
-	&xor($f,$tmp1);			# f holds xa^xb^xc^xd
-	 &mov($tmp1,$b);		# tmp1 to hold F_40_59(b,c,d)
+if ($alt) {
+	&add($e,$tmp1);			# e+=b&(c^d)
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&mov($tmp1,$d);
+	 &xor($f,&swtmp(($n+8)%16));
+	&xor($c,$d);			# restore $c
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
 	&rotl($f,1);			# f=ROTATE(f,1)
-	 &or($tmp1,$c);
-	&mov(&swtmp($n%16),$f);		# xi=f
-	 &and($tmp1,$d);
-	&lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e
-	 &mov($e,$b);			# e becomes volatile and is used
-					# to calculate F_40_59(b,c,d)
+	 &and($tmp1,$c);
+	&rotr($b,7);			# b=ROTATE(b,30)
+	 &add($e,$tmp1);		# e+=c&d
+	&mov($tmp1,$a);			# b in next round
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&rotl($a,5);			# ROTATE(a,5)
+	 &xor($b,$c)			if ($n<59);
+	&and($tmp1,$b)			if ($n<59);# tmp1 to hold F_40_59(b,c,d)
+	 &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d))
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$a);			# f+=ROTATE(a,5)
+} else {
+	&mov($tmp1,$c);			# tmp1 to hold F_40_59(b,c,d)
+	 &xor($f,&swtmp(($n+2)%16));	# f to hold Xupdate(xi,xa,xb,xc,xd)
+	&xor($tmp1,$d);
+	 &xor($f,&swtmp(($n+8)%16));
+	&and($tmp1,$b);
+	 &xor($f,&swtmp(($n+13)%16));	# f holds xa^xb^xc^xd
+	&rotl($f,1);			# f=ROTATE(f,1)
+	 &add($tmp1,$e);		# b&(c^d)+=e
 	&rotr($b,2);			# b=ROTATE(b,30)
-	 &and($e,$c);
-	&or($tmp1,$e);			# tmp1 holds F_40_59(b,c,d)		
-	 &mov($e,$a);
-	&rotl($e,5);			# e=ROTATE(a,5)
-	 &add($f,$tmp1);		# f+=tmp1;
+	 &mov($e,$a);			# e becomes volatile
+	&rotl($e,5);			# ROTATE(a,5)
+	 &mov(&swtmp($n%16),$f);	# xi=f
+	&lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d))
+	 &mov($tmp1,$c);
 	&add($f,$e);			# f+=ROTATE(a,5)
+	 &and($tmp1,$d);
+	&mov($e,&swtmp(($n+1)%16));	# pre-fetch f for next round
+	 &add($f,$tmp1);		# f+=c&d
+}
 	}
 
 &function_begin("sha1_block_data_order");
+if ($xmm) {
+  &static_label("ssse3_shortcut");
+  &static_label("avx_shortcut")		if ($ymm);
+  &static_label("K_XX_XX");
+
+	&call	(&label("pic_point"));	# make it PIC!
+  &set_label("pic_point");
+	&blindpop($tmp1);
+	&picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point"));
+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+
+	&mov	($A,&DWP(0,$T));
+	&mov	($D,&DWP(4,$T));
+	&test	($D,1<<9);		# check SSSE3 bit
+	&jz	(&label("x86"));
+	&test	($A,1<<24);		# check FXSR bit
+	&jz	(&label("x86"));
+	if ($ymm) {
+		&and	($D,1<<28);		# mask AVX bit
+		&and	($A,1<<30);		# mask "Intel CPU" bit
+		&or	($A,$D);
+		&cmp	($A,1<<28|1<<30);
+		&je	(&label("avx_shortcut"));
+	}
+	&jmp	(&label("ssse3_shortcut"));
+  &set_label("x86",16);
+}
 	&mov($tmp1,&wparam(0));	# SHA_CTX *c
 	&mov($T,&wparam(1));	# const void *input
 	&mov($A,&wparam(2));	# size_t num
-	&stack_push(16);	# allocate X[16]
+	&stack_push(16+3);	# allocate X[16]
 	&shl($A,6);
 	&add($A,$T);
 	&mov(&wparam(2),$A);	# pointer beyond the end of input
 	&mov($E,&DWP(16,$tmp1));# pre-load E
+	&jmp(&label("loop"));
 
-	&set_label("loop",16);
+&set_label("loop",16);
 
 	# copy input chunk to X, but reversing byte order!
 	for ($i=0; $i<16; $i+=4)
@@ -213,8 +385,845 @@ sub BODY_40_59
 	&mov(&DWP(16,$tmp1),$C);
 	&jb(&label("loop"));
 
-	&stack_pop(16);
+	&stack_pop(16+3);
 &function_end("sha1_block_data_order");
+
+if ($xmm) {
+######################################################################
+# The SSSE3 implementation.
+#
+# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last
+# 32 elements of the message schedule or Xupdate outputs. First 4
+# quadruples are simply byte-swapped input, next 4 are calculated
+# according to method originally suggested by Dean Gaudet (modulo
+# being implemented in SSSE3). Once 8 quadruples or 32 elements are
+# collected, it switches to routine proposed by Max Locktyukhin.
+#
+# Calculations inevitably require temporary reqisters, and there are
+# no %xmm registers left to spare. For this reason part of the ring
+# buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring
+# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] -
+# X[-5], and X[4] - X[-4]...
+#
+# Another notable optimization is aggressive stack frame compression
+# aiming to minimize amount of 9-byte instructions...
+#
+# Yet another notable optimization is "jumping" $B variable. It means
+# that there is no register permanently allocated for $B value. This
+# allowed to eliminate one instruction from body_20_39...
+#
+my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
+my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
+my @V=($A,$B,$C,$D,$E);
+my $j=0;			# hash round
+my @T=($T,$tmp1);
+my $inp;
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+&function_begin("_sha1_block_data_order_ssse3");
+	&call	(&label("pic_point"));	# make it PIC!
+	&set_label("pic_point");
+	&blindpop($tmp1);
+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("ssse3_shortcut");
+
+	&movdqa	(@X[3],&QWP(0,$tmp1));		# K_00_19
+	&movdqa	(@X[4],&QWP(16,$tmp1));		# K_20_39
+	&movdqa	(@X[5],&QWP(32,$tmp1));		# K_40_59
+	&movdqa	(@X[6],&QWP(48,$tmp1));		# K_60_79
+	&movdqa	(@X[2],&QWP(64,$tmp1));		# pbswap mask
+
+	&mov	($E,&wparam(0));		# load argument block
+	&mov	($inp=@T[1],&wparam(1));
+	&mov	($D,&wparam(2));
+	&mov	(@T[0],"esp");
+
+	# stack frame layout
+	#
+	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
+	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
+	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
+	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
+	#
+	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
+	#	X[4]	X[5]	X[6]	X[7]
+	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
+	#
+	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
+	#	K_40_59	K_40_59	K_40_59	K_40_59
+	#	K_60_79	K_60_79	K_60_79	K_60_79
+	#	K_00_19	K_00_19	K_00_19	K_00_19
+	#	pbswap mask
+	#
+	# +192	ctx				# argument block
+	# +196	inp
+	# +200	end
+	# +204	esp
+	&sub	("esp",208);
+	&and	("esp",-64);
+
+	&movdqa	(&QWP(112+0,"esp"),@X[4]);	# copy constants
+	&movdqa	(&QWP(112+16,"esp"),@X[5]);
+	&movdqa	(&QWP(112+32,"esp"),@X[6]);
+	&shl	($D,6);				# len*64
+	&movdqa	(&QWP(112+48,"esp"),@X[3]);
+	&add	($D,$inp);			# end of input
+	&movdqa	(&QWP(112+64,"esp"),@X[2]);
+	&add	($inp,64);
+	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&mov	(&DWP(192+8,"esp"),$D);
+	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp
+
+	&mov	($A,&DWP(0,$E));		# load context
+	&mov	($B,&DWP(4,$E));
+	&mov	($C,&DWP(8,$E));
+	&mov	($D,&DWP(12,$E));
+	&mov	($E,&DWP(16,$E));
+	&mov	(@T[0],$B);			# magic seed
+
+	&movdqu	(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
+	&movdqu	(@X[-3&7],&QWP(-48,$inp));
+	&movdqu	(@X[-2&7],&QWP(-32,$inp));
+	&movdqu	(@X[-1&7],&QWP(-16,$inp));
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&pshufb	(@X[-3&7],@X[2]);
+	&pshufb	(@X[-2&7],@X[2]);
+	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+	&pshufb	(@X[-1&7],@X[2]);
+	&paddd	(@X[-4&7],@X[3]);		# add K_00_19
+	&paddd	(@X[-3&7],@X[3]);
+	&paddd	(@X[-2&7],@X[3]);
+	&movdqa	(&QWP(0,"esp"),@X[-4&7]);	# X[]+K xfer to IALU
+	&psubd	(@X[-4&7],@X[3]);		# restore X[]
+	&movdqa	(&QWP(0+16,"esp"),@X[-3&7]);
+	&psubd	(@X[-3&7],@X[3]);
+	&movdqa	(&QWP(0+32,"esp"),@X[-2&7]);
+	&psubd	(@X[-2&7],@X[3]);
+	&movdqa	(@X[0],@X[-3&7]);
+	&jmp	(&label("loop"));
+
+######################################################################
+# SSE instruction sequence is first broken to groups of indepentent
+# instructions, independent in respect to their inputs and shifter
+# (not all architectures have more than one). Then IALU instructions
+# are "knitted in" between the SSE groups. Distance is maintained for
+# SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer
+# [which allegedly also implements SSSE3]...
+#
+# Temporary registers usage. X[2] is volatile at the entry and at the
+# end is restored from backtrace ring buffer. X[3] is expected to
+# contain current K_XX_XX constant and is used to caclulate X[-1]+K
+# from previous round, it becomes volatile the moment the value is
+# saved to stack for transfer to IALU. X[4] becomes volatile whenever
+# X[-4] is accumulated and offloaded to backtrace ring buffer, at the
+# end it is loaded with next K_XX_XX [which becomes X[3] in next
+# round]...
+#
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	&movdqa	(@X[2],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &paddd	(@X[3],@X[-1&7]);
+	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psrldq	(@X[2],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[2],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&movdqa	(@X[4],@X[0]);
+	&movdqa	(@X[2],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslldq	(@X[4],12);		# "X[0]"<<96, extract one dword
+	&paddd	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@X[2],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@X[3],@X[4]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@X[4],30);
+	&por	(@X[0],@X[2]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslld	(@X[3],2);
+	&pxor	(@X[0],@X[4]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@X[3]);		# "X[0]"^=("X[0]"<<96)<<<2
+	  &movdqa	(@X[1],@X[-2&7])	if ($Xi<7);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@X[2],@X[-1&7])	if ($Xi==8);
+	 eval(shift(@insns));		# body_20_39
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	&palignr(@X[2],@X[-2&7],8);	# compose "X[-6]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	  &movdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 if ($Xi%5) {
+	  &movdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
+	 } else {			# ... or load next one
+	  &movdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
+	 }
+	  &paddd	(@X[3],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@X[2]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&movdqa	(@X[2],@X[0]);
+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pslld	(@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	&psrld	(@X[2],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&por	(@X[0],@X[2]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &movdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	  &movdqa	(@X[3],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &paddd	(@X[3],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&mov	($inp=@T[1],&DWP(192+4,"esp"));
+	&cmp	($inp,&DWP(192+8,"esp"));
+	&je	(&label("done"));
+
+	&movdqa	(@X[3],&QWP(112+48,"esp"));	# K_00_19
+	&movdqa	(@X[2],&QWP(112+64,"esp"));	# pbswap mask
+	&movdqu	(@X[-4&7],&QWP(0,$inp));	# load input
+	&movdqu	(@X[-3&7],&QWP(16,$inp));
+	&movdqu	(@X[-2&7],&QWP(32,$inp));
+	&movdqu	(@X[-1&7],&QWP(48,$inp));
+	&add	($inp,64);
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&movdqa	(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+
+  $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pshufb	(@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&paddd	(@X[($Xi-4)&7],@X[3]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psubd	(@X[($Xi-4)&7],@X[3]);
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,&DWP(4*($j&15),"esp"));',	# X[]+K xfer
+	'&xor	($c,$d);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&xor	($c,$d);',	# restore $c
+	'&xor	(@T[0],$d);',
+	'&add	($e,$a);',
+	'&$_ror	($b,$j?7:2);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_20_39 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
+	'&xor	(@T[0],$d);',	# ($b^$d)
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&xor	(@T[0],$c);',	# ($b^$d^$c)
+	'&add	($e,$a);',
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_40_59 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&mov	(@T[1],$c);',
+	'&xor	($c,$d);',
+	'&add	($e,&DWP(4*($j++&15),"esp"));',	# X[]+K xfer
+	'&and	(@T[1],$d);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[1]);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&add	($e,@T[0]);',
+	'&xor	($c,$d);',	# restore $c
+	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+&set_label("loop",16);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	(&DWP(8,@T[1]),$C);
+	&mov	($B,@T[0]);
+	&mov	(&DWP(12,@T[1]),$D);
+	&mov	(&DWP(16,@T[1]),$E);
+	&movdqa	(@X[0],@X[-3&7]);
+
+	&jmp	(&label("loop"));
+
+&set_label("done",16);		$j=$saved_j; @V=@saved_V;
+
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	(&DWP(8,@T[1]),$C);
+	&mov	(&DWP(12,@T[1]),$D);
+	&mov	(&DWP(16,@T[1]),$E);
+
+&function_end("_sha1_block_data_order_ssse3");
+
+if ($ymm) {
+my $Xi=4;			# 4xSIMD Xupdate round, start pre-seeded
+my @X=map("xmm$_",(4..7,0..3));	# pre-seeded for $Xi=4
+my @V=($A,$B,$C,$D,$E);
+my $j=0;			# hash round
+my @T=($T,$tmp1);
+my $inp;
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+&function_begin("_sha1_block_data_order_avx");
+	&call	(&label("pic_point"));	# make it PIC!
+	&set_label("pic_point");
+	&blindpop($tmp1);
+	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
+&set_label("avx_shortcut");
+	&vzeroall();
+
+	&vmovdqa(@X[3],&QWP(0,$tmp1));		# K_00_19
+	&vmovdqa(@X[4],&QWP(16,$tmp1));		# K_20_39
+	&vmovdqa(@X[5],&QWP(32,$tmp1));		# K_40_59
+	&vmovdqa(@X[6],&QWP(48,$tmp1));		# K_60_79
+	&vmovdqa(@X[2],&QWP(64,$tmp1));		# pbswap mask
+
+	&mov	($E,&wparam(0));		# load argument block
+	&mov	($inp=@T[1],&wparam(1));
+	&mov	($D,&wparam(2));
+	&mov	(@T[0],"esp");
+
+	# stack frame layout
+	#
+	# +0	X[0]+K	X[1]+K	X[2]+K	X[3]+K	# XMM->IALU xfer area
+	#	X[4]+K	X[5]+K	X[6]+K	X[7]+K
+	#	X[8]+K	X[9]+K	X[10]+K	X[11]+K
+	#	X[12]+K	X[13]+K	X[14]+K	X[15]+K
+	#
+	# +64	X[0]	X[1]	X[2]	X[3]	# XMM->XMM backtrace area
+	#	X[4]	X[5]	X[6]	X[7]
+	#	X[8]	X[9]	X[10]	X[11]	# even borrowed for K_00_19
+	#
+	# +112	K_20_39	K_20_39	K_20_39	K_20_39	# constants
+	#	K_40_59	K_40_59	K_40_59	K_40_59
+	#	K_60_79	K_60_79	K_60_79	K_60_79
+	#	K_00_19	K_00_19	K_00_19	K_00_19
+	#	pbswap mask
+	#
+	# +192	ctx				# argument block
+	# +196	inp
+	# +200	end
+	# +204	esp
+	&sub	("esp",208);
+	&and	("esp",-64);
+
+	&vmovdqa(&QWP(112+0,"esp"),@X[4]);	# copy constants
+	&vmovdqa(&QWP(112+16,"esp"),@X[5]);
+	&vmovdqa(&QWP(112+32,"esp"),@X[6]);
+	&shl	($D,6);				# len*64
+	&vmovdqa(&QWP(112+48,"esp"),@X[3]);
+	&add	($D,$inp);			# end of input
+	&vmovdqa(&QWP(112+64,"esp"),@X[2]);
+	&add	($inp,64);
+	&mov	(&DWP(192+0,"esp"),$E);		# save argument block
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&mov	(&DWP(192+8,"esp"),$D);
+	&mov	(&DWP(192+12,"esp"),@T[0]);	# save original %esp
+
+	&mov	($A,&DWP(0,$E));		# load context
+	&mov	($B,&DWP(4,$E));
+	&mov	($C,&DWP(8,$E));
+	&mov	($D,&DWP(12,$E));
+	&mov	($E,&DWP(16,$E));
+	&mov	(@T[0],$B);			# magic seed
+
+	&vmovdqu(@X[-4&7],&QWP(-64,$inp));	# load input to %xmm[0-3]
+	&vmovdqu(@X[-3&7],&QWP(-48,$inp));
+	&vmovdqu(@X[-2&7],&QWP(-32,$inp));
+	&vmovdqu(@X[-1&7],&QWP(-16,$inp));
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
+	&vpshufb(@X[-3&7],@X[-3&7],@X[2]);
+	&vpshufb(@X[-2&7],@X[-2&7],@X[2]);
+	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+	&vpshufb(@X[-1&7],@X[-1&7],@X[2]);
+	&vpaddd	(@X[0],@X[-4&7],@X[3]);		# add K_00_19
+	&vpaddd	(@X[1],@X[-3&7],@X[3]);
+	&vpaddd	(@X[2],@X[-2&7],@X[3]);
+	&vmovdqa(&QWP(0,"esp"),@X[0]);		# X[]+K xfer to IALU
+	&vmovdqa(&QWP(0+16,"esp"),@X[1]);
+	&vmovdqa(&QWP(0+32,"esp"),@X[2]);
+	&jmp	(&label("loop"));
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
+	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpsrldq(@X[2],@X[-1&7],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[2],@X[2],@X[-2&7]);		# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@X[2],@X[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslldq(@X[4],@X[0],12);		# "X[0]"<<96, extract one dword
+	&vpaddd	(@X[0],@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@X[3],@X[4],30);
+	&vpor	(@X[0],@X[0],@X[2]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslld	(@X[4],@X[4],2);
+	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[3]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@X[4]);		# "X[0]"^=("X[0]"<<96)<<<2
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(@X[4],&QWP(112-16+16*(($Xi)/5),"esp"));	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&vpalignr(@X[2],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
+	&vpxor	(@X[0],@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpxor	(@X[0],@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	  &vmovdqa	(&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);	# save X[] to backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 if ($Xi%5) {
+	  &vmovdqa	(@X[4],@X[3]);	# "perpetuate" K_XX_XX...
+	 } else {			# ... or load next one
+	  &vmovdqa	(@X[4],&QWP(112-16+16*($Xi/5),"esp"));
+	 }
+	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@X[2]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpsrld	(@X[2],@X[0],30);
+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpslld	(@X[0],@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpor	(@X[0],@X[0],@X[2]);	# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &vmovdqa	(@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19);	# restore X[] from backtrace buffer
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &vpaddd	(@X[3],@X[3],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vmovdqa	(&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&mov	($inp=@T[1],&DWP(192+4,"esp"));
+	&cmp	($inp,&DWP(192+8,"esp"));
+	&je	(&label("done"));
+
+	&vmovdqa(@X[3],&QWP(112+48,"esp"));	# K_00_19
+	&vmovdqa(@X[2],&QWP(112+64,"esp"));	# pbswap mask
+	&vmovdqu(@X[-4&7],&QWP(0,$inp));	# load input
+	&vmovdqu(@X[-3&7],&QWP(16,$inp));
+	&vmovdqu(@X[-2&7],&QWP(32,$inp));
+	&vmovdqu(@X[-1&7],&QWP(48,$inp));
+	&add	($inp,64);
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);		# byte swap
+	&mov	(&DWP(192+4,"esp"),$inp);
+	&vmovdqa(&QWP(112-16,"esp"),@X[3]);	# borrow last backtrace slot
+
+  $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpshufb	(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@X[3]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vmovdqa	(&QWP(0+16*$Xi,"esp"),@X[$Xi&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+&set_label("loop",16);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	(&DWP(8,@T[1]),$C);
+	&mov	($B,@T[0]);
+	&mov	(&DWP(12,@T[1]),$D);
+	&mov	(&DWP(16,@T[1]),$E);
+
+	&jmp	(&label("loop"));
+
+&set_label("done",16);		$j=$saved_j; @V=@saved_V;
+
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+
+	&vzeroall();
+
+	&mov	(@T[1],&DWP(192,"esp"));	# update context
+	&add	($A,&DWP(0,@T[1]));
+	&mov	("esp",&DWP(192+12,"esp"));	# restore %esp
+	&add	(@T[0],&DWP(4,@T[1]));		# $b
+	&add	($C,&DWP(8,@T[1]));
+	&mov	(&DWP(0,@T[1]),$A);
+	&add	($D,&DWP(12,@T[1]));
+	&mov	(&DWP(4,@T[1]),@T[0]);
+	&add	($E,&DWP(16,@T[1]));
+	&mov	(&DWP(8,@T[1]),$C);
+	&mov	(&DWP(12,@T[1]),$D);
+	&mov	(&DWP(16,@T[1]),$E);
+&function_end("_sha1_block_data_order_avx");
+}
+&set_label("K_XX_XX",64);
+&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999);	# K_00_19
+&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1);	# K_20_39
+&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc);	# K_40_59
+&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6);	# K_60_79
+&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# pbswap mask
+}
 &asciz("SHA1 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
 
 &asm_finish();
diff --git a/src/lib/libcrypto/sha/asm/sha1-alpha.pl b/src/lib/libcrypto/sha/asm/sha1-alpha.pl
new file mode 100644
index 0000000000..6c4b9251fd
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-alpha.pl
@@ -0,0 +1,322 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for Alpha.
+
+# On 21264 performance is 33% better than code generated by vendor
+# compiler, and 75% better than GCC [3.4], and in absolute terms is
+# 8.7 cycles per processed byte. Implementation features vectorized
+# byte swap, but not Xupdate.
+
+@X=(	"\$0",	"\$1",	"\$2",	"\$3",	"\$4",	"\$5",	"\$6",	"\$7",
+	"\$8",	"\$9",	"\$10",	"\$11",	"\$12",	"\$13",	"\$14",	"\$15");
+$ctx="a0";	# $16
+$inp="a1";
+$num="a2";
+$A="a3";
+$B="a4";	# 20
+$C="a5";
+$D="t8";
+$E="t9";	@V=($A,$B,$C,$D,$E);
+$t0="t10";	# 24
+$t1="t11";
+$t2="ra";
+$t3="t12";
+$K="AT";	# 28
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+	ldq_u	@X[0],0+0($inp)
+	ldq_u	@X[1],0+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<14);
+	ldq_u	@X[$i+2],($i+2)*4+0($inp)
+	ldq_u	@X[$i+3],($i+2)*4+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<15);
+	extql	@X[$i],$inp,@X[$i]
+	extqh	@X[$i+1],$inp,@X[$i+1]
+
+	or	@X[$i+1],@X[$i],@X[$i]	# pair of 32-bit values are fetched
+
+	srl	@X[$i],24,$t0		# vectorized byte swap
+	srl	@X[$i],8,$t2
+
+	sll	@X[$i],8,$t3
+	sll	@X[$i],24,@X[$i]
+	zapnot	$t0,0x11,$t0
+	zapnot	$t2,0x22,$t2
+
+	zapnot	@X[$i],0x88,@X[$i]
+	or	$t0,$t2,$t0
+	zapnot	$t3,0x44,$t3
+	sll	$a,5,$t1
+
+	or	@X[$i],$t0,@X[$i]
+	addl	$K,$e,$e
+	and	$b,$c,$t2
+	zapnot	$a,0xf,$a
+
+	or	@X[$i],$t3,@X[$i]
+	srl	$a,27,$t0
+	bic	$d,$b,$t3
+	sll	$b,30,$b
+
+	extll	@X[$i],4,@X[$i+1]	# extract upper half
+	or	$t2,$t3,$t2
+	addl	@X[$i],$e,$e
+
+	addl	$t1,$e,$e
+	srl	$b,32,$t3
+	zapnot	@X[$i],0xf,@X[$i]
+
+	addl	$t0,$e,$e
+	addl	$t2,$e,$e
+	or	$t3,$b,$b
+___
+$code.=<<___ if (($i&1) && $i<15);
+	sll	$a,5,$t1
+	addl	$K,$e,$e
+	and	$b,$c,$t2
+	zapnot	$a,0xf,$a
+
+	srl	$a,27,$t0
+	addl	@X[$i%16],$e,$e
+	bic	$d,$b,$t3
+	sll	$b,30,$b
+
+	or	$t2,$t3,$t2
+	addl	$t1,$e,$e
+	srl	$b,32,$t3
+	zapnot	@X[$i],0xf,@X[$i]
+
+	addl	$t0,$e,$e
+	addl	$t2,$e,$e
+	or	$t3,$b,$b
+___
+$code.=<<___ if ($i>=15);	# with forward Xupdate
+	sll	$a,5,$t1
+	addl	$K,$e,$e
+	and	$b,$c,$t2
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+	zapnot	$a,0xf,$a
+	addl	@X[$i%16],$e,$e
+	bic	$d,$b,$t3
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+	srl	$a,27,$t0
+	addl	$t1,$e,$e
+	or	$t2,$t3,$t2
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+	sll	$b,30,$b
+	addl	$t0,$e,$e
+	srl	@X[$j%16],31,$t1
+
+	addl	$t2,$e,$e
+	srl	$b,32,$t3
+	addl	@X[$j%16],@X[$j%16],@X[$j%16]
+
+	or	$t3,$b,$b
+	zapnot	@X[$i%16],0xf,@X[$i%16]
+	or	$t1,@X[$j%16],@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);	# with forward Xupdate
+	sll	$a,5,$t1
+	addl	$K,$e,$e
+	zapnot	$a,0xf,$a
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+	sll	$b,30,$t3
+	addl	$t1,$e,$e
+	xor	$b,$c,$t2
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+	srl	$b,2,$b
+	addl	@X[$i%16],$e,$e
+	xor	$d,$t2,$t2
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+	srl	@X[$j%16],31,$t1
+	addl	$t2,$e,$e
+	srl	$a,27,$t0
+	addl	@X[$j%16],@X[$j%16],@X[$j%16]
+
+	or	$t3,$b,$b
+	addl	$t0,$e,$e
+	or	$t1,@X[$j%16],@X[$j%16]
+___
+$code.=<<___ if ($i<77);
+	zapnot	@X[$i%16],0xf,@X[$i%16]
+___
+$code.=<<___ if ($i==79);	# with context fetch
+	sll	$a,5,$t1
+	addl	$K,$e,$e
+	zapnot	$a,0xf,$a
+	ldl	@X[0],0($ctx)
+
+	sll	$b,30,$t3
+	addl	$t1,$e,$e
+	xor	$b,$c,$t2
+	ldl	@X[1],4($ctx)
+
+	srl	$b,2,$b
+	addl	@X[$i%16],$e,$e
+	xor	$d,$t2,$t2
+	ldl	@X[2],8($ctx)
+
+	srl	$a,27,$t0
+	addl	$t2,$e,$e
+	ldl	@X[3],12($ctx)
+
+	or	$t3,$b,$b
+	addl	$t0,$e,$e
+	ldl	@X[4],16($ctx)
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;	# with forward Xupdate
+	sll	$a,5,$t1
+	addl	$K,$e,$e
+	zapnot	$a,0xf,$a
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+	srl	$a,27,$t0
+	and	$b,$c,$t2
+	and	$b,$d,$t3
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+	sll	$b,30,$b
+	addl	$t1,$e,$e
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+	srl	@X[$j%16],31,$t1
+	addl	$t0,$e,$e
+	or	$t2,$t3,$t2
+	and	$c,$d,$t3
+
+	or	$t2,$t3,$t2
+	srl	$b,32,$t3
+	addl	@X[$i%16],$e,$e
+	addl	@X[$j%16],@X[$j%16],@X[$j%16]
+
+	or	$t3,$b,$b
+	addl	$t2,$e,$e
+	or	$t1,@X[$j%16],@X[$j%16]
+	zapnot	@X[$i%16],0xf,@X[$i%16]
+___
+}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set	noat
+.set	noreorder
+.globl	sha1_block_data_order
+.align	5
+.ent	sha1_block_data_order
+sha1_block_data_order:
+	lda	sp,-64(sp)
+	stq	ra,0(sp)
+	stq	s0,8(sp)
+	stq	s1,16(sp)
+	stq	s2,24(sp)
+	stq	s3,32(sp)
+	stq	s4,40(sp)
+	stq	s5,48(sp)
+	stq	fp,56(sp)
+	.mask	0x0400fe00,-64
+	.frame	sp,64,ra
+	.prologue 0
+
+	ldl	$A,0($ctx)
+	ldl	$B,4($ctx)
+	sll	$num,6,$num
+	ldl	$C,8($ctx)
+	ldl	$D,12($ctx)
+	ldl	$E,16($ctx)
+	addq	$inp,$num,$num
+
+.Lloop:
+	.set	noreorder
+	ldah	$K,23170(zero)
+	zapnot	$B,0xf,$B
+	lda	$K,31129($K)	# K_00_19
+___
+for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	ldah	$K,28378(zero)
+	lda	$K,-5215($K)	# K_20_39
+___
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	ldah	$K,-28900(zero)
+	lda	$K,-17188($K)	# K_40_59
+___
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	ldah	$K,-13725(zero)
+	lda	$K,-15914($K)	# K_60_79
+___
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	addl	@X[0],$A,$A
+	addl	@X[1],$B,$B
+	addl	@X[2],$C,$C
+	addl	@X[3],$D,$D
+	addl	@X[4],$E,$E
+	stl	$A,0($ctx)
+	stl	$B,4($ctx)
+	addq	$inp,64,$inp
+	stl	$C,8($ctx)
+	stl	$D,12($ctx)
+	stl	$E,16($ctx)
+	cmpult	$inp,$num,$t1
+	bne	$t1,.Lloop
+
+	.set	noreorder
+	ldq	ra,0(sp)
+	ldq	s0,8(sp)
+	ldq	s1,16(sp)
+	ldq	s2,24(sp)
+	ldq	s3,32(sp)
+	ldq	s4,40(sp)
+	ldq	s5,48(sp)
+	ldq	fp,56(sp)
+	lda	sp,64(sp)
+	ret	(ra)
+.end	sha1_block_data_order
+.ascii	"SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
index 6e65fe3e01..fe8207f77f 100644
--- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl
@@ -47,6 +47,10 @@
 # Cortex A8 core and in absolute terms ~870 cycles per input block
 # [or 13.6 cycles per byte].
 
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 10%
+# improvement on Cortex A8 core and 12.2 cycles per byte.
 
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
@@ -76,31 +80,41 @@ $code.=<<___;
 	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
 	ldr	$t3,[$Xi,#2*4]
 	eor	$t0,$t0,$t1
-	eor	$t2,$t2,$t3
+	eor	$t2,$t2,$t3			@ 1 cycle stall
 	eor	$t1,$c,$d			@ F_xx_xx
 	mov	$t0,$t0,ror#31
 	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
 	eor	$t0,$t0,$t2,ror#31
+	str	$t0,[$Xi,#-4]!
 	$opt1					@ F_xx_xx
 	$opt2					@ F_xx_xx
 	add	$e,$e,$t0			@ E+=X[i]
-	str	$t0,[$Xi,#-4]!
 ___
 }
 
 sub BODY_00_15 {
 my ($a,$b,$c,$d,$e)=@_;
 $code.=<<___;
-	ldrb	$t0,[$inp],#4
-	ldrb	$t1,[$inp,#-1]
-	ldrb	$t2,[$inp,#-2]
+#if __ARM_ARCH__<7
+	ldrb	$t1,[$inp,#2]
+	ldrb	$t0,[$inp,#3]
+	ldrb	$t2,[$inp,#1]
 	add	$e,$K,$e,ror#2			@ E+=K_00_19
-	ldrb	$t3,[$inp,#-3]
+	ldrb	$t3,[$inp],#4
+	orr	$t0,$t0,$t1,lsl#8
+	eor	$t1,$c,$d			@ F_xx_xx
+	orr	$t0,$t0,$t2,lsl#16
 	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
-	orr	$t0,$t1,$t0,lsl#24
+	orr	$t0,$t0,$t3,lsl#24
+#else
+	ldr	$t0,[$inp],#4			@ handles unaligned
+	add	$e,$K,$e,ror#2			@ E+=K_00_19
 	eor	$t1,$c,$d			@ F_xx_xx
-	orr	$t0,$t0,$t2,lsl#8
-	orr	$t0,$t0,$t3,lsl#16
+	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
+#ifdef __ARMEL__
+	rev	$t0,$t0				@ byte swap
+#endif
+#endif
 	and	$t1,$b,$t1,ror#2
 	add	$e,$e,$t0			@ E+=X[i]
 	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
@@ -136,6 +150,8 @@ ___
 }
 
 $code=<<___;
+#include "arm_arch.h"
+
 .text
 
 .global	sha1_block_data_order
@@ -209,10 +225,14 @@ $code.=<<___;
 	teq	$inp,$len
 	bne	.Lloop			@ [+18], total 1307
 
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia	sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .align	2
 .LK_00_19:	.word	0x5a827999
 .LK_20_39:	.word	0x6ed9eba1
diff --git a/src/lib/libcrypto/sha/asm/sha1-ia64.pl b/src/lib/libcrypto/sha/asm/sha1-ia64.pl
index 51c4f47ecb..db28f0805a 100644
--- a/src/lib/libcrypto/sha/asm/sha1-ia64.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-ia64.pl
@@ -15,7 +15,7 @@
 # is >50% better than HP C and >2x better than gcc.
 
 $code=<<___;
-.ident  \"sha1-ia64.s, version 1.2\"
+.ident  \"sha1-ia64.s, version 1.3\"
 .ident  \"IA-64 ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>\"
 .explicit
 
@@ -26,14 +26,10 @@ if ($^O eq "hpux") {
     $ADDP="addp4";
     for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
 } else { $ADDP="add"; }
-for (@ARGV) {	$big_endian=1 if (/\-DB_ENDIAN/);
-		$big_endian=0 if (/\-DL_ENDIAN/);   }
-if (!defined($big_endian))
-	    {	$big_endian=(unpack('L',pack('N',1))==1);   }
 
 #$human=1;
 if ($human) {	# useful for visual code auditing...
-	($A,$B,$C,$D,$E,$T)   = ("A","B","C","D","E","T");
+	($A,$B,$C,$D,$E)   = ("A","B","C","D","E");
 	($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4");
 	($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
 	    (	"K_00_19","K_20_39","K_40_59","K_60_79"	);
@@ -41,47 +37,50 @@ if ($human) {	# useful for visual code auditing...
 		"X8", "X9","X10","X11","X12","X13","X14","X15"	);
 }
 else {
-	($A,$B,$C,$D,$E,$T)   = ("loc0","loc1","loc2","loc3","loc4","loc5");
-	($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10");
+	($A,$B,$C,$D,$E)   =    ("loc0","loc1","loc2","loc3","loc4");
+	($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9");
 	($K_00_19, $K_20_39, $K_40_59, $K_60_79) =
-	    (	"r14", "r15", "loc11", "loc12"	);
+	    (	"r14", "r15", "loc10", "loc11"	);
 	@X= (	"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
 		"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"	);
 }
 
 sub BODY_00_15 {
 local	*code=shift;
-local	($i,$a,$b,$c,$d,$e,$f)=@_;
+my	($i,$a,$b,$c,$d,$e)=@_;
+my	$j=$i+1;
+my	$Xn=@X[$j%16];
 
 $code.=<<___ if ($i==0);
-{ .mmi;	ld1	$X[$i&0xf]=[inp],2	    // MSB
+{ .mmi;	ld1	$X[$i]=[inp],2		    // MSB
 	ld1	tmp2=[tmp3],2		};;
 { .mmi;	ld1	tmp0=[inp],2
 	ld1	tmp4=[tmp3],2		    // LSB
-	dep	$X[$i&0xf]=$X[$i&0xf],tmp2,8,8	};;
+	dep	$X[$i]=$X[$i],tmp2,8,8	};;
 ___
 if ($i<15) {
 	$code.=<<___;
-{ .mmi;	ld1	$X[($i+1)&0xf]=[inp],2	    // +1
+{ .mmi;	ld1	$Xn=[inp],2		    // forward Xload
+	nop.m	0x0
 	dep	tmp1=tmp0,tmp4,8,8	};;
-{ .mmi;	ld1	tmp2=[tmp3],2		    // +1
+{ .mmi;	ld1	tmp2=[tmp3],2		    // forward Xload
 	and	tmp4=$c,$b
-	dep	$X[$i&0xf]=$X[$i&0xf],tmp1,16,16	} //;;
-{ .mmi;	andcm	tmp1=$d,$b
-	add	tmp0=$e,$K_00_19
+	dep	$X[$i]=$X[$i],tmp1,16,16} //;;
+{ .mmi;	add	$e=$e,$K_00_19		    // e+=K_00_19
+	andcm	tmp1=$d,$b
 	dep.z	tmp5=$a,5,27		};; // a<<5
-{ .mmi;	or	tmp4=tmp4,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
-	add	$f=tmp0,$X[$i&0xf]	    // f=xi+e+K_00_19
+{ .mmi;	add	$e=$e,$X[$i]		    // e+=Xload
+	or	tmp4=tmp4,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
 	extr.u	tmp1=$a,27,5		};; // a>>27
-{ .mmi;	ld1	tmp0=[inp],2		    // +1
-	add	$f=$f,tmp4		    // f+=F_00_19(b,c,d)
+{ .mmi;	ld1	tmp0=[inp],2		    // forward Xload
+	add	$e=$e,tmp4		    // e+=F_00_19(b,c,d)
 	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
-{ .mmi;	ld1	tmp4=[tmp3],2		    // +1
+{ .mmi;	ld1	tmp4=[tmp3],2		    // forward Xload
 	or	tmp5=tmp1,tmp5		    // ROTATE(a,5)
 	mux2	tmp6=$a,0x44		};; // see b in next iteration
-{ .mii;	add	$f=$f,tmp5		    // f+=ROTATE(a,5)
-	dep	$X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8	// +1
-	mux2	$X[$i&0xf]=$X[$i&0xf],0x44	} //;;
+{ .mii;	add	$e=$e,tmp5		    // e+=ROTATE(a,5)
+	dep	$Xn=$Xn,tmp2,8,8	    // forward Xload
+	mux2	$X[$i]=$X[$i],0x44	} //;;
 
 ___
 	}
@@ -89,24 +88,24 @@ else	{
 	$code.=<<___;
 { .mii;	and	tmp3=$c,$b
 	dep	tmp1=tmp0,tmp4,8,8;;
-	dep	$X[$i&0xf]=$X[$i&0xf],tmp1,16,16	} //;;
-{ .mmi;	andcm	tmp1=$d,$b
-	add	tmp0=$e,$K_00_19
+	dep	$X[$i]=$X[$i],tmp1,16,16} //;;
+{ .mmi;	add	$e=$e,$K_00_19		    // e+=K_00_19
+	andcm	tmp1=$d,$b
 	dep.z	tmp5=$a,5,27		};; // a<<5
-{ .mmi;	or	tmp4=tmp3,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
-	add	$f=tmp0,$X[$i&0xf]	    // f=xi+e+K_00_19
+{ .mmi;	add	$e=$e,$X[$i]		    // e+=Xupdate
+	or	tmp4=tmp3,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
 	extr.u	tmp1=$a,27,5		}   // a>>27
-{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
-	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1
+{ .mmi;	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
+	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate
 	nop.i	0			};;
-{ .mmi;	add	$f=$f,tmp4		    // f+=F_00_19(b,c,d)
-	xor	tmp2=tmp2,tmp3		    // +1
+{ .mmi;	add	$e=$e,tmp4		    // e+=F_00_19(b,c,d)
+	xor	$Xn=$Xn,tmp3		    // forward Xupdate
 	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
 { .mmi; or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
 	mux2	tmp6=$a,0x44		};; // see b in next iteration
-{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
-	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
-	mux2	$X[$i&0xf]=$X[$i&0xf],0x44  };;
+{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
+	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
+	mux2	$X[$i]=$X[$i],0x44	};;
 
 ___
 	}
@@ -114,27 +113,28 @@ ___
 
 sub BODY_16_19 {
 local	*code=shift;
-local	($i,$a,$b,$c,$d,$e,$f)=@_;
+my	($i,$a,$b,$c,$d,$e)=@_;
+my	$j=$i+1;
+my	$Xn=@X[$j%16];
 
 $code.=<<___;
-{ .mmi;	mov	$X[$i&0xf]=$f		    // Xupdate
-	and	tmp0=$c,$b
+{ .mib;	add	$e=$e,$K_00_19		    // e+=K_00_19
 	dep.z	tmp5=$a,5,27		}   // a<<5
-{ .mmi;	andcm	tmp1=$d,$b
-	add	tmp4=$e,$K_00_19	};;
-{ .mmi;	or	tmp0=tmp0,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
-	add	$f=$f,tmp4		    // f+=e+K_00_19
+{ .mib;	andcm	tmp1=$d,$b
+	and	tmp0=$c,$b		};;
+{ .mmi;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
+	or	tmp0=tmp0,tmp1		    // F_00_19(b,c,d)=(b&c)|(~b&d)
 	extr.u	tmp1=$a,27,5		}   // a>>27
-{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
-	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
+{ .mmi;	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
+	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16]	// forward Xupdate
 	nop.i	0			};;
-{ .mmi;	add	$f=$f,tmp0		    // f+=F_00_19(b,c,d)
-	xor	tmp2=tmp2,tmp3		    // +1
+{ .mmi;	add	$e=$e,tmp0		    // f+=F_00_19(b,c,d)
+	xor	$Xn=$Xn,tmp3		    // forward Xupdate
 	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
 { .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
 	mux2	tmp6=$a,0x44		};; // see b in next iteration
-{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
-	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
+{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
+	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
 	nop.i	0			};;
 
 ___
@@ -142,49 +142,47 @@ ___
 
 sub BODY_20_39 {
 local	*code=shift;
-local	($i,$a,$b,$c,$d,$e,$f,$Konst)=@_;
+my	($i,$a,$b,$c,$d,$e,$Konst)=@_;
 	$Konst = $K_20_39 if (!defined($Konst));
+my	$j=$i+1;
+my	$Xn=@X[$j%16];
 
 if ($i<79) {
 $code.=<<___;
-{ .mib;	mov	$X[$i&0xf]=$f		    // Xupdate
+{ .mib;	add	$e=$e,$Konst		    // e+=K_XX_XX
 	dep.z	tmp5=$a,5,27		}   // a<<5
 { .mib;	xor	tmp0=$c,$b
-	add	tmp4=$e,$Konst		};;
-{ .mmi;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
-	add	$f=$f,tmp4		    // f+=e+K_20_39
+	xor	$Xn=$Xn,$X[($j+2)%16]	};; // forward Xupdate
+{ .mib;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
 	extr.u	tmp1=$a,27,5		}   // a>>27
-{ .mmi;	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
-	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
-	nop.i	0			};;
-{ .mmi;	add	$f=$f,tmp0		    // f+=F_20_39(b,c,d)
-	xor	tmp2=tmp2,tmp3		    // +1
+{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
+	xor	$Xn=$Xn,$X[($j+8)%16]	};; // forward Xupdate
+{ .mmi;	add	$e=$e,tmp0		    // e+=F_20_39(b,c,d)
+	xor	$Xn=$Xn,$X[($j+13)%16]	    // forward Xupdate
 	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
 { .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
 	mux2	tmp6=$a,0x44		};; // see b in next iteration
-{ .mii;	add	$f=$f,tmp1		    // f+=ROTATE(a,5)
-	shrp	$e=tmp2,tmp2,31		    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
+{ .mii;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
+	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
 	nop.i	0			};;
 
 ___
 }
 else {
 $code.=<<___;
-{ .mib;	mov	$X[$i&0xf]=$f		    // Xupdate
+{ .mib;	add	$e=$e,$Konst		    // e+=K_60_79
 	dep.z	tmp5=$a,5,27		}   // a<<5
 { .mib;	xor	tmp0=$c,$b
-	add	tmp4=$e,$Konst		};;
-{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
-	extr.u	tmp1=$a,27,5		}   // a>>27
-{ .mib;	add	$f=$f,tmp4		    // f+=e+K_20_39
 	add	$h1=$h1,$a		};; // wrap up
-{ .mmi;	add	$f=$f,tmp0		    // f+=F_20_39(b,c,d)
-	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30) ;;?
-{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
+{ .mib;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
+	extr.u	tmp1=$a,27,5		}   // a>>27
+{ .mib;	xor	tmp0=tmp0,$d		    // F_20_39(b,c,d)=b^c^d
 	add	$h3=$h3,$c		};; // wrap up
-{ .mib;	add	tmp3=1,inp		    // used in unaligned codepath
-	add	$f=$f,tmp1		}   // f+=ROTATE(a,5)
-{ .mib;	add	$h2=$h2,$b		    // wrap up
+{ .mmi;	add	$e=$e,tmp0		    // e+=F_20_39(b,c,d)
+	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
+	shrp	$b=tmp6,tmp6,2		};; // b=ROTATE(b,30) ;;?
+{ .mmi;	add	$e=$e,tmp1		    // e+=ROTATE(a,5)
+	add	tmp3=1,inp		    // used in unaligned codepath
 	add	$h4=$h4,$d		};; // wrap up
 
 ___
@@ -193,29 +191,29 @@ ___
 
 sub BODY_40_59 {
 local	*code=shift;
-local	($i,$a,$b,$c,$d,$e,$f)=@_;
+my	($i,$a,$b,$c,$d,$e)=@_;
+my	$j=$i+1;
+my	$Xn=@X[$j%16];
 
 $code.=<<___;
-{ .mmi;	mov	$X[$i&0xf]=$f		    // Xupdate
-	and	tmp0=$c,$b
+{ .mib;	add	$e=$e,$K_40_59		    // e+=K_40_59
 	dep.z	tmp5=$a,5,27		}   // a<<5
-{ .mmi;	and	tmp1=$d,$b
-	add	tmp4=$e,$K_40_59	};;
-{ .mmi;	or	tmp0=tmp0,tmp1		    // (b&c)|(b&d)
-	add	$f=$f,tmp4		    // f+=e+K_40_59
+{ .mib;	and	tmp1=$c,$d
+	xor	tmp0=$c,$d		};;
+{ .mmi;	add	$e=$e,$X[$i%16]		    // e+=Xupdate
+	add	tmp5=tmp5,tmp1		    // a<<5+(c&d)
 	extr.u	tmp1=$a,27,5		}   // a>>27
-{ .mmi;	and	tmp4=$c,$d
-	xor	tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf]	// +1
-	xor	tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf]	// +1
-	};;
-{ .mmi;	or	tmp1=tmp1,tmp5		    // ROTATE(a,5)
-	xor	tmp2=tmp2,tmp3		    // +1
+{ .mmi;	and	tmp0=tmp0,$b
+	xor	$Xn=$Xn,$X[($j+2)%16]	    // forward Xupdate
+	xor	tmp3=$X[($j+8)%16],$X[($j+13)%16] };;	// forward Xupdate
+{ .mmi;	add	$e=$e,tmp0		    // e+=b&(c^d)
+	add	tmp5=tmp5,tmp1		    // ROTATE(a,5)+(c&d)
 	shrp	$b=tmp6,tmp6,2		}   // b=ROTATE(b,30)
-{ .mmi;	or	tmp0=tmp0,tmp4		    // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d)
+{ .mmi;	xor	$Xn=$Xn,tmp3
 	mux2	tmp6=$a,0x44		};; // see b in next iteration
-{ .mii;	add	$f=$f,tmp0		    // f+=F_40_59(b,c,d)
-	shrp	$e=tmp2,tmp2,31;;	    // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1)
-	add	$f=$f,tmp1		};; // f+=ROTATE(a,5)
+{ .mii;	add	$e=$e,tmp5		    // e+=ROTATE(a,5)+(c&d)
+	shrp	$Xn=$Xn,$Xn,31		    // ROTATE(x[0]^x[2]^x[8]^x[13],1)
+	nop.i	0x0			};;
 
 ___
 }
@@ -237,7 +235,7 @@ inp=r33;	// in1
 .align	32
 sha1_block_data_order:
 	.prologue
-{ .mmi;	alloc	tmp1=ar.pfs,3,15,0,0
+{ .mmi;	alloc	tmp1=ar.pfs,3,14,0,0
 	$ADDP	tmp0=4,ctx
 	.save	ar.lc,r3
 	mov	r3=ar.lc		}
@@ -245,8 +243,8 @@ sha1_block_data_order:
 	$ADDP	inp=0,inp
 	mov	r2=pr			};;
 tmp4=in2;
-tmp5=loc13;
-tmp6=loc14;
+tmp5=loc12;
+tmp6=loc13;
 	.body
 { .mlx;	ld4	$h0=[ctx],8
 	movl	$K_00_19=0x5a827999	}
@@ -273,7 +271,7 @@ tmp6=loc14;
 
 ___
 
-{ my $i,@V=($A,$B,$C,$D,$E,$T);
+{ my $i,@V=($A,$B,$C,$D,$E);
 
 	for($i=0;$i<16;$i++)	{ &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); }
 	for(;$i<20;$i++)	{ &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); }
@@ -281,12 +279,12 @@ ___
 	for(;$i<60;$i++)	{ &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); }
 	for(;$i<80;$i++)	{ &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); }
 
-	(($V[5] eq $D) and ($V[0] eq $E)) or die;	# double-check
+	(($V[0] eq $A) and ($V[4] eq $E)) or die;	# double-check
 }
 
 $code.=<<___;
-{ .mmb;	add	$h0=$h0,$E
-	nop.m	0
+{ .mmb;	add	$h0=$h0,$A
+	add	$h2=$h2,$C
 	br.ctop.dptk.many	.Ldtop	};;
 .Ldend:
 { .mmi;	add	tmp0=4,ctx
diff --git a/src/lib/libcrypto/sha/asm/sha1-mips.pl b/src/lib/libcrypto/sha/asm/sha1-mips.pl
new file mode 100644
index 0000000000..f1a702f38f
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-mips.pl
@@ -0,0 +1,354 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for MIPS.
+
+# Performance improvement is 30% on unaligned input. The "secret" is
+# to deploy lwl/lwr pair to load unaligned input. One could have
+# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32-
+# compatible subroutine. There is room for minor optimization on
+# little-endian platforms...
+
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp;
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+#   old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+	$PTR_ADD="dadd";	# incidentally works even on n32
+	$PTR_SUB="dsub";	# incidentally works even on n32
+	$REG_S="sd";
+	$REG_L="ld";
+	$PTR_SLL="dsll";	# incidentally works even on n32
+	$SZREG=8;
+} else {
+	$PTR_ADD="add";
+	$PTR_SUB="sub";
+	$REG_S="sw";
+	$REG_L="lw";
+	$PTR_SLL="sll";
+	$SZREG=4;
+}
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+
+for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);   }
+open STDOUT,">$output";
+
+if (!defined($big_endian))
+            {   $big_endian=(unpack('L',pack('N',1))==1);   }
+
+# offsets of the Most and Least Significant Bytes
+$MSB=$big_endian?0:3;
+$LSB=3&~$MSB;
+
+@X=map("\$$_",(8..23));	# a4-a7,s0-s11
+
+$ctx=$a0;
+$inp=$a1;
+$num=$a2;
+$A="\$1";
+$B="\$2";
+$C="\$3";
+$D="\$7";
+$E="\$24";	@V=($A,$B,$C,$D,$E);
+$t0="\$25";
+$t1=$num;	# $num is offloaded to stack
+$t2="\$30";	# fp
+$K="\$31";	# ra
+
+sub BODY_00_14 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___	if (!$big_endian);
+	srl	$t0,@X[$i],24	# byte swap($i)
+	srl	$t1,@X[$i],8
+	andi	$t2,@X[$i],0xFF00
+	sll	@X[$i],@X[$i],24
+	andi	$t1,0xFF00
+	sll	$t2,$t2,8
+	or	@X[$i],$t0
+	or	$t1,$t2
+	or	@X[$i],$t1
+___
+$code.=<<___;
+	 lwl	@X[$j],$j*4+$MSB($inp)
+	sll	$t0,$a,5	# $i
+	addu	$e,$K
+	 lwr	@X[$j],$j*4+$LSB($inp)
+	srl	$t1,$a,27
+	addu	$e,$t0
+	xor	$t0,$c,$d
+	addu	$e,$t1
+	sll	$t2,$b,30
+	and	$t0,$b
+	srl	$b,$b,2
+	xor	$t0,$d
+	addu	$e,@X[$i]
+	or	$b,$t2
+	addu	$e,$t0
+___
+}
+
+sub BODY_15_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+
+$code.=<<___	if (!$big_endian && $i==15);
+	srl	$t0,@X[$i],24	# byte swap($i)
+	srl	$t1,@X[$i],8
+	andi	$t2,@X[$i],0xFF00
+	sll	@X[$i],@X[$i],24
+	andi	$t1,0xFF00
+	sll	$t2,$t2,8
+	or	@X[$i],$t0
+	or	@X[$i],$t1
+	or	@X[$i],$t2
+___
+$code.=<<___;
+	 xor	@X[$j%16],@X[($j+2)%16]
+	sll	$t0,$a,5	# $i
+	addu	$e,$K
+	srl	$t1,$a,27
+	addu	$e,$t0
+	 xor	@X[$j%16],@X[($j+8)%16]
+	xor	$t0,$c,$d
+	addu	$e,$t1
+	 xor	@X[$j%16],@X[($j+13)%16]
+	sll	$t2,$b,30
+	and	$t0,$b
+	 srl	$t1,@X[$j%16],31
+	 addu	@X[$j%16],@X[$j%16]
+	srl	$b,$b,2
+	xor	$t0,$d
+	 or	@X[$j%16],$t1
+	addu	$e,@X[$i%16]
+	or	$b,$t2
+	addu	$e,$t0
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+	 xor	@X[$j%16],@X[($j+2)%16]
+	sll	$t0,$a,5	# $i
+	addu	$e,$K
+	srl	$t1,$a,27
+	addu	$e,$t0
+	 xor	@X[$j%16],@X[($j+8)%16]
+	xor	$t0,$c,$d
+	addu	$e,$t1
+	 xor	@X[$j%16],@X[($j+13)%16]
+	sll	$t2,$b,30
+	xor	$t0,$b
+	 srl	$t1,@X[$j%16],31
+	 addu	@X[$j%16],@X[$j%16]
+	srl	$b,$b,2
+	addu	$e,@X[$i%16]
+	 or	@X[$j%16],$t1
+	or	$b,$t2
+	addu	$e,$t0
+___
+$code.=<<___ if ($i==79);
+	 lw	@X[0],0($ctx)
+	sll	$t0,$a,5	# $i
+	addu	$e,$K
+	 lw	@X[1],4($ctx)
+	srl	$t1,$a,27
+	addu	$e,$t0
+	 lw	@X[2],8($ctx)
+	xor	$t0,$c,$d
+	addu	$e,$t1
+	 lw	@X[3],12($ctx)
+	sll	$t2,$b,30
+	xor	$t0,$b
+	 lw	@X[4],16($ctx)
+	srl	$b,$b,2
+	addu	$e,@X[$i%16]
+	or	$b,$t2
+	addu	$e,$t0
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+	 xor	@X[$j%16],@X[($j+2)%16]
+	sll	$t0,$a,5	# $i
+	addu	$e,$K
+	srl	$t1,$a,27
+	addu	$e,$t0
+	 xor	@X[$j%16],@X[($j+8)%16]
+	and	$t0,$c,$d
+	addu	$e,$t1
+	 xor	@X[$j%16],@X[($j+13)%16]
+	sll	$t2,$b,30
+	addu	$e,$t0
+	 srl	$t1,@X[$j%16],31
+	xor	$t0,$c,$d
+	 addu	@X[$j%16],@X[$j%16]
+	and	$t0,$b
+	srl	$b,$b,2
+	 or	@X[$j%16],$t1
+	addu	$e,@X[$i%16]
+	or	$b,$t2
+	addu	$e,$t0
+___
+}
+
+$FRAMESIZE=16;	# large enough to accomodate NUBI saved registers
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+
+$code=<<___;
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+
+.set	noat
+.set	noreorder
+.align	5
+.globl	sha1_block_data_order
+.ent	sha1_block_data_order
+sha1_block_data_order:
+	.frame	$sp,$FRAMESIZE*$SZREG,$ra
+	.mask	$SAVED_REGS_MASK,-$SZREG
+	.set	noreorder
+	$PTR_SUB $sp,$FRAMESIZE*$SZREG
+	$REG_S	$ra,($FRAMESIZE-1)*$SZREG($sp)
+	$REG_S	$fp,($FRAMESIZE-2)*$SZREG($sp)
+	$REG_S	$s11,($FRAMESIZE-3)*$SZREG($sp)
+	$REG_S	$s10,($FRAMESIZE-4)*$SZREG($sp)
+	$REG_S	$s9,($FRAMESIZE-5)*$SZREG($sp)
+	$REG_S	$s8,($FRAMESIZE-6)*$SZREG($sp)
+	$REG_S	$s7,($FRAMESIZE-7)*$SZREG($sp)
+	$REG_S	$s6,($FRAMESIZE-8)*$SZREG($sp)
+	$REG_S	$s5,($FRAMESIZE-9)*$SZREG($sp)
+	$REG_S	$s4,($FRAMESIZE-10)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
+	$REG_S	$s3,($FRAMESIZE-11)*$SZREG($sp)
+	$REG_S	$s2,($FRAMESIZE-12)*$SZREG($sp)
+	$REG_S	$s1,($FRAMESIZE-13)*$SZREG($sp)
+	$REG_S	$s0,($FRAMESIZE-14)*$SZREG($sp)
+	$REG_S	$gp,($FRAMESIZE-15)*$SZREG($sp)
+___
+$code.=<<___;
+	$PTR_SLL $num,6
+	$PTR_ADD $num,$inp
+	$REG_S	$num,0($sp)
+	lw	$A,0($ctx)
+	lw	$B,4($ctx)
+	lw	$C,8($ctx)
+	lw	$D,12($ctx)
+	b	.Loop
+	lw	$E,16($ctx)
+.align	4
+.Loop:
+	.set	reorder
+	lwl	@X[0],$MSB($inp)
+	lui	$K,0x5a82
+	lwr	@X[0],$LSB($inp)
+	ori	$K,0x7999	# K_00_19
+___
+for ($i=0;$i<15;$i++)	{ &BODY_00_14($i,@V); unshift(@V,pop(@V)); }
+for (;$i<20;$i++)	{ &BODY_15_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	lui	$K,0x6ed9
+	ori	$K,0xeba1	# K_20_39
+___
+for (;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	lui	$K,0x8f1b
+	ori	$K,0xbcdc	# K_40_59
+___
+for (;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	lui	$K,0xca62
+	ori	$K,0xc1d6	# K_60_79
+___
+for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	$PTR_ADD $inp,64
+	$REG_L	$num,0($sp)
+
+	addu	$A,$X[0]
+	addu	$B,$X[1]
+	sw	$A,0($ctx)
+	addu	$C,$X[2]
+	addu	$D,$X[3]
+	sw	$B,4($ctx)
+	addu	$E,$X[4]
+	sw	$C,8($ctx)
+	sw	$D,12($ctx)
+	sw	$E,16($ctx)
+	.set	noreorder
+	bne	$inp,$num,.Loop
+	nop
+
+	.set	noreorder
+	$REG_L	$ra,($FRAMESIZE-1)*$SZREG($sp)
+	$REG_L	$fp,($FRAMESIZE-2)*$SZREG($sp)
+	$REG_L	$s11,($FRAMESIZE-3)*$SZREG($sp)
+	$REG_L	$s10,($FRAMESIZE-4)*$SZREG($sp)
+	$REG_L	$s9,($FRAMESIZE-5)*$SZREG($sp)
+	$REG_L	$s8,($FRAMESIZE-6)*$SZREG($sp)
+	$REG_L	$s7,($FRAMESIZE-7)*$SZREG($sp)
+	$REG_L	$s6,($FRAMESIZE-8)*$SZREG($sp)
+	$REG_L	$s5,($FRAMESIZE-9)*$SZREG($sp)
+	$REG_L	$s4,($FRAMESIZE-10)*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$s3,($FRAMESIZE-11)*$SZREG($sp)
+	$REG_L	$s2,($FRAMESIZE-12)*$SZREG($sp)
+	$REG_L	$s1,($FRAMESIZE-13)*$SZREG($sp)
+	$REG_L	$s0,($FRAMESIZE-14)*$SZREG($sp)
+	$REG_L	$gp,($FRAMESIZE-15)*$SZREG($sp)
+___
+$code.=<<___;
+	jr	$ra
+	$PTR_ADD $sp,$FRAMESIZE*$SZREG
+.end	sha1_block_data_order
+.rdata
+.asciiz	"SHA1 for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+___
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-parisc.pl b/src/lib/libcrypto/sha/asm/sha1-parisc.pl
new file mode 100644
index 0000000000..6d7bf495b2
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha1-parisc.pl
@@ -0,0 +1,259 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for PA-RISC.
+
+# June 2009.
+#
+# On PA-7100LC performance is >30% better than gcc 3.2 generated code
+# for aligned input and >50% better for unaligned. Compared to vendor
+# compiler on PA-8600 it's almost 60% faster in 64-bit build and just
+# few percent faster in 32-bit one (this for aligned input, data for
+# unaligned input is not available).
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+} else {
+	$LEVEL		="1.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+}
+
+$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
+				#                 [+ argument transfer]
+$ctx="%r26";		# arg0
+$inp="%r25";		# arg1
+$num="%r24";		# arg2
+
+$t0="%r28";
+$t1="%r29";
+$K="%r31";
+
+@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+    "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
+
+@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<15);
+	addl	$K,$e,$e	; $i
+	shd	$a,$a,27,$t1
+	addl	@X[$i],$e,$e
+	and	$c,$b,$t0
+	addl	$t1,$e,$e
+	andcm	$d,$b,$t1
+	shd	$b,$b,2,$b
+	or	$t1,$t0,$t0
+	addl	$t0,$e,$e
+___
+$code.=<<___ if ($i>=15);	# with forward Xupdate
+	addl	$K,$e,$e	; $i
+	shd	$a,$a,27,$t1
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]
+	addl	@X[$i%16],$e,$e
+	and	$c,$b,$t0
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+	addl	$t1,$e,$e
+	andcm	$d,$b,$t1
+	shd	$b,$b,2,$b
+	or	$t1,$t0,$t0
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+	add	$t0,$e,$e
+	shd	@X[$j%16],@X[$j%16],31,@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]	; $i
+	addl	$K,$e,$e
+	shd	$a,$a,27,$t1
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+	addl	@X[$i%16],$e,$e
+	xor	$b,$c,$t0
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+	addl	$t1,$e,$e
+	shd	$b,$b,2,$b
+	xor	$d,$t0,$t0
+	shd	@X[$j%16],@X[$j%16],31,@X[$j%16]
+	addl	$t0,$e,$e
+___
+$code.=<<___ if ($i==79);	# with context load
+	ldw	0($ctx),@X[0]	; $i
+	addl	$K,$e,$e
+	shd	$a,$a,27,$t1
+	ldw	4($ctx),@X[1]
+	addl	@X[$i%16],$e,$e
+	xor	$b,$c,$t0
+	ldw	8($ctx),@X[2]
+	addl	$t1,$e,$e
+	shd	$b,$b,2,$b
+	xor	$d,$t0,$t0
+	ldw	12($ctx),@X[3]
+	addl	$t0,$e,$e
+	ldw	16($ctx),@X[4]
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;
+	shd	$a,$a,27,$t1	; $i
+	addl	$K,$e,$e
+	xor	@X[($j+2)%16],@X[$j%16],@X[$j%16]
+	xor	$d,$c,$t0
+	addl	@X[$i%16],$e,$e
+	xor	@X[($j+8)%16],@X[$j%16],@X[$j%16]
+	and	$b,$t0,$t0
+	addl	$t1,$e,$e
+	shd	$b,$b,2,$b
+	xor	@X[($j+13)%16],@X[$j%16],@X[$j%16]
+	addl	$t0,$e,$e
+	and	$d,$c,$t1
+	shd	@X[$j%16],@X[$j%16],31,@X[$j%16]
+	addl	$t1,$e,$e
+___
+}
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.EXPORT	sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+sha1_block_data_order
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
+	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
+	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
+	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
+	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
+
+	ldw	0($ctx),$A
+	ldw	4($ctx),$B
+	ldw	8($ctx),$C
+	ldw	12($ctx),$D
+	ldw	16($ctx),$E
+
+	extru	$inp,31,2,$t0		; t0=inp&3;
+	sh3addl	$t0,%r0,$t0		; t0*=8;
+	subi	32,$t0,$t0		; t0=32-t0;
+	mtctl	$t0,%cr11		; %sar=t0;
+
+L\$oop
+	ldi	3,$t0
+	andcm	$inp,$t0,$t0		; 64-bit neutral
+___
+	for ($i=0;$i<15;$i++) {		# load input block
+	$code.="\tldw	`4*$i`($t0),@X[$i]\n";		}
+$code.=<<___;
+	cmpb,*=	$inp,$t0,L\$aligned
+	ldw	60($t0),@X[15]
+	ldw	64($t0),@X[16]
+___
+	for ($i=0;$i<16;$i++) {		# align input
+	$code.="\tvshd	@X[$i],@X[$i+1],@X[$i]\n";	}
+$code.=<<___;
+L\$aligned
+	ldil	L'0x5a827000,$K		; K_00_19
+	ldo	0x999($K),$K
+___
+for ($i=0;$i<20;$i++)   { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	ldil	L'0x6ed9e000,$K		; K_20_39
+	ldo	0xba1($K),$K
+___
+
+for (;$i<40;$i++)       { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	ldil	L'0x8f1bb000,$K		; K_40_59
+	ldo	0xcdc($K),$K
+___
+
+for (;$i<60;$i++)       { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	ldil	L'0xca62c000,$K		; K_60_79
+	ldo	0x1d6($K),$K
+___
+for (;$i<80;$i++)       { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	addl	@X[0],$A,$A
+	addl	@X[1],$B,$B
+	addl	@X[2],$C,$C
+	addl	@X[3],$D,$D
+	addl	@X[4],$E,$E
+	stw	$A,0($ctx)
+	stw	$B,4($ctx)
+	stw	$C,8($ctx)
+	stw	$D,12($ctx)
+	stw	$E,16($ctx)
+	addib,*<> -1,$num,L\$oop
+	ldo	64($inp),$inp
+
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
+	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
+	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
+	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
+	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+	.STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/,\*/,/gm if ($SIZE_T==4);
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
index dcd0fcdfcf..2140dd2f8d 100755
--- a/src/lib/libcrypto/sha/asm/sha1-ppc.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-ppc.pl
@@ -24,12 +24,14 @@ $flavour = shift;
 
 if ($flavour =~ /64/) {
 	$SIZE_T	=8;
+	$LRSAVE	=2*$SIZE_T;
 	$UCMP	="cmpld";
 	$STU	="stdu";
 	$POP	="ld";
 	$PUSH	="std";
 } elsif ($flavour =~ /32/) {
 	$SIZE_T	=4;
+	$LRSAVE	=$SIZE_T;
 	$UCMP	="cmplw";
 	$STU	="stwu";
 	$POP	="lwz";
@@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl";
 
 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
-$FRAME=24*$SIZE_T;
+$FRAME=24*$SIZE_T+64;
+$LOCALS=6*$SIZE_T;
 
 $K  ="r0";
 $sp ="r1";
@@ -162,9 +165,8 @@ $code=<<___;
 .globl	.sha1_block_data_order
 .align	4
 .sha1_block_data_order:
+	$STU	$sp,-$FRAME($sp)
 	mflr	r0
-	$STU	$sp,`-($FRAME+64)`($sp)
-	$PUSH	r0,`$FRAME-$SIZE_T*18`($sp)
 	$PUSH	r15,`$FRAME-$SIZE_T*17`($sp)
 	$PUSH	r16,`$FRAME-$SIZE_T*16`($sp)
 	$PUSH	r17,`$FRAME-$SIZE_T*15`($sp)
@@ -182,6 +184,7 @@ $code=<<___;
 	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
+	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
 	lwz	$A,0($ctx)
 	lwz	$B,4($ctx)
 	lwz	$C,8($ctx)
@@ -192,37 +195,14 @@ $code=<<___;
 Laligned:
 	mtctr	$num
 	bl	Lsha1_block_private
-Ldone:
-	$POP	r0,`$FRAME-$SIZE_T*18`($sp)
-	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
-	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
-	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
-	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
-	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
-	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
-	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
-	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
-	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
-	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
-	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
-	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
-	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
-	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
-	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
-	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
-	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
-	mtlr	r0
-	addi	$sp,$sp,`$FRAME+64`
-	blr
-___
+	b	Ldone
 
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for 64-byte input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; PowerPC specification allows an implementation to be ill-behaved
+; upon unaligned access which crosses page boundary. "Better safe
+; than sorry" principle makes me treat it specially. But I don't
+; look for particular offending word, but rather for 64-byte input
+; block which crosses the boundary. Once found that block is aligned
+; and hashed separately...
 .align	4
 Lunaligned:
 	subfic	$t1,$inp,4096
@@ -237,7 +217,7 @@ Lunaligned:
 Lcross_page:
 	li	$t1,16
 	mtctr	$t1
-	addi	r20,$sp,$FRAME	; spot below the frame
+	addi	r20,$sp,$LOCALS	; spot within the frame
 Lmemcpy:
 	lbz	r16,0($inp)
 	lbz	r17,1($inp)
@@ -251,15 +231,40 @@ Lmemcpy:
 	addi	r20,r20,4
 	bdnz	Lmemcpy
 
-	$PUSH	$inp,`$FRAME-$SIZE_T*19`($sp)
+	$PUSH	$inp,`$FRAME-$SIZE_T*18`($sp)
 	li	$t1,1
-	addi	$inp,$sp,$FRAME
+	addi	$inp,$sp,$LOCALS
 	mtctr	$t1
 	bl	Lsha1_block_private
-	$POP	$inp,`$FRAME-$SIZE_T*19`($sp)
+	$POP	$inp,`$FRAME-$SIZE_T*18`($sp)
 	addic.	$num,$num,-1
 	bne-	Lunaligned
-	b	Ldone
+
+Ldone:
+	$POP	r0,`$FRAME+$LRSAVE`($sp)
+	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
+	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
+	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
+	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
+	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
+	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
+	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
+	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
+	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
+	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
+	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
+	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
+	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
+	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
+	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
+	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
+	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
+	mtlr	r0
+	addi	$sp,$sp,$FRAME
+	blr
+	.long	0
+	.byte	0,12,4,1,0x80,18,3,0
+	.long	0
 ___
 
 # This is private block function, which uses tailored calling
@@ -309,6 +314,8 @@ $code.=<<___;
 	addi	$inp,$inp,`16*4`
 	bdnz-	Lsha1_block_private
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 ___
 $code.=<<___;
 .asciz	"SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
index 4b17848287..9193dda45e 100644
--- a/src/lib/libcrypto/sha/asm/sha1-s390x.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-s390x.pl
@@ -21,9 +21,28 @@
 # instructions to favour dual-issue z10 pipeline. On z10 hardware is
 # "only" ~2.3x faster than software.
 
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific.
+
 $kimdfunc=1;	# magic function code for kimd instruction
 
-$output=shift;
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $K_00_39="%r0"; $K=$K_00_39;
@@ -42,13 +61,14 @@ $t1="%r11";
 @X=("%r12","%r13","%r14");
 $sp="%r15";
 
-$frame=160+16*4;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*4;
 
 sub Xupdate {
 my $i=shift;
 
 $code.=<<___ if ($i==15);
-	lg	$prefetch,160($sp)	### Xupdate(16) warm-up
+	lg	$prefetch,$stdframe($sp)	### Xupdate(16) warm-up
 	lr	$X[0],$X[2]
 ___
 return if ($i&1);	# Xupdate is vectorized and executed every 2nd cycle
@@ -58,8 +78,8 @@ $code.=<<___ if ($i<16);
 ___
 $code.=<<___ if ($i>=16);
 	xgr	$X[0],$prefetch		### Xupdate($i)
-	lg	$prefetch,`160+4*(($i+2)%16)`($sp)
-	xg	$X[0],`160+4*(($i+8)%16)`($sp)
+	lg	$prefetch,`$stdframe+4*(($i+2)%16)`($sp)
+	xg	$X[0],`$stdframe+4*(($i+8)%16)`($sp)
 	xgr	$X[0],$prefetch
 	rll	$X[0],$X[0],1
 	rllg	$X[1],$X[0],32
@@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16);
 	lr	$X[2],$X[1]		# feedback
 ___
 $code.=<<___ if ($i<=70);
-	stg	$X[0],`160+4*($i%16)`($sp)
+	stg	$X[0],`$stdframe+4*($i%16)`($sp)
 ___
 unshift(@X,pop(@X));
 }
@@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc);
 	tmhl	%r0,0x4000	# check for message-security assist
 	jz	.Lsoftware
 	lghi	%r0,0
-	la	%r1,16($sp)
+	la	%r1,`2*$SIZE_T`($sp)
 	.long	0xb93e0002	# kimd %r0,%r2
-	lg	%r0,16($sp)
+	lg	%r0,`2*$SIZE_T`($sp)
 	tmhh	%r0,`0x8000>>$kimdfunc`
 	jz	.Lsoftware
 	lghi	%r0,$kimdfunc
@@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc);
 ___
 $code.=<<___;
 	lghi	%r1,-$frame
-	stg	$ctx,16($sp)
-	stmg	%r6,%r15,48($sp)
+	st${g}	$ctx,`2*$SIZE_T`($sp)
+	stm${g}	%r6,%r15,`6*$SIZE_T`($sp)
 	lgr	%r0,$sp
 	la	$sp,0(%r1,$sp)
-	stg	%r0,0($sp)
+	st${g}	%r0,0($sp)
 
 	larl	$t0,Ktable
 	llgf	$A,0($ctx)
@@ -199,7 +219,7 @@ ___
 for (;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 $code.=<<___;
 
-	lg	$ctx,`$frame+16`($sp)
+	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
 	la	$inp,64($inp)
 	al	$A,0($ctx)
 	al	$B,4($ctx)
@@ -211,13 +231,13 @@ $code.=<<___;
 	st	$C,8($ctx)
 	st	$D,12($ctx)
 	st	$E,16($ctx)
-	brct	$len,.Lloop
+	brct${g} $len,.Lloop
 
-	lmg	%r6,%r15,`$frame+48`($sp)
+	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
 	br	%r14
 .size	sha1_block_data_order,.-sha1_block_data_order
 .string	"SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.comm	OPENSSL_s390xcap_P,8,8
+.comm	OPENSSL_s390xcap_P,16,8
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
index 4edc5ea9ad..f27c1e3fb0 100755
--- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
+++ b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl
@@ -16,7 +16,7 @@
 # There was suggestion to mechanically translate 32-bit code, but I
 # dismissed it, reasoning that x86_64 offers enough register bank
 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
-# implementation:-) However! While 64-bit code does performs better
+# implementation:-) However! While 64-bit code does perform better
 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
 # x86_64 does offer larger *addressable* bank, but out-of-order core
 # reaches for even more registers through dynamic aliasing, and EM64T
@@ -29,6 +29,38 @@
 # Xeon P4	+65%		+0%		9.9
 # Core2		+60%		+10%		7.0
 
+# August 2009.
+#
+# The code was revised to minimize code size and to maximize
+# "distance" between instructions producing input to 'lea'
+# instruction and the 'lea' instruction itself, which is essential
+# for Intel Atom core.
+
+# October 2010.
+#
+# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
+# is to offload message schedule denoted by Wt in NIST specification,
+# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
+# for background and implementation details. The only difference from
+# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
+# to free temporary registers.
+
+# April 2011.
+#
+# Add AVX code path. See sha1-586.pl for further information.
+
+######################################################################
+# Current performance is summarized in following table. Numbers are
+# CPU clock cycles spent to process single byte (less is better).
+#
+#		x86_64		SSSE3		AVX
+# P4		9.8		-
+# Opteron	6.6		-
+# Core2		6.7		6.1/+10%	-
+# Atom		11.0		9.7/+13%	-
+# Westmere	7.1		5.6/+27%	-
+# Sandy Bridge	7.9		6.3/+25%	5.2/+51%
+
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -40,6 +72,16 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
+$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
+	   $1>=2.19);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
+	   $1>=2.09);
+$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
+	   $1>=10);
+
 open STDOUT,"| $^X $xlate $flavour $output";
 
 $ctx="%rdi";	# 1st arg
@@ -51,196 +93,994 @@ $ctx="%r8";
 $inp="%r9";
 $num="%r10";
 
-$xi="%eax";
-$t0="%ebx";
-$t1="%ecx";
-$A="%edx";
-$B="%esi";
-$C="%edi";
-$D="%ebp";
-$E="%r11d";
-$T="%r12d";
-
-@V=($A,$B,$C,$D,$E,$T);
+$t0="%eax";
+$t1="%ebx";
+$t2="%ecx";
+@xi=("%edx","%ebp");
+$A="%esi";
+$B="%edi";
+$C="%r11d";
+$D="%r12d";
+$E="%r13d";
 
-sub PROLOGUE {
-my $func=shift;
-$code.=<<___;
-.globl	$func
-.type	$func,\@function,3
-.align	16
-$func:
-	push	%rbx
-	push	%rbp
-	push	%r12
-	mov	%rsp,%r11
-	mov	%rdi,$ctx	# reassigned argument
-	sub	\$`8+16*4`,%rsp
-	mov	%rsi,$inp	# reassigned argument
-	and	\$-64,%rsp
-	mov	%rdx,$num	# reassigned argument
-	mov	%r11,`16*4`(%rsp)
-.Lprologue:
-
-	mov	0($ctx),$A
-	mov	4($ctx),$B
-	mov	8($ctx),$C
-	mov	12($ctx),$D
-	mov	16($ctx),$E
-___
-}
-
-sub EPILOGUE {
-my $func=shift;
-$code.=<<___;
-	mov	`16*4`(%rsp),%rsi
-	mov	(%rsi),%r12
-	mov	8(%rsi),%rbp
-	mov	16(%rsi),%rbx
-	lea	24(%rsi),%rsp
-.Lepilogue:
-	ret
-.size	$func,.-$func
-___
-}
+@V=($A,$B,$C,$D,$E);
 
 sub BODY_00_19 {
-my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
 my $j=$i+1;
 $code.=<<___ if ($i==0);
-	mov	`4*$i`($inp),$xi	
-	`"bswap	$xi"	if(!defined($host))`
-	mov	$xi,`4*$i`(%rsp)
+	mov	`4*$i`($inp),$xi[0]
+	bswap	$xi[0]
+	mov	$xi[0],`4*$i`(%rsp)
 ___
 $code.=<<___ if ($i<15);
-	lea	0x5a827999($xi,$e),$f
 	mov	$c,$t0
-	mov	`4*$j`($inp),$xi
-	mov	$a,$e
+	mov	`4*$j`($inp),$xi[1]
+	mov	$a,$t2
 	xor	$d,$t0
-	`"bswap	$xi"	if(!defined($host))`	
-	rol	\$5,$e
+	bswap	$xi[1]
+	rol	\$5,$t2
+	lea	0x5a827999($xi[0],$e),$e
 	and	$b,$t0
-	mov	$xi,`4*$j`(%rsp)
-	add	$e,$f
+	mov	$xi[1],`4*$j`(%rsp)
+	add	$t2,$e
 	xor	$d,$t0
 	rol	\$30,$b
-	add	$t0,$f
+	add	$t0,$e
 ___
 $code.=<<___ if ($i>=15);
-	lea	0x5a827999($xi,$e),$f
-	mov	`4*($j%16)`(%rsp),$xi
+	mov	`4*($j%16)`(%rsp),$xi[1]
 	mov	$c,$t0
-	mov	$a,$e
-	xor	`4*(($j+2)%16)`(%rsp),$xi
+	mov	$a,$t2
+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
 	xor	$d,$t0
-	rol	\$5,$e
-	xor	`4*(($j+8)%16)`(%rsp),$xi
+	rol	\$5,$t2
+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
 	and	$b,$t0
-	add	$e,$f
-	xor	`4*(($j+13)%16)`(%rsp),$xi
+	lea	0x5a827999($xi[0],$e),$e
+	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
 	xor	$d,$t0
+	rol	\$1,$xi[1]
+	add	$t2,$e
 	rol	\$30,$b
-	add	$t0,$f
-	rol	\$1,$xi
-	mov	$xi,`4*($j%16)`(%rsp)
+	mov	$xi[1],`4*($j%16)`(%rsp)
+	add	$t0,$e
 ___
+unshift(@xi,pop(@xi));
 }
 
 sub BODY_20_39 {
-my ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
 my $j=$i+1;
 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
 $code.=<<___ if ($i<79);
-	lea	$K($xi,$e),$f
-	mov	`4*($j%16)`(%rsp),$xi
+	mov	`4*($j%16)`(%rsp),$xi[1]
 	mov	$c,$t0
-	mov	$a,$e
-	xor	`4*(($j+2)%16)`(%rsp),$xi
+	mov	$a,$t2
+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
 	xor	$b,$t0
-	rol	\$5,$e
-	xor	`4*(($j+8)%16)`(%rsp),$xi
+	rol	\$5,$t2
+	lea	$K($xi[0],$e),$e
+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
 	xor	$d,$t0
-	add	$e,$f
-	xor	`4*(($j+13)%16)`(%rsp),$xi
+	add	$t2,$e
+	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
 	rol	\$30,$b
-	add	$t0,$f
-	rol	\$1,$xi
+	add	$t0,$e
+	rol	\$1,$xi[1]
 ___
 $code.=<<___ if ($i<76);
-	mov	$xi,`4*($j%16)`(%rsp)
+	mov	$xi[1],`4*($j%16)`(%rsp)
 ___
 $code.=<<___ if ($i==79);
-	lea	$K($xi,$e),$f
 	mov	$c,$t0
-	mov	$a,$e
+	mov	$a,$t2
 	xor	$b,$t0
-	rol	\$5,$e
+	lea	$K($xi[0],$e),$e
+	rol	\$5,$t2
 	xor	$d,$t0
-	add	$e,$f
+	add	$t2,$e
 	rol	\$30,$b
-	add	$t0,$f
+	add	$t0,$e
 ___
+unshift(@xi,pop(@xi));
 }
 
 sub BODY_40_59 {
-my ($i,$a,$b,$c,$d,$e,$f)=@_;
+my ($i,$a,$b,$c,$d,$e)=@_;
 my $j=$i+1;
 $code.=<<___;
-	lea	0x8f1bbcdc($xi,$e),$f
-	mov	`4*($j%16)`(%rsp),$xi
-	mov	$b,$t0
-	mov	$b,$t1
-	xor	`4*(($j+2)%16)`(%rsp),$xi
-	mov	$a,$e
-	and	$c,$t0
-	xor	`4*(($j+8)%16)`(%rsp),$xi
-	or	$c,$t1
-	rol	\$5,$e
-	xor	`4*(($j+13)%16)`(%rsp),$xi
-	and	$d,$t1
-	add	$e,$f
-	rol	\$1,$xi
-	or	$t1,$t0
+	mov	`4*($j%16)`(%rsp),$xi[1]
+	mov	$c,$t0
+	mov	$c,$t1
+	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
+	and	$d,$t0
+	mov	$a,$t2
+	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
+	xor	$d,$t1
+	lea	0x8f1bbcdc($xi[0],$e),$e
+	rol	\$5,$t2
+	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
+	add	$t0,$e
+	and	$b,$t1
+	rol	\$1,$xi[1]
+	add	$t1,$e
 	rol	\$30,$b
-	mov	$xi,`4*($j%16)`(%rsp)
-	add	$t0,$f
+	mov	$xi[1],`4*($j%16)`(%rsp)
+	add	$t2,$e
 ___
+unshift(@xi,pop(@xi));
 }
 
-$code=".text\n";
+$code.=<<___;
+.text
+.extern	OPENSSL_ia32cap_P
 
-&PROLOGUE("sha1_block_data_order");
-$code.=".align	4\n.Lloop:\n";
+.globl	sha1_block_data_order
+.type	sha1_block_data_order,\@function,3
+.align	16
+sha1_block_data_order:
+	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
+	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
+	test	\$`1<<9`,%r8d		# check SSSE3 bit
+	jz	.Lialu
+___
+$code.=<<___ if ($avx);
+	and	\$`1<<28`,%r8d		# mask AVX bit
+	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
+	or	%r9d,%r8d
+	cmp	\$`1<<28|1<<30`,%r8d
+	je	_avx_shortcut
+___
+$code.=<<___;
+	jmp	_ssse3_shortcut
+
+.align	16
+.Lialu:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	mov	%rsp,%r11
+	mov	%rdi,$ctx	# reassigned argument
+	sub	\$`8+16*4`,%rsp
+	mov	%rsi,$inp	# reassigned argument
+	and	\$-64,%rsp
+	mov	%rdx,$num	# reassigned argument
+	mov	%r11,`16*4`(%rsp)
+.Lprologue:
+
+	mov	0($ctx),$A
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	16($ctx),$E
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+___
 for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
 for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
 for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 $code.=<<___;
-	add	0($ctx),$E
-	add	4($ctx),$T
-	add	8($ctx),$A
-	add	12($ctx),$B
-	add	16($ctx),$C
-	mov	$E,0($ctx)
-	mov	$T,4($ctx)
-	mov	$A,8($ctx)
-	mov	$B,12($ctx)
-	mov	$C,16($ctx)
-
-	xchg	$E,$A	# mov	$E,$A
-	xchg	$T,$B	# mov	$T,$B
-	xchg	$E,$C	# mov	$A,$C
-	xchg	$T,$D	# mov	$B,$D
-			# mov	$C,$E
-	lea	`16*4`($inp),$inp
+	add	0($ctx),$A
+	add	4($ctx),$B
+	add	8($ctx),$C
+	add	12($ctx),$D
+	add	16($ctx),$E
+	mov	$A,0($ctx)
+	mov	$B,4($ctx)
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+
 	sub	\$1,$num
+	lea	`16*4`($inp),$inp
 	jnz	.Lloop
+
+	mov	`16*4`(%rsp),%rsi
+	mov	(%rsi),%r13
+	mov	8(%rsi),%r12
+	mov	16(%rsi),%rbp
+	mov	24(%rsi),%rbx
+	lea	32(%rsi),%rsp
+.Lepilogue:
+	ret
+.size	sha1_block_data_order,.-sha1_block_data_order
 ___
-&EPILOGUE("sha1_block_data_order");
+{{{
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $K_XX_XX="%r11";
+
+my $_rol=sub { &rol(@_) };
+my $_ror=sub { &ror(@_) };
+
+$code.=<<___;
+.type	sha1_block_data_order_ssse3,\@function,3
+.align	16
+sha1_block_data_order_ssse3:
+_ssse3_shortcut:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,64+0(%rsp)
+	movaps	%xmm7,64+16(%rsp)
+	movaps	%xmm8,64+32(%rsp)
+	movaps	%xmm9,64+48(%rsp)
+	movaps	%xmm10,64+64(%rsp)
+.Lprologue_ssse3:
+___
+$code.=<<___;
+	mov	%rdi,$ctx	# reassigned argument
+	mov	%rsi,$inp	# reassigned argument
+	mov	%rdx,$num	# reassigned argument
+
+	shl	\$6,$num
+	add	$inp,$num
+	lea	K_XX_XX(%rip),$K_XX_XX
+
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+
+	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
+	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	movdqu	16($inp),@X[-3&7]
+	movdqu	32($inp),@X[-2&7]
+	movdqu	48($inp),@X[-1&7]
+	pshufb	@X[2],@X[-4&7]		# byte swap
+	add	\$64,$inp
+	pshufb	@X[2],@X[-3&7]
+	pshufb	@X[2],@X[-2&7]
+	pshufb	@X[2],@X[-1&7]
+	paddd	@Tx[1],@X[-4&7]		# add K_00_19
+	paddd	@Tx[1],@X[-3&7]
+	paddd	@Tx[1],@X[-2&7]
+	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
+	psubd	@Tx[1],@X[-4&7]		# restore X[]
+	movdqa	@X[-3&7],16(%rsp)
+	psubd	@Tx[1],@X[-3&7]
+	movdqa	@X[-2&7],32(%rsp)
+	psubd	@Tx[1],@X[-2&7]
+	jmp	.Loop_ssse3
+___
+
+sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+  my $arg = pop;
+    $arg = "\$$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+}
+
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@X[0],@X[-3&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@Tx[0],@X[-1&7]);
+	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&movdqa	(@Tx[2],@X[0]);
+	&movdqa	(@Tx[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
+	&paddd	(@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(@Tx[1],@Tx[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&psrld	(@Tx[2],30);
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pslld	(@Tx[1],2);
+	&pxor	(@X[0],@Tx[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_ssse3_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
+	 eval(shift(@insns));		# body_20_39
+	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
+	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
+	if ($Xi%5) {
+	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+	} else {			# ... or load next one
+	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+	}
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&movdqa	(@Tx[0],@X[0]);
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&pslld	(@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	&psrld	(@Tx[0],30);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_ssse3_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &paddd	(@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$num);
+	&je	(".Ldone_ssse3");
+
+	unshift(@Tx,pop(@Tx));
+
+	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
+	&movdqu	(@X[-4&7],"0($inp)");		# load input
+	&movdqu	(@X[-3&7],"16($inp)");
+	&movdqu	(@X[-2&7],"32($inp)");
+	&movdqu	(@X[-1&7],"48($inp)");
+	&pshufb	(@X[-4&7],@X[2]);		# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&pshufb	(@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&paddd	(@X[($Xi-4)&7],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&psubd	(@X[($Xi-4)&7],@Tx[1]);
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_ssse3()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+sub body_00_19 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
+	'&xor	($c,$d);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&xor	($c,$d);',	# restore $c
+	'&xor	(@T[0],$d);',
+	'&add	($e,$a);',
+	'&$_ror	($b,$j?7:2);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_20_39 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
+	'&xor	(@T[0],$d);',	# ($b^$d)
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&xor	(@T[0],$c);',	# ($b^$d^$c)
+	'&add	($e,$a);',
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
+
+sub body_40_59 () {
+	(
+	'($a,$b,$c,$d,$e)=@V;'.
+	'&mov	(@T[1],$c);',
+	'&xor	($c,$d);',
+	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
+	'&and	(@T[1],$d);',
+	'&and	(@T[0],$c);',	# ($b&($c^$d))
+	'&$_ror	($b,7);',	# $b>>>2
+	'&add	($e,@T[1]);',
+	'&mov	(@T[1],$a);',	# $b in next round
+	'&$_rol	($a,5);',
+	'&add	($e,@T[0]);',
+	'&xor	($c,$d);',	# restore $c
+	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
+	);
+}
 $code.=<<___;
-.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 .align	16
+.Loop_ssse3:
+___
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_16_31(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_00_19);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_40_59);
+	&Xupdate_ssse3_32_79(\&body_20_39);
+	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+	&Xloop_ssse3(\&body_20_39);
+
+$code.=<<___;
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	jmp	.Loop_ssse3
+
+.align	16
+.Ldone_ssse3:
+___
+				$j=$saved_j; @V=@saved_V;
+
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+	&Xtail_ssse3(\&body_20_39);
+
+$code.=<<___;
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+___
+$code.=<<___ if ($win64);
+	movaps	64+0(%rsp),%xmm6
+	movaps	64+16(%rsp),%xmm7
+	movaps	64+32(%rsp),%xmm8
+	movaps	64+48(%rsp),%xmm9
+	movaps	64+64(%rsp),%xmm10
+___
+$code.=<<___;
+	lea	`64+($win64?5*16:0)`(%rsp),%rsi
+	mov	0(%rsi),%r12
+	mov	8(%rsi),%rbp
+	mov	16(%rsi),%rbx
+	lea	24(%rsi),%rsp
+.Lepilogue_ssse3:
+	ret
+.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
+___
+
+if ($avx) {
+my $Xi=4;
+my @X=map("%xmm$_",(4..7,0..3));
+my @Tx=map("%xmm$_",(8..10));
+my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
+my @T=("%esi","%edi");
+my $j=0;
+my $K_XX_XX="%r11";
+
+my $_rol=sub { &shld(@_[0],@_) };
+my $_ror=sub { &shrd(@_[0],@_) };
+
+$code.=<<___;
+.type	sha1_block_data_order_avx,\@function,3
+.align	16
+sha1_block_data_order_avx:
+_avx_shortcut:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+	movaps	%xmm6,64+0(%rsp)
+	movaps	%xmm7,64+16(%rsp)
+	movaps	%xmm8,64+32(%rsp)
+	movaps	%xmm9,64+48(%rsp)
+	movaps	%xmm10,64+64(%rsp)
+.Lprologue_avx:
+___
+$code.=<<___;
+	mov	%rdi,$ctx	# reassigned argument
+	mov	%rsi,$inp	# reassigned argument
+	mov	%rdx,$num	# reassigned argument
+	vzeroall
+
+	shl	\$6,$num
+	add	$inp,$num
+	lea	K_XX_XX(%rip),$K_XX_XX
+
+	mov	0($ctx),$A		# load context
+	mov	4($ctx),$B
+	mov	8($ctx),$C
+	mov	12($ctx),$D
+	mov	$B,@T[0]		# magic seed
+	mov	16($ctx),$E
+
+	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
+	vmovdqa	0($K_XX_XX),@Tx[1]	# K_00_19
+	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
+	vmovdqu	16($inp),@X[-3&7]
+	vmovdqu	32($inp),@X[-2&7]
+	vmovdqu	48($inp),@X[-1&7]
+	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
+	add	\$64,$inp
+	vpshufb	@X[2],@X[-3&7],@X[-3&7]
+	vpshufb	@X[2],@X[-2&7],@X[-2&7]
+	vpshufb	@X[2],@X[-1&7],@X[-1&7]
+	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
+	vpaddd	@Tx[1],@X[-3&7],@X[1]
+	vpaddd	@Tx[1],@X[-2&7],@X[2]
+	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
+	vmovdqa	@X[1],16(%rsp)
+	vmovdqa	@X[2],32(%rsp)
+	jmp	.Loop_avx
+___
+
+sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpsrldq(@Tx[0],@X[-1&7],4);	# "X[-3]", 3 dwords
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[0],@X[0],31);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
+	&vpaddd	(@X[0],@X[0],@X[0]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpsrld	(@Tx[1],@Tx[2],30);
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpslld	(@Tx[2],@Tx[2],2);
+	&vpxor	(@X[0],@X[0],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+
+	 foreach (@insns) { eval; }	# remaining instructions [if any]
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xupdate_avx_32_79()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
+  my ($a,$b,$c,$d,$e);
+
+	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
+	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
+	 eval(shift(@insns));
+	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
+	if ($Xi%5) {
+	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
+	} else {			# ... or load next one
+	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
+	}
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+
+	&vpsrld	(@Tx[0],@X[0],30);
+	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpslld	(@X[0],@X[0],2);
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# ror
+	 eval(shift(@insns));
+
+	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
+	 eval(shift(@insns));		# body_20_39
+	 eval(shift(@insns));
+	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));		# rol
+	 eval(shift(@insns));
+
+	 foreach (@insns) { eval; }	# remaining instructions
+
+  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
+		push(@Tx,shift(@Tx));
+}
+
+sub Xuplast_avx_80()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
+
+	 foreach (@insns) { eval; }		# remaining instructions
+
+	&cmp	($inp,$num);
+	&je	(".Ldone_avx");
+
+	unshift(@Tx,pop(@Tx));
+
+	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
+	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
+	&vmovdqu(@X[-4&7],"0($inp)");		# load input
+	&vmovdqu(@X[-3&7],"16($inp)");
+	&vmovdqu(@X[-2&7],"32($inp)");
+	&vmovdqu(@X[-1&7],"48($inp)");
+	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
+	&add	($inp,64);
+
+  $Xi=0;
+}
+
+sub Xloop_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
+	 eval(shift(@insns));
+	 eval(shift(@insns));
+
+	foreach (@insns) { eval; }
+  $Xi++;
+}
+
+sub Xtail_avx()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
+  my ($a,$b,$c,$d,$e);
+
+	foreach (@insns) { eval; }
+}
+
+$code.=<<___;
+.align	16
+.Loop_avx:
+___
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_16_31(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_00_19);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_40_59);
+	&Xupdate_avx_32_79(\&body_20_39);
+	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
+
+				$saved_j=$j; @saved_V=@V;
+
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+	&Xloop_avx(\&body_20_39);
+
+$code.=<<___;
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	add	12($ctx),$D
+	mov	$A,0($ctx)
+	add	16($ctx),$E
+	mov	@T[0],4($ctx)
+	mov	@T[0],$B			# magic seed
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+	jmp	.Loop_avx
+
+.align	16
+.Ldone_avx:
+___
+				$j=$saved_j; @V=@saved_V;
+
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+	&Xtail_avx(\&body_20_39);
+
+$code.=<<___;
+	vzeroall
+
+	add	0($ctx),$A			# update context
+	add	4($ctx),@T[0]
+	add	8($ctx),$C
+	mov	$A,0($ctx)
+	add	12($ctx),$D
+	mov	@T[0],4($ctx)
+	add	16($ctx),$E
+	mov	$C,8($ctx)
+	mov	$D,12($ctx)
+	mov	$E,16($ctx)
+___
+$code.=<<___ if ($win64);
+	movaps	64+0(%rsp),%xmm6
+	movaps	64+16(%rsp),%xmm7
+	movaps	64+32(%rsp),%xmm8
+	movaps	64+48(%rsp),%xmm9
+	movaps	64+64(%rsp),%xmm10
+___
+$code.=<<___;
+	lea	`64+($win64?5*16:0)`(%rsp),%rsi
+	mov	0(%rsi),%r12
+	mov	8(%rsi),%rbp
+	mov	16(%rsi),%rbx
+	lea	24(%rsi),%rsp
+.Lepilogue_avx:
+	ret
+.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
+___
+}
+$code.=<<___;
+.align	64
+K_XX_XX:
+.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
+.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
+.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
+.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
+.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
+___
+}}}
+$code.=<<___;
+.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	64
 ___
 
 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
@@ -272,25 +1112,75 @@ se_handler:
 
 	lea	.Lprologue(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip<.Lprologue
-	jb	.Lin_prologue
+	jb	.Lcommon_seh_tail
 
 	mov	152($context),%rax	# pull context->Rsp
 
 	lea	.Lepilogue(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
-	jae	.Lin_prologue
+	jae	.Lcommon_seh_tail
 
 	mov	`16*4`(%rax),%rax	# pull saved stack pointer
-	lea	24(%rax),%rax
+	lea	32(%rax),%rax
 
 	mov	-8(%rax),%rbx
 	mov	-16(%rax),%rbp
 	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
 	mov	%rbx,144($context)	# restore context->Rbx
 	mov	%rbp,160($context)	# restore context->Rbp
 	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+
+	jmp	.Lcommon_seh_tail
+.size	se_handler,.-se_handler
 
-.Lin_prologue:
+.type	ssse3_handler,\@abi-omnipotent
+.align	16
+ssse3_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	64(%rax),%rsi
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$10,%ecx
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore cotnext->R12
+
+.Lcommon_seh_tail:
 	mov	8(%rax),%rdi
 	mov	16(%rax),%rsi
 	mov	%rax,152($context)	# restore context->Rsp
@@ -328,19 +1218,38 @@ se_handler:
 	pop	%rdi
 	pop	%rsi
 	ret
-.size	se_handler,.-se_handler
+.size	ssse3_handler,.-ssse3_handler
 
 .section	.pdata
 .align	4
 	.rva	.LSEH_begin_sha1_block_data_order
 	.rva	.LSEH_end_sha1_block_data_order
 	.rva	.LSEH_info_sha1_block_data_order
-
+	.rva	.LSEH_begin_sha1_block_data_order_ssse3
+	.rva	.LSEH_end_sha1_block_data_order_ssse3
+	.rva	.LSEH_info_sha1_block_data_order_ssse3
+___
+$code.=<<___ if ($avx);
+	.rva	.LSEH_begin_sha1_block_data_order_avx
+	.rva	.LSEH_end_sha1_block_data_order_avx
+	.rva	.LSEH_info_sha1_block_data_order_avx
+___
+$code.=<<___;
 .section	.xdata
 .align	8
 .LSEH_info_sha1_block_data_order:
 	.byte	9,0,0,0
 	.rva	se_handler
+.LSEH_info_sha1_block_data_order_ssse3:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
+___
+$code.=<<___ if ($avx);
+.LSEH_info_sha1_block_data_order_avx:
+	.byte	9,0,0,0
+	.rva	ssse3_handler
+	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
 ___
 }
 
diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl
index ecc8b69c75..928ec53123 100644
--- a/src/lib/libcrypto/sha/asm/sha256-586.pl
+++ b/src/lib/libcrypto/sha/asm/sha256-586.pl
@@ -14,8 +14,8 @@
 #		Pentium	PIII	P4	AMD K8	Core2
 # gcc		46	36	41	27	26
 # icc		57	33	38	25	23	
-# x86 asm	40	30	35	20	20
-# x86_64 asm(*)	-	-	21	15.8	16.5
+# x86 asm	40	30	33	20	18
+# x86_64 asm(*)	-	-	21	16	16
 #
 # (*) x86_64 assembler performance is presented for reference
 #     purposes.
@@ -48,20 +48,19 @@ sub BODY_00_15() {
     my $in_16_63=shift;
 
 	&mov	("ecx",$E);
-	 &add	($T,&DWP(4*(8+15+16-9),"esp"))	if ($in_16_63);	# T += X[-7]
-	&ror	("ecx",6);
-	&mov	("edi",$E);
-	&ror	("edi",11);
+	 &add	($T,"edi")			if ($in_16_63);	# T += sigma1(X[-2])
+	&ror	("ecx",25-11);
 	 &mov	("esi",$Foff);
-	&xor	("ecx","edi");
-	&ror	("edi",25-11);
+	&xor	("ecx",$E);
+	&ror	("ecx",11-6);
 	 &mov	(&DWP(4*(8+15),"esp"),$T)	if ($in_16_63);	# save X[0]
-	&xor	("ecx","edi");	# Sigma1(e)
+	&xor	("ecx",$E);
+	&ror	("ecx",6);	# Sigma1(e)
 	 &mov	("edi",$Goff);
 	&add	($T,"ecx");	# T += Sigma1(e)
-	 &mov	($Eoff,$E);	# modulo-scheduled
 
 	&xor	("esi","edi");
+	 &mov	($Eoff,$E);	# modulo-scheduled
 	 &mov	("ecx",$A);
 	&and	("esi",$E);
 	 &mov	($E,$Doff);	# e becomes d, which is e in next iteration
@@ -69,14 +68,14 @@ sub BODY_00_15() {
 	 &mov	("edi",$A);
 	&add	($T,"esi");	# T += Ch(e,f,g)
 
-	&ror	("ecx",2);
+	&ror	("ecx",22-13);
 	 &add	($T,$Hoff);	# T += h
-	&ror	("edi",13);
+	&xor	("ecx",$A);
+	&ror	("ecx",13-2);
 	 &mov	("esi",$Boff);
-	&xor	("ecx","edi");
-	&ror	("edi",22-13);
+	&xor	("ecx",$A);
+	&ror	("ecx",2);	# Sigma0(a)
 	 &add	($E,$T);	# d += T
-	&xor	("ecx","edi");	# Sigma0(a)
 	 &mov	("edi",$Coff);
 
 	&add	($T,"ecx");	# T += Sigma0(a)
@@ -168,23 +167,22 @@ sub BODY_00_15() {
 &set_label("16_63",16);
 	&mov	("esi",$T);
 	 &mov	("ecx",&DWP(4*(8+15+16-14),"esp"));
-	&shr	($T,3);
-	&ror	("esi",7);
-	&xor	($T,"esi");
 	&ror	("esi",18-7);
 	 &mov	("edi","ecx");
-	&xor	($T,"esi");			# T = sigma0(X[-15])
+	&xor	("esi",$T);
+	&ror	("esi",7);
+	&shr	($T,3);
 
-	&shr	("ecx",10);
-	 &mov	("esi",&DWP(4*(8+15+16),"esp"));
-	&ror	("edi",17);
-	&xor	("ecx","edi");
 	&ror	("edi",19-17);
-	 &add	($T,"esi");			# T += X[-16]
-	&xor	("edi","ecx")			# sigma1(X[-2])
+	 &xor	($T,"esi");			# T = sigma0(X[-15])
+	&xor	("edi","ecx");
+	&ror	("edi",17);
+	&shr	("ecx",10);
+	 &add	($T,&DWP(4*(8+15+16),"esp"));	# T += X[-16]
+	&xor	("edi","ecx");			# sigma1(X[-2])
 
-	&add	($T,"edi");			# T += sigma1(X[-2])
-	# &add	($T,&DWP(4*(8+15+16-9),"esp"));	# T += X[-7], moved to BODY_00_15(1)
+	 &add	($T,&DWP(4*(8+15+16-9),"esp"));	# T += X[-7]
+	# &add	($T,"edi");			# T += sigma1(X[-2])
 	# &mov	(&DWP(4*(8+15),"esp"),$T);	# save X[0]
 
 	&BODY_00_15(1);
diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
index 492cb62bc0..9c84e8d93c 100644
--- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl
+++ b/src/lib/libcrypto/sha/asm/sha256-armv4.pl
@@ -18,11 +18,16 @@
 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
 # Cortex A8 core and ~20 cycles per processed byte.
 
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 16%
+# improvement on Cortex A8 core and ~17 cycles per processed byte.
+
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 $ctx="r0";	$t0="r0";
-$inp="r1";
+$inp="r1";	$t3="r1";
 $len="r2";	$t1="r2";
 $T1="r3";
 $A="r4";
@@ -46,6 +51,9 @@ sub BODY_00_15 {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___ if ($i<16);
+#if __ARM_ARCH__>=7
+	ldr	$T1,[$inp],#4
+#else
 	ldrb	$T1,[$inp,#3]			@ $i
 	ldrb	$t2,[$inp,#2]
 	ldrb	$t1,[$inp,#1]
@@ -53,16 +61,24 @@ $code.=<<___ if ($i<16);
 	orr	$T1,$T1,$t2,lsl#8
 	orr	$T1,$T1,$t1,lsl#16
 	orr	$T1,$T1,$t0,lsl#24
-	`"str	$inp,[sp,#17*4]"	if ($i==15)`
+#endif
 ___
 $code.=<<___;
-	ldr	$t2,[$Ktbl],#4			@ *K256++
 	mov	$t0,$e,ror#$Sigma1[0]
-	str	$T1,[sp,#`$i%16`*4]
+	ldr	$t2,[$Ktbl],#4			@ *K256++
 	eor	$t0,$t0,$e,ror#$Sigma1[1]
 	eor	$t1,$f,$g
+#if $i>=16
+	add	$T1,$T1,$t3			@ from BODY_16_xx
+#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
+	rev	$T1,$T1
+#endif
+#if $i==15
+	str	$inp,[sp,#17*4]			@ leave room for $t3
+#endif
 	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
 	and	$t1,$t1,$e
+	str	$T1,[sp,#`$i%16`*4]
 	add	$T1,$T1,$t0
 	eor	$t1,$t1,$g			@ Ch(e,f,g)
 	add	$T1,$T1,$h
@@ -71,6 +87,9 @@ $code.=<<___;
 	eor	$h,$h,$a,ror#$Sigma0[1]
 	add	$T1,$T1,$t2
 	eor	$h,$h,$a,ror#$Sigma0[2]		@ Sigma0(a)
+#if $i>=15
+	ldr	$t3,[sp,#`($i+2)%16`*4]		@ from BODY_16_xx
+#endif
 	orr	$t0,$a,$b
 	and	$t1,$a,$b
 	and	$t0,$t0,$c
@@ -85,24 +104,26 @@ sub BODY_16_XX {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
+	@ ldr	$t3,[sp,#`($i+1)%16`*4]		@ $i
 	ldr	$t2,[sp,#`($i+14)%16`*4]
+	mov	$t0,$t3,ror#$sigma0[0]
 	ldr	$T1,[sp,#`($i+0)%16`*4]
-	mov	$t0,$t1,ror#$sigma0[0]
-	ldr	$inp,[sp,#`($i+9)%16`*4]
-	eor	$t0,$t0,$t1,ror#$sigma0[1]
-	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
-	mov	$t1,$t2,ror#$sigma1[0]
+	eor	$t0,$t0,$t3,ror#$sigma0[1]
+	ldr	$t1,[sp,#`($i+9)%16`*4]
+	eor	$t0,$t0,$t3,lsr#$sigma0[2]	@ sigma0(X[i+1])
+	mov	$t3,$t2,ror#$sigma1[0]
 	add	$T1,$T1,$t0
-	eor	$t1,$t1,$t2,ror#$sigma1[1]
-	add	$T1,$T1,$inp
-	eor	$t1,$t1,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	eor	$t3,$t3,$t2,ror#$sigma1[1]
 	add	$T1,$T1,$t1
+	eor	$t3,$t3,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
+	@ add	$T1,$T1,$t3
 ___
 	&BODY_00_15(@_);
 }
 
 $code=<<___;
+#include "arm_arch.h"
+
 .text
 .code	32
 
@@ -132,7 +153,7 @@ K256:
 sha256_block_data_order:
 	sub	r3,pc,#8		@ sha256_block_data_order
 	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
-	stmdb	sp!,{$ctx,$inp,$len,r4-r12,lr}
+	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
 	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
 	sub	$Ktbl,r3,#256		@ K256
 	sub	sp,sp,#16*4		@ alloca(X[16])
@@ -171,10 +192,14 @@ $code.=<<___;
 	bne	.Loop
 
 	add	sp,sp,#`16+3`*4	@ destroy frame
-	ldmia	sp!,{r4-r12,lr}
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r11,pc}
+#else
+	ldmia	sp!,{r4-r11,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
+#endif
 .size   sha256_block_data_order,.-sha256_block_data_order
 .asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
index 3a35861ac6..7faf37b147 100644
--- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-armv4.pl
@@ -18,22 +18,33 @@
 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
 # Cortex A8 core and ~40 cycles per processed byte.
 
+# February 2011.
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Coxtex A8 core and ~38 cycles per byte.
+
+# March 2011.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process
+# one byte in 25.5 cycles or 47% faster than integer-only code.
+
 # Byte order [in]dependence. =========================================
 #
-# Caller is expected to maintain specific *dword* order in h[0-7],
-# namely with most significant dword at *lower* address, which is
-# reflected in below two parameters. *Byte* order within these dwords
-# in turn is whatever *native* byte order on current platform.
-$hi=0;
-$lo=4;
+# Originally caller was expected to maintain specific *dword* order in
+# h[0-7], namely with most significant dword at *lower* address, which
+# was reflected in below two parameters as 0 and 4. Now caller is
+# expected to maintain native byte order for whole 64-bit values.
+$hi="HI";
+$lo="LO";
 # ====================================================================
 
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
-$ctx="r0";
+$ctx="r0";	# parameter block
 $inp="r1";
 $len="r2";
+
 $Tlo="r3";
 $Thi="r4";
 $Alo="r5";
@@ -61,15 +72,17 @@ $Xoff=8*8;
 sub BODY_00_15() {
 my $magic = shift;
 $code.=<<___;
-	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
-	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
 	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
 	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
 	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
 	mov	$t0,$Elo,lsr#14
+	str	$Tlo,[sp,#$Xoff+0]
 	mov	$t1,$Ehi,lsr#14
+	str	$Thi,[sp,#$Xoff+4]
 	eor	$t0,$t0,$Ehi,lsl#18
+	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
 	eor	$t1,$t1,$Elo,lsl#18
+	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
 	eor	$t0,$t0,$Elo,lsr#18
 	eor	$t1,$t1,$Ehi,lsr#18
 	eor	$t0,$t0,$Ehi,lsl#14
@@ -96,25 +109,24 @@ $code.=<<___;
 	and	$t1,$t1,$Ehi
 	str	$Ahi,[sp,#$Aoff+4]
 	eor	$t0,$t0,$t2
-	ldr	$t2,[$Ktbl,#4]		@ K[i].lo
+	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
 	eor	$t1,$t1,$t3		@ Ch(e,f,g)
-	ldr	$t3,[$Ktbl,#0]		@ K[i].hi
+	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi
 
 	adds	$Tlo,$Tlo,$t0
 	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
 	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
 	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
 	adds	$Tlo,$Tlo,$t2
+	and	$t0,$t2,#0xff
 	adc	$Thi,$Thi,$t3		@ T += K[i]
 	adds	$Elo,$Elo,$Tlo
+	ldr	$t2,[sp,#$Boff+0]	@ b.lo
 	adc	$Ehi,$Ehi,$Thi		@ d += T
-
-	and	$t0,$t2,#0xff
 	teq	$t0,#$magic
-	orreq	$Ktbl,$Ktbl,#1
 
-	ldr	$t2,[sp,#$Boff+0]	@ b.lo
 	ldr	$t3,[sp,#$Coff+0]	@ c.lo
+	orreq	$Ktbl,$Ktbl,#1
 	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
 	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
 	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@@ -131,80 +143,100 @@ $code.=<<___;
 	eor	$t0,$t0,$Alo,lsl#25
 	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
 	adds	$Tlo,$Tlo,$t0
+	and	$t0,$Alo,$t2
 	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)
 
-	and	$t0,$Alo,$t2
-	orr	$Alo,$Alo,$t2
 	ldr	$t1,[sp,#$Boff+4]	@ b.hi
+	orr	$Alo,$Alo,$t2
 	ldr	$t2,[sp,#$Coff+4]	@ c.hi
 	and	$Alo,$Alo,$t3
-	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
 	and	$t3,$Ahi,$t1
 	orr	$Ahi,$Ahi,$t1
+	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
 	and	$Ahi,$Ahi,$t2
-	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
 	adds	$Alo,$Alo,$Tlo
-	adc	$Ahi,$Ahi,$Thi		@ h += T
-
+	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
 	sub	sp,sp,#8
+	adc	$Ahi,$Ahi,$Thi		@ h += T
+	tst	$Ktbl,#1
 	add	$Ktbl,$Ktbl,#8
 ___
 }
 $code=<<___;
+#include "arm_arch.h"
+#ifdef __ARMEL__
+# define LO 0
+# define HI 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
+#else
+# define HI 0
+# define LO 4
+# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
+#endif
+
 .text
 .code	32
 .type	K512,%object
 .align	5
 K512:
-.word	0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
-.word	0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
-.word	0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
-.word	0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
-.word	0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
-.word	0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
-.word	0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
-.word	0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
-.word	0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
-.word	0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
-.word	0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
-.word	0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
-.word	0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
-.word	0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
-.word	0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
-.word	0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
-.word	0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
-.word	0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
-.word	0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
-.word	0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
-.word	0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
-.word	0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
-.word	0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
-.word	0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
-.word	0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
-.word	0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
-.word	0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
-.word	0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
-.word	0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
-.word	0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
-.word	0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
-.word	0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
-.word	0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
-.word	0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
-.word	0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
-.word	0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
-.word	0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
-.word	0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
-.word	0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
-.word	0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
+WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
+WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
+WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
+WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
+WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
+WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
+WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
+WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
+WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
+WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
+WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
+WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
+WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
+WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
+WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
+WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
+WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
+WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
+WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
+WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
+WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
+WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
+WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
+WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
+WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
+WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
+WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
+WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
+WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
+WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
+WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
+WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
+WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
+WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
+WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
+WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
+WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
+WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
+WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .size	K512,.-K512
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-sha512_block_data_order
+.skip	32-4
 
 .global	sha512_block_data_order
 .type	sha512_block_data_order,%function
 sha512_block_data_order:
 	sub	r3,pc,#8		@ sha512_block_data_order
 	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
+#if __ARM_ARCH__>=7
+	ldr	r12,.LOPENSSL_armcap
+	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
+	tst	r12,#1
+	bne	.LNEON
+#endif
 	stmdb	sp!,{r4-r12,lr}
-	sub	$Ktbl,r3,#640		@ K512
+	sub	$Ktbl,r3,#672		@ K512
 	sub	sp,sp,#9*8
 
 	ldr	$Elo,[$ctx,#$Eoff+$lo]
@@ -238,6 +270,7 @@ sha512_block_data_order:
 	str	$Thi,[sp,#$Foff+4]
 
 .L00_15:
+#if __ARM_ARCH__<7
 	ldrb	$Tlo,[$inp,#7]
 	ldrb	$t0, [$inp,#6]
 	ldrb	$t1, [$inp,#5]
@@ -252,26 +285,30 @@ sha512_block_data_order:
 	orr	$Thi,$Thi,$t3,lsl#8
 	orr	$Thi,$Thi,$t0,lsl#16
 	orr	$Thi,$Thi,$t1,lsl#24
-	str	$Tlo,[sp,#$Xoff+0]
-	str	$Thi,[sp,#$Xoff+4]
+#else
+	ldr	$Tlo,[$inp,#4]
+	ldr	$Thi,[$inp],#8
+#ifdef __ARMEL__
+	rev	$Tlo,$Tlo
+	rev	$Thi,$Thi
+#endif
+#endif
 ___
 	&BODY_00_15(0x94);
 $code.=<<___;
 	tst	$Ktbl,#1
 	beq	.L00_15
-	bic	$Ktbl,$Ktbl,#1
-
-.L16_79:
 	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
 	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
-	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
-	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
-
+	bic	$Ktbl,$Ktbl,#1
+.L16_79:
 	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
 	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
 	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
 	mov	$Tlo,$t0,lsr#1
+	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
 	mov	$Thi,$t1,lsr#1
+	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
 	eor	$Tlo,$Tlo,$t1,lsl#31
 	eor	$Thi,$Thi,$t0,lsl#31
 	eor	$Tlo,$Tlo,$t0,lsr#8
@@ -295,25 +332,24 @@ $code.=<<___;
 	eor	$t1,$t1,$t3,lsl#3
 	eor	$t0,$t0,$t2,lsr#6
 	eor	$t1,$t1,$t3,lsr#6
+	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
 	eor	$t0,$t0,$t3,lsl#26
 
-	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
 	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
 	adds	$Tlo,$Tlo,$t0
+	ldr	$t0,[sp,#`$Xoff+8*16`+0]
 	adc	$Thi,$Thi,$t1
 
-	ldr	$t0,[sp,#`$Xoff+8*16`+0]
 	ldr	$t1,[sp,#`$Xoff+8*16`+4]
 	adds	$Tlo,$Tlo,$t2
 	adc	$Thi,$Thi,$t3
 	adds	$Tlo,$Tlo,$t0
 	adc	$Thi,$Thi,$t1
-	str	$Tlo,[sp,#$Xoff+0]
-	str	$Thi,[sp,#$Xoff+4]
 ___
 	&BODY_00_15(0x17);
 $code.=<<___;
-	tst	$Ktbl,#1
+	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
+	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
 	beq	.L16_79
 	bic	$Ktbl,$Ktbl,#1
 
@@ -324,12 +360,12 @@ $code.=<<___;
 	ldr	$t2, [$ctx,#$Boff+$lo]
 	ldr	$t3, [$ctx,#$Boff+$hi]
 	adds	$t0,$Alo,$t0
-	adc	$t1,$Ahi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$t0, [$ctx,#$Aoff+$lo]
+	adc	$t1,$Ahi,$t1
 	str	$t1, [$ctx,#$Aoff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Boff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Boff+$hi]
 
 	ldr	$Alo,[sp,#$Coff+0]
@@ -341,12 +377,12 @@ $code.=<<___;
 	ldr	$t2, [$ctx,#$Doff+$lo]
 	ldr	$t3, [$ctx,#$Doff+$hi]
 	adds	$t0,$Alo,$t0
-	adc	$t1,$Ahi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$t0, [$ctx,#$Coff+$lo]
+	adc	$t1,$Ahi,$t1
 	str	$t1, [$ctx,#$Coff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Doff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Doff+$hi]
 
 	ldr	$Tlo,[sp,#$Foff+0]
@@ -356,12 +392,12 @@ $code.=<<___;
 	ldr	$t2, [$ctx,#$Foff+$lo]
 	ldr	$t3, [$ctx,#$Foff+$hi]
 	adds	$Elo,$Elo,$t0
-	adc	$Ehi,$Ehi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$Elo,[$ctx,#$Eoff+$lo]
+	adc	$Ehi,$Ehi,$t1
 	str	$Ehi,[$ctx,#$Eoff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Foff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Foff+$hi]
 
 	ldr	$Alo,[sp,#$Goff+0]
@@ -373,12 +409,12 @@ $code.=<<___;
 	ldr	$t2, [$ctx,#$Hoff+$lo]
 	ldr	$t3, [$ctx,#$Hoff+$hi]
 	adds	$t0,$Alo,$t0
-	adc	$t1,$Ahi,$t1
-	adds	$t2,$Tlo,$t2
-	adc	$t3,$Thi,$t3
 	str	$t0, [$ctx,#$Goff+$lo]
+	adc	$t1,$Ahi,$t1
 	str	$t1, [$ctx,#$Goff+$hi]
+	adds	$t2,$Tlo,$t2
 	str	$t2, [$ctx,#$Hoff+$lo]
+	adc	$t3,$Thi,$t3
 	str	$t3, [$ctx,#$Hoff+$hi]
 
 	add	sp,sp,#640
@@ -388,13 +424,156 @@ $code.=<<___;
 	bne	.Loop
 
 	add	sp,sp,#8*9		@ destroy frame
+#if __ARM_ARCH__>=5
+	ldmia	sp!,{r4-r12,pc}
+#else
 	ldmia	sp!,{r4-r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	bx	lr			@ interoperable with Thumb ISA:-)
-.size   sha512_block_data_order,.-sha512_block_data_order
-.asciz  "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+#endif
+___
+
+{
+my @Sigma0=(28,34,39);
+my @Sigma1=(14,18,41);
+my @sigma0=(1, 8, 7);
+my @sigma1=(19,61,6);
+
+my $Ktbl="r3";
+my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch
+
+my @X=map("d$_",(0..15));
+my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
+
+sub NEON_00_15() {
+my $i=shift;
+my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
+
+$code.=<<___ if ($i<16 || $i&1);
+	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
+#if $i<16
+	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
+#endif
+	vshr.u64	$t1,$e,#@Sigma1[1]
+	vshr.u64	$t2,$e,#@Sigma1[2]
+___
+$code.=<<___;
+	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
+	vsli.64		$t0,$e,#`64-@Sigma1[0]`
+	vsli.64		$t1,$e,#`64-@Sigma1[1]`
+	vsli.64		$t2,$e,#`64-@Sigma1[2]`
+#if $i<16 && defined(__ARMEL__)
+	vrev64.8	@X[$i],@X[$i]
+#endif
+	vadd.i64	$T1,$K,$h
+	veor		$Ch,$f,$g
+	veor		$t0,$t1
+	vand		$Ch,$e
+	veor		$t0,$t2			@ Sigma1(e)
+	veor		$Ch,$g			@ Ch(e,f,g)
+	vadd.i64	$T1,$t0
+	vshr.u64	$t0,$a,#@Sigma0[0]
+	vadd.i64	$T1,$Ch
+	vshr.u64	$t1,$a,#@Sigma0[1]
+	vshr.u64	$t2,$a,#@Sigma0[2]
+	vsli.64		$t0,$a,#`64-@Sigma0[0]`
+	vsli.64		$t1,$a,#`64-@Sigma0[1]`
+	vsli.64		$t2,$a,#`64-@Sigma0[2]`
+	vadd.i64	$T1,@X[$i%16]
+	vorr		$Maj,$a,$c
+	vand		$Ch,$a,$c
+	veor		$h,$t0,$t1
+	vand		$Maj,$b
+	veor		$h,$t2			@ Sigma0(a)
+	vorr		$Maj,$Ch		@ Maj(a,b,c)
+	vadd.i64	$h,$T1
+	vadd.i64	$d,$T1
+	vadd.i64	$h,$Maj
+___
+}
+
+sub NEON_16_79() {
+my $i=shift;
+
+if ($i&1)	{ &NEON_00_15($i,@_); return; }
+
+# 2x-vectorized, therefore runs every 2nd round
+my @X=map("q$_",(0..7));			# view @X as 128-bit vector
+my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
+my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
+my $e=@_[4];					# $e from NEON_00_15
+$i /= 2;
+$code.=<<___;
+	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
+	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
+	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
+	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
+	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
+	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
+	veor		$s1,$t0
+	vshr.u64	$t0,$s0,#@sigma0[0]
+	veor		$s1,$t1				@ sigma1(X[i+14])
+	vshr.u64	$t1,$s0,#@sigma0[1]
+	vadd.i64	@X[$i%8],$s1
+	vshr.u64	$s1,$s0,#@sigma0[2]
+	vsli.64		$t0,$s0,#`64-@sigma0[0]`
+	vsli.64		$t1,$s0,#`64-@sigma0[1]`
+	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
+	veor		$s1,$t0
+	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s0
+	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
+	veor		$s1,$t1				@ sigma0(X[i+1])
+	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
+	vadd.i64	@X[$i%8],$s1
+___
+	&NEON_00_15(2*$i,@_);
+}
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.align	4
+.LNEON:
+	dmb				@ errata #451034 on early Cortex A8
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+	sub	$Ktbl,r3,#672		@ K512
+	vldmia	$ctx,{$A-$H}		@ load context
+.Loop_neon:
+___
+for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	mov		$cnt,#4
+.L16_79_neon:
+	subs		$cnt,#1
+___
+for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	bne		.L16_79_neon
+
+	vldmia		$ctx,{d24-d31}	@ load context to temp
+	vadd.i64	q8,q12		@ vectorized accumulate
+	vadd.i64	q9,q13
+	vadd.i64	q10,q14
+	vadd.i64	q11,q15
+	vstmia		$ctx,{$A-$H}	@ save context
+	teq		$inp,$len
+	sub		$Ktbl,#640	@ rewind K512
+	bne		.Loop_neon
+
+	vldmia	sp!,{d8-d15}		@ epilogue
+	bx	lr
+#endif
+___
+}
+$code.=<<___;
+.size	sha512_block_data_order,.-sha512_block_data_order
+.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align	2
+.comm	OPENSSL_armcap_P,4,4
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/src/lib/libcrypto/sha/asm/sha512-mips.pl b/src/lib/libcrypto/sha/asm/sha512-mips.pl
new file mode 100644
index 0000000000..ba5b250890
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-mips.pl
@@ -0,0 +1,455 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA2 block procedures for MIPS.
+
+# October 2010.
+#
+# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc-
+# generated code in o32 build and ~55% in n32/64 build. SHA512 [which
+# for now can only be compiled for MIPS64 ISA] improvement is modest
+# ~17%, but it comes for free, because it's same instruction sequence.
+# Improvement coefficients are for aligned input.
+
+######################################################################
+# There is a number of MIPS ABI in use, O32 and N32/64 are most
+# widely used. Then there is a new contender: NUBI. It appears that if
+# one picks the latter, it's possible to arrange code in ABI neutral
+# manner. Therefore let's stick to NUBI register layout:
+#
+($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
+($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
+($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
+#
+# The return value is placed in $a0. Following coding rules facilitate
+# interoperability:
+#
+# - never ever touch $tp, "thread pointer", former $gp [o32 can be
+#   excluded from the rule, because it's specified volatile];
+# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
+#   old code];
+# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
+#
+# For reference here is register layout for N32/64 MIPS ABIs:
+#
+# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
+# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
+# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
+# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
+# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
+#
+$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64
+
+if ($flavour =~ /64|n32/i) {
+	$PTR_ADD="dadd";	# incidentally works even on n32
+	$PTR_SUB="dsub";	# incidentally works even on n32
+	$REG_S="sd";
+	$REG_L="ld";
+	$PTR_SLL="dsll";	# incidentally works even on n32
+	$SZREG=8;
+} else {
+	$PTR_ADD="add";
+	$PTR_SUB="sub";
+	$REG_S="sw";
+	$REG_L="lw";
+	$PTR_SLL="sll";
+	$SZREG=4;
+}
+$pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
+#
+# <appro@openssl.org>
+#
+######################################################################
+
+$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0;
+
+for (@ARGV) {	$output=$_ if (/^\w[\w\-]*\.\w+$/);	}
+open STDOUT,">$output";
+
+if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); }
+
+if ($output =~ /512/) {
+	$label="512";
+	$SZ=8;
+	$LD="ld";		# load from memory
+	$ST="sd";		# store to memory
+	$SLL="dsll";		# shift left logical
+	$SRL="dsrl";		# shift right logical
+	$ADDU="daddu";
+	@Sigma0=(28,34,39);
+	@Sigma1=(14,18,41);
+	@sigma0=( 7, 1, 8);	# right shift first
+	@sigma1=( 6,19,61);	# right shift first
+	$lastK=0x817;
+	$rounds=80;
+} else {
+	$label="256";
+	$SZ=4;
+	$LD="lw";		# load from memory
+	$ST="sw";		# store to memory
+	$SLL="sll";		# shift left logical
+	$SRL="srl";		# shift right logical
+	$ADDU="addu";
+	@Sigma0=( 2,13,22);
+	@Sigma1=( 6,11,25);
+	@sigma0=( 3, 7,18);	# right shift first
+	@sigma1=(10,17,19);	# right shift first
+	$lastK=0x8f2;
+	$rounds=64;
+}
+
+$MSB = $big_endian ? 0 : ($SZ-1);
+$LSB = ($SZ-1)&~$MSB;
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31));
+@X=map("\$$_",(8..23));
+
+$ctx=$a0;
+$inp=$a1;
+$len=$a2;	$Ktbl=$len;
+
+sub BODY_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]);
+
+$code.=<<___ if ($i<15);
+	${LD}l	@X[1],`($i+1)*$SZ+$MSB`($inp)
+	${LD}r	@X[1],`($i+1)*$SZ+$LSB`($inp)
+___
+$code.=<<___	if (!$big_endian && $i<16 && $SZ==4);
+	srl	$tmp0,@X[0],24		# byte swap($i)
+	srl	$tmp1,@X[0],8
+	andi	$tmp2,@X[0],0xFF00
+	sll	@X[0],@X[0],24
+	andi	$tmp1,0xFF00
+	sll	$tmp2,$tmp2,8
+	or	@X[0],$tmp0
+	or	$tmp1,$tmp2
+	or	@X[0],$tmp1
+___
+$code.=<<___	if (!$big_endian && $i<16 && $SZ==8);
+	ori	$tmp0,$zero,0xFF
+	dsll	$tmp2,$tmp0,32
+	or	$tmp0,$tmp2		# 0x000000FF000000FF
+	and	$tmp1,@X[0],$tmp0	# byte swap($i)
+	dsrl	$tmp2,@X[0],24
+	dsll	$tmp1,24
+	and	$tmp2,$tmp0
+	dsll	$tmp0,8			# 0x0000FF000000FF00
+	or	$tmp1,$tmp2
+	and	$tmp2,@X[0],$tmp0
+	dsrl	@X[0],8
+	dsll	$tmp2,8
+	and	@X[0],$tmp0
+	or	$tmp1,$tmp2
+	or	@X[0],$tmp1
+	dsrl	$tmp1,@X[0],32
+	dsll	@X[0],32
+	or	@X[0],$tmp1
+___
+$code.=<<___;
+	$ADDU	$T1,$X[0],$h			# $i
+	$SRL	$h,$e,@Sigma1[0]
+	xor	$tmp2,$f,$g
+	$SLL	$tmp1,$e,`$SZ*8-@Sigma1[2]`
+	and	$tmp2,$e
+	$SRL	$tmp0,$e,@Sigma1[1]
+	xor	$h,$tmp1
+	$SLL	$tmp1,$e,`$SZ*8-@Sigma1[1]`
+	xor	$h,$tmp0
+	$SRL	$tmp0,$e,@Sigma1[2]
+	xor	$h,$tmp1
+	$SLL	$tmp1,$e,`$SZ*8-@Sigma1[0]`
+	xor	$h,$tmp0
+	xor	$tmp2,$g			# Ch(e,f,g)
+	xor	$tmp0,$tmp1,$h			# Sigma1(e)
+
+	$SRL	$h,$a,@Sigma0[0]
+	$ADDU	$T1,$tmp2
+	$LD	$tmp2,`$i*$SZ`($Ktbl)		# K[$i]
+	$SLL	$tmp1,$a,`$SZ*8-@Sigma0[2]`
+	$ADDU	$T1,$tmp0
+	$SRL	$tmp0,$a,@Sigma0[1]
+	xor	$h,$tmp1
+	$SLL	$tmp1,$a,`$SZ*8-@Sigma0[1]`
+	xor	$h,$tmp0
+	$SRL	$tmp0,$a,@Sigma0[2]
+	xor	$h,$tmp1
+	$SLL	$tmp1,$a,`$SZ*8-@Sigma0[0]`
+	xor	$h,$tmp0
+	$ST	@X[0],`($i%16)*$SZ`($sp)	# offload to ring buffer
+	xor	$h,$tmp1			# Sigma0(a)
+
+	or	$tmp0,$a,$b
+	and	$tmp1,$a,$b
+	and	$tmp0,$c
+	or	$tmp1,$tmp0			# Maj(a,b,c)
+	$ADDU	$T1,$tmp2			# +=K[$i]
+	$ADDU	$h,$tmp1
+
+	$ADDU	$d,$T1
+	$ADDU	$h,$T1
+___
+$code.=<<___ if ($i>=13);
+	$LD	@X[3],`(($i+3)%16)*$SZ`($sp)	# prefetch from ring buffer
+___
+}
+
+sub BODY_16_XX {
+my $i=@_[0];
+my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]);
+
+$code.=<<___;
+	$SRL	$tmp2,@X[1],@sigma0[0]		# Xupdate($i)
+	$ADDU	@X[0],@X[9]			# +=X[i+9]
+	$SLL	$tmp1,@X[1],`$SZ*8-@sigma0[2]`
+	$SRL	$tmp0,@X[1],@sigma0[1]
+	xor	$tmp2,$tmp1
+	$SLL	$tmp1,`@sigma0[2]-@sigma0[1]`
+	xor	$tmp2,$tmp0
+	$SRL	$tmp0,@X[1],@sigma0[2]
+	xor	$tmp2,$tmp1
+
+	$SRL	$tmp3,@X[14],@sigma1[0]
+	xor	$tmp2,$tmp0			# sigma0(X[i+1])
+	$SLL	$tmp1,@X[14],`$SZ*8-@sigma1[2]`
+	$ADDU	@X[0],$tmp2
+	$SRL	$tmp0,@X[14],@sigma1[1]
+	xor	$tmp3,$tmp1
+	$SLL	$tmp1,`@sigma1[2]-@sigma1[1]`
+	xor	$tmp3,$tmp0
+	$SRL	$tmp0,@X[14],@sigma1[2]
+	xor	$tmp3,$tmp1
+
+	xor	$tmp3,$tmp0			# sigma1(X[i+14])
+	$ADDU	@X[0],$tmp3
+___
+	&BODY_00_15(@_);
+}
+
+$FRAMESIZE=16*$SZ+16*$SZREG;
+$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
+
+$code.=<<___;
+#ifdef OPENSSL_FIPSCANISTER
+# include <openssl/fipssyms.h>
+#endif
+
+.text
+.set	noat
+#if !defined(__vxworks) || defined(__pic__)
+.option	pic2
+#endif
+
+.align	5
+.globl	sha${label}_block_data_order
+.ent	sha${label}_block_data_order
+sha${label}_block_data_order:
+	.frame	$sp,$FRAMESIZE,$ra
+	.mask	$SAVED_REGS_MASK,-$SZREG
+	.set	noreorder
+___
+$code.=<<___ if ($flavour =~ /o32/i);	# o32 PIC-ification
+	.cpload	$pf
+___
+$code.=<<___;
+	$PTR_SUB $sp,$FRAMESIZE
+	$REG_S	$ra,$FRAMESIZE-1*$SZREG($sp)
+	$REG_S	$fp,$FRAMESIZE-2*$SZREG($sp)
+	$REG_S	$s11,$FRAMESIZE-3*$SZREG($sp)
+	$REG_S	$s10,$FRAMESIZE-4*$SZREG($sp)
+	$REG_S	$s9,$FRAMESIZE-5*$SZREG($sp)
+	$REG_S	$s8,$FRAMESIZE-6*$SZREG($sp)
+	$REG_S	$s7,$FRAMESIZE-7*$SZREG($sp)
+	$REG_S	$s6,$FRAMESIZE-8*$SZREG($sp)
+	$REG_S	$s5,$FRAMESIZE-9*$SZREG($sp)
+	$REG_S	$s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi prologue
+	$REG_S	$s3,$FRAMESIZE-11*$SZREG($sp)
+	$REG_S	$s2,$FRAMESIZE-12*$SZREG($sp)
+	$REG_S	$s1,$FRAMESIZE-13*$SZREG($sp)
+	$REG_S	$s0,$FRAMESIZE-14*$SZREG($sp)
+	$REG_S	$gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+	$PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)`
+___
+$code.=<<___ if ($flavour !~ /o32/i);	# non-o32 PIC-ification
+	.cplocal	$Ktbl
+	.cpsetup	$pf,$zero,sha${label}_block_data_order
+___
+$code.=<<___;
+	.set	reorder
+	la	$Ktbl,K${label}		# PIC-ified 'load address'
+
+	$LD	$A,0*$SZ($ctx)		# load context
+	$LD	$B,1*$SZ($ctx)
+	$LD	$C,2*$SZ($ctx)
+	$LD	$D,3*$SZ($ctx)
+	$LD	$E,4*$SZ($ctx)
+	$LD	$F,5*$SZ($ctx)
+	$LD	$G,6*$SZ($ctx)
+	$LD	$H,7*$SZ($ctx)
+
+	$PTR_ADD @X[15],$inp		# pointer to the end of input
+	$REG_S	@X[15],16*$SZ($sp)
+	b	.Loop
+
+.align	5
+.Loop:
+	${LD}l	@X[0],$MSB($inp)
+	${LD}r	@X[0],$LSB($inp)
+___
+for ($i=0;$i<16;$i++)
+{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
+$code.=<<___;
+	b	.L16_xx
+.align	4
+.L16_xx:
+___
+for (;$i<32;$i++)
+{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); }
+$code.=<<___;
+	and	@X[6],0xfff
+	li	@X[7],$lastK
+	.set	noreorder
+	bne	@X[6],@X[7],.L16_xx
+	$PTR_ADD $Ktbl,16*$SZ		# Ktbl+=16
+
+	$REG_L	@X[15],16*$SZ($sp)	# restore pointer to the end of input
+	$LD	@X[0],0*$SZ($ctx)
+	$LD	@X[1],1*$SZ($ctx)
+	$LD	@X[2],2*$SZ($ctx)
+	$PTR_ADD $inp,16*$SZ
+	$LD	@X[3],3*$SZ($ctx)
+	$ADDU	$A,@X[0]
+	$LD	@X[4],4*$SZ($ctx)
+	$ADDU	$B,@X[1]
+	$LD	@X[5],5*$SZ($ctx)
+	$ADDU	$C,@X[2]
+	$LD	@X[6],6*$SZ($ctx)
+	$ADDU	$D,@X[3]
+	$LD	@X[7],7*$SZ($ctx)
+	$ADDU	$E,@X[4]
+	$ST	$A,0*$SZ($ctx)
+	$ADDU	$F,@X[5]
+	$ST	$B,1*$SZ($ctx)
+	$ADDU	$G,@X[6]
+	$ST	$C,2*$SZ($ctx)
+	$ADDU	$H,@X[7]
+	$ST	$D,3*$SZ($ctx)
+	$ST	$E,4*$SZ($ctx)
+	$ST	$F,5*$SZ($ctx)
+	$ST	$G,6*$SZ($ctx)
+	$ST	$H,7*$SZ($ctx)
+
+	bnel	$inp,@X[15],.Loop
+	$PTR_SUB $Ktbl,`($rounds-16)*$SZ`	# rewind $Ktbl
+
+	$REG_L	$ra,$FRAMESIZE-1*$SZREG($sp)
+	$REG_L	$fp,$FRAMESIZE-2*$SZREG($sp)
+	$REG_L	$s11,$FRAMESIZE-3*$SZREG($sp)
+	$REG_L	$s10,$FRAMESIZE-4*$SZREG($sp)
+	$REG_L	$s9,$FRAMESIZE-5*$SZREG($sp)
+	$REG_L	$s8,$FRAMESIZE-6*$SZREG($sp)
+	$REG_L	$s7,$FRAMESIZE-7*$SZREG($sp)
+	$REG_L	$s6,$FRAMESIZE-8*$SZREG($sp)
+	$REG_L	$s5,$FRAMESIZE-9*$SZREG($sp)
+	$REG_L	$s4,$FRAMESIZE-10*$SZREG($sp)
+___
+$code.=<<___ if ($flavour =~ /nubi/i);
+	$REG_L	$s3,$FRAMESIZE-11*$SZREG($sp)
+	$REG_L	$s2,$FRAMESIZE-12*$SZREG($sp)
+	$REG_L	$s1,$FRAMESIZE-13*$SZREG($sp)
+	$REG_L	$s0,$FRAMESIZE-14*$SZREG($sp)
+	$REG_L	$gp,$FRAMESIZE-15*$SZREG($sp)
+___
+$code.=<<___;
+	jr	$ra
+	$PTR_ADD $sp,$FRAMESIZE
+.end	sha${label}_block_data_order
+
+.rdata
+.align	5
+K${label}:
+___
+if ($SZ==4) {
+$code.=<<___;
+	.word	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+	.word	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+	.word	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+	.word	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+	.word	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+	.word	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+	.word	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+	.word	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+	.word	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+	.word	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+	.word	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+	.word	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+	.word	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+	.word	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+	.word	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+	.word	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+___
+} else {
+$code.=<<___;
+	.dword	0x428a2f98d728ae22, 0x7137449123ef65cd
+	.dword	0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+	.dword	0x3956c25bf348b538, 0x59f111f1b605d019
+	.dword	0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+	.dword	0xd807aa98a3030242, 0x12835b0145706fbe
+	.dword	0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+	.dword	0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+	.dword	0x9bdc06a725c71235, 0xc19bf174cf692694
+	.dword	0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+	.dword	0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+	.dword	0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+	.dword	0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+	.dword	0x983e5152ee66dfab, 0xa831c66d2db43210
+	.dword	0xb00327c898fb213f, 0xbf597fc7beef0ee4
+	.dword	0xc6e00bf33da88fc2, 0xd5a79147930aa725
+	.dword	0x06ca6351e003826f, 0x142929670a0e6e70
+	.dword	0x27b70a8546d22ffc, 0x2e1b21385c26c926
+	.dword	0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+	.dword	0x650a73548baf63de, 0x766a0abb3c77b2a8
+	.dword	0x81c2c92e47edaee6, 0x92722c851482353b
+	.dword	0xa2bfe8a14cf10364, 0xa81a664bbc423001
+	.dword	0xc24b8b70d0f89791, 0xc76c51a30654be30
+	.dword	0xd192e819d6ef5218, 0xd69906245565a910
+	.dword	0xf40e35855771202a, 0x106aa07032bbd1b8
+	.dword	0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+	.dword	0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+	.dword	0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+	.dword	0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+	.dword	0x748f82ee5defb2fc, 0x78a5636f43172f60
+	.dword	0x84c87814a1f0ab72, 0x8cc702081a6439ec
+	.dword	0x90befffa23631e28, 0xa4506cebde82bde9
+	.dword	0xbef9a3f7b2c67915, 0xc67178f2e372532b
+	.dword	0xca273eceea26619c, 0xd186b8c721c0c207
+	.dword	0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+	.dword	0x06f067aa72176fba, 0x0a637dc5a2c898a6
+	.dword	0x113f9804bef90dae, 0x1b710b35131c471b
+	.dword	0x28db77f523047d84, 0x32caab7b40c72493
+	.dword	0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+	.dword	0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+	.dword	0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+___
+}
+$code.=<<___;
+.asciiz	"SHA${label} for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
+.align	5
+
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-parisc.pl b/src/lib/libcrypto/sha/asm/sha512-parisc.pl
new file mode 100755
index 0000000000..e24ee58ae9
--- /dev/null
+++ b/src/lib/libcrypto/sha/asm/sha512-parisc.pl
@@ -0,0 +1,791 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA256/512 block procedure for PA-RISC.
+
+# June 2009.
+#
+# SHA256 performance is >75% better than gcc 3.2 generated code on
+# PA-7100LC. Compared to code generated by vendor compiler this
+# implementation is almost 70% faster in 64-bit build, but delivers
+# virtually same performance in 32-bit build on PA-8600.
+#
+# SHA512 performance is >2.9x better than gcc 3.2 generated code on
+# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the
+# code is executed on PA-RISC 2.0 processor and switches to 64-bit
+# code path delivering adequate peformance even in "blended" 32-bit
+# build. Though 64-bit code is not any faster than code generated by
+# vendor compiler on PA-8600...
+#
+# Special thanks to polarhome.com for providing HP-UX account.
+
+$flavour = shift;
+$output = shift;
+open STDOUT,">$output";
+
+if ($flavour =~ /64/) {
+	$LEVEL		="2.0W";
+	$SIZE_T		=8;
+	$FRAME_MARKER	=80;
+	$SAVED_RP	=16;
+	$PUSH		="std";
+	$PUSHMA		="std,ma";
+	$POP		="ldd";
+	$POPMB		="ldd,mb";
+} else {
+	$LEVEL		="1.0";
+	$SIZE_T		=4;
+	$FRAME_MARKER	=48;
+	$SAVED_RP	=20;
+	$PUSH		="stw";
+	$PUSHMA		="stwm";
+	$POP		="ldw";
+	$POPMB		="ldwm";
+}
+
+if ($output =~ /512/) {
+	$func="sha512_block_data_order";
+	$SZ=8;
+	@Sigma0=(28,34,39);
+	@Sigma1=(14,18,41);
+	@sigma0=(1,  8, 7);
+	@sigma1=(19,61, 6);
+	$rounds=80;
+	$LAST10BITS=0x017;
+	$LD="ldd";
+	$LDM="ldd,ma";
+	$ST="std";
+} else {
+	$func="sha256_block_data_order";
+	$SZ=4;
+	@Sigma0=( 2,13,22);
+	@Sigma1=( 6,11,25);
+	@sigma0=( 7,18, 3);
+	@sigma1=(17,19,10);
+	$rounds=64;
+	$LAST10BITS=0x0f2;
+	$LD="ldw";
+	$LDM="ldwm";
+	$ST="stw";
+}
+
+$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
+				#                 [+ argument transfer]
+$XOFF=16*$SZ+32;		# local variables
+$FRAME+=$XOFF;
+$XOFF+=$FRAME_MARKER;		# distance between %sp and local variables
+
+$ctx="%r26";	# zapped by $a0
+$inp="%r25";	# zapped by $a1
+$num="%r24";	# zapped by $t0
+
+$a0 ="%r26";
+$a1 ="%r25";
+$t0 ="%r24";
+$t1 ="%r29";
+$Tbl="%r31";
+
+@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
+
+@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+    "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
+
+sub ROUND_00_15 {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+$code.=<<___;
+	_ror	$e,$Sigma1[0],$a0
+	and	$f,$e,$t0
+	_ror	$e,$Sigma1[1],$a1
+	addl	$t1,$h,$h
+	andcm	$g,$e,$t1
+	xor	$a1,$a0,$a0
+	_ror	$a1,`$Sigma1[2]-$Sigma1[1]`,$a1
+	or	$t0,$t1,$t1		; Ch(e,f,g)
+	addl	@X[$i%16],$h,$h
+	xor	$a0,$a1,$a1		; Sigma1(e)
+	addl	$t1,$h,$h
+	_ror	$a,$Sigma0[0],$a0
+	addl	$a1,$h,$h
+
+	_ror	$a,$Sigma0[1],$a1
+	and	$a,$b,$t0
+	and	$a,$c,$t1
+	xor	$a1,$a0,$a0
+	_ror	$a1,`$Sigma0[2]-$Sigma0[1]`,$a1
+	xor	$t1,$t0,$t0
+	and	$b,$c,$t1
+	xor	$a0,$a1,$a1		; Sigma0(a)
+	addl	$h,$d,$d
+	xor	$t1,$t0,$t0		; Maj(a,b,c)
+	`"$LDM	$SZ($Tbl),$t1" if ($i<15)`
+	addl	$a1,$h,$h
+	addl	$t0,$h,$h
+
+___
+}
+
+sub ROUND_16_xx {
+my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
+$i-=16;
+$code.=<<___;
+	_ror	@X[($i+1)%16],$sigma0[0],$a0
+	_ror	@X[($i+1)%16],$sigma0[1],$a1
+	addl	@X[($i+9)%16],@X[$i],@X[$i]
+	_ror	@X[($i+14)%16],$sigma1[0],$t0
+	_ror	@X[($i+14)%16],$sigma1[1],$t1
+	xor	$a1,$a0,$a0
+	_shr	@X[($i+1)%16],$sigma0[2],$a1
+	xor	$t1,$t0,$t0
+	_shr	@X[($i+14)%16],$sigma1[2],$t1
+	xor	$a1,$a0,$a0		; sigma0(X[(i+1)&0x0f])
+	xor	$t1,$t0,$t0		; sigma1(X[(i+14)&0x0f])
+	$LDM	$SZ($Tbl),$t1
+	addl	$a0,@X[$i],@X[$i]
+	addl	$t0,@X[$i],@X[$i]
+___
+$code.=<<___ if ($i==15);
+	extru	$t1,31,10,$a1
+	comiclr,<> $LAST10BITS,$a1,%r0
+	ldo	1($Tbl),$Tbl		; signal end of $Tbl
+___
+&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
+}
+
+$code=<<___;
+	.LEVEL	$LEVEL
+	.SPACE	\$TEXT\$
+	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
+
+	.ALIGN	64
+L\$table
+___
+$code.=<<___ if ($SZ==8);
+	.WORD	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
+	.WORD	0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
+	.WORD	0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
+	.WORD	0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
+	.WORD	0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
+	.WORD	0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
+	.WORD	0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
+	.WORD	0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
+	.WORD	0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
+	.WORD	0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
+	.WORD	0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
+	.WORD	0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
+	.WORD	0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
+	.WORD	0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
+	.WORD	0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
+	.WORD	0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
+	.WORD	0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
+	.WORD	0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
+	.WORD	0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
+	.WORD	0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
+	.WORD	0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
+	.WORD	0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
+	.WORD	0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
+	.WORD	0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
+	.WORD	0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
+	.WORD	0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
+	.WORD	0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
+	.WORD	0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
+	.WORD	0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
+	.WORD	0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
+	.WORD	0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
+	.WORD	0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
+	.WORD	0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
+	.WORD	0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
+	.WORD	0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
+	.WORD	0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
+	.WORD	0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
+	.WORD	0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
+	.WORD	0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
+	.WORD	0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
+___
+$code.=<<___ if ($SZ==4);
+	.WORD	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+	.WORD	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+	.WORD	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+	.WORD	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+	.WORD	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+	.WORD	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+	.WORD	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+	.WORD	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+	.WORD	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+	.WORD	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+	.WORD	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+	.WORD	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+	.WORD	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+	.WORD	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+	.WORD	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+	.WORD	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+___
+$code.=<<___;
+
+	.EXPORT	$func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
+	.ALIGN	64
+$func
+	.PROC
+	.CALLINFO	FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
+	.ENTRY
+	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
+	$PUSHMA	%r3,$FRAME(%sp)
+	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
+	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
+	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
+	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
+	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
+	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
+	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
+	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
+	$PUSH	%r12,`-$FRAME+9*$SIZE_T`(%sp)
+	$PUSH	%r13,`-$FRAME+10*$SIZE_T`(%sp)
+	$PUSH	%r14,`-$FRAME+11*$SIZE_T`(%sp)
+	$PUSH	%r15,`-$FRAME+12*$SIZE_T`(%sp)
+	$PUSH	%r16,`-$FRAME+13*$SIZE_T`(%sp)
+	$PUSH	%r17,`-$FRAME+14*$SIZE_T`(%sp)
+	$PUSH	%r18,`-$FRAME+15*$SIZE_T`(%sp)
+
+	_shl	$num,`log(16*$SZ)/log(2)`,$num
+	addl	$inp,$num,$num		; $num to point at the end of $inp
+
+	$PUSH	$num,`-$FRAME_MARKER-4*$SIZE_T`(%sp)	; save arguments
+	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
+	$PUSH	$ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
+
+	blr	%r0,$Tbl
+	ldi	3,$t1
+L\$pic
+	andcm	$Tbl,$t1,$Tbl		; wipe privilege level
+	ldo	L\$table-L\$pic($Tbl),$Tbl
+___
+$code.=<<___ if ($SZ==8 && $SIZE_T==4);
+	ldi	31,$t1
+	mtctl	$t1,%cr11
+	extrd,u,*= $t1,%sar,1,$t1	; executes on PA-RISC 1.0
+	b	L\$parisc1
+	nop
+___
+$code.=<<___;
+	$LD	`0*$SZ`($ctx),$A	; load context
+	$LD	`1*$SZ`($ctx),$B
+	$LD	`2*$SZ`($ctx),$C
+	$LD	`3*$SZ`($ctx),$D
+	$LD	`4*$SZ`($ctx),$E
+	$LD	`5*$SZ`($ctx),$F
+	$LD	`6*$SZ`($ctx),$G
+	$LD	`7*$SZ`($ctx),$H
+
+	extru	$inp,31,`log($SZ)/log(2)`,$t0
+	sh3addl	$t0,%r0,$t0
+	subi	`8*$SZ`,$t0,$t0
+	mtctl	$t0,%cr11		; load %sar with align factor
+
+L\$oop
+	ldi	`$SZ-1`,$t0
+	$LDM	$SZ($Tbl),$t1
+	andcm	$inp,$t0,$t0		; align $inp
+___
+	for ($i=0;$i<15;$i++) {		# load input block
+	$code.="\t$LD	`$SZ*$i`($t0),@X[$i]\n";		}
+$code.=<<___;
+	cmpb,*=	$inp,$t0,L\$aligned
+	$LD	`$SZ*15`($t0),@X[15]
+	$LD	`$SZ*16`($t0),@X[16]
+___
+	for ($i=0;$i<16;$i++) {		# align data
+	$code.="\t_align	@X[$i],@X[$i+1],@X[$i]\n";	}
+$code.=<<___;
+L\$aligned
+	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
+___
+
+for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+L\$rounds
+	nop	; otherwise /usr/ccs/bin/as is confused by below .WORD
+___
+for(;$i<32;$i++)	{ &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
+$code.=<<___;
+	bb,>=	$Tbl,31,L\$rounds	; end of $Tbl signalled?
+	nop
+
+	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
+	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
+	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
+	ldo	`-$rounds*$SZ-1`($Tbl),$Tbl		; rewind $Tbl
+
+	$LD	`0*$SZ`($ctx),@X[0]	; load context
+	$LD	`1*$SZ`($ctx),@X[1]
+	$LD	`2*$SZ`($ctx),@X[2]
+	$LD	`3*$SZ`($ctx),@X[3]
+	$LD	`4*$SZ`($ctx),@X[4]
+	$LD	`5*$SZ`($ctx),@X[5]
+	addl	@X[0],$A,$A
+	$LD	`6*$SZ`($ctx),@X[6]
+	addl	@X[1],$B,$B
+	$LD	`7*$SZ`($ctx),@X[7]
+	ldo	`16*$SZ`($inp),$inp	; advance $inp
+
+	$ST	$A,`0*$SZ`($ctx)	; save context
+	addl	@X[2],$C,$C
+	$ST	$B,`1*$SZ`($ctx)
+	addl	@X[3],$D,$D
+	$ST	$C,`2*$SZ`($ctx)
+	addl	@X[4],$E,$E
+	$ST	$D,`3*$SZ`($ctx)
+	addl	@X[5],$F,$F
+	$ST	$E,`4*$SZ`($ctx)
+	addl	@X[6],$G,$G
+	$ST	$F,`5*$SZ`($ctx)
+	addl	@X[7],$H,$H
+	$ST	$G,`6*$SZ`($ctx)
+	$ST	$H,`7*$SZ`($ctx)
+
+	cmpb,*<>,n $inp,$num,L\$oop
+	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
+___
+if ($SZ==8 && $SIZE_T==4)	# SHA512 for 32-bit PA-RISC 1.0
+{{
+$code.=<<___;
+	b	L\$done
+	nop
+
+	.ALIGN	64
+L\$parisc1
+___
+
+@V=(  $Ahi,  $Alo,  $Bhi,  $Blo,  $Chi,  $Clo,  $Dhi,  $Dlo,
+      $Ehi,  $Elo,  $Fhi,  $Flo,  $Ghi,  $Glo,  $Hhi,  $Hlo) = 
+   ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
+     "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
+$a0 ="%r17";
+$a1 ="%r18";
+$a2 ="%r19";
+$a3 ="%r20";
+$t0 ="%r21";
+$t1 ="%r22";
+$t2 ="%r28";
+$t3 ="%r29";
+$Tbl="%r31";
+
+@X=("%r23","%r24","%r25","%r26");	# zaps $num,$inp,$ctx
+
+sub ROUND_00_15_pa1 {
+my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
+       $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
+my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
+
+$code.=<<___ if (!$flag);
+	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
+	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
+___
+$code.=<<___;
+	shd	$ehi,$elo,$Sigma1[0],$t0
+	 add	$Xlo,$hlo,$hlo
+	shd	$elo,$ehi,$Sigma1[0],$t1
+	 addc	$Xhi,$hhi,$hhi		; h += X[i]
+	shd	$ehi,$elo,$Sigma1[1],$t2
+	 ldwm	8($Tbl),$Xhi
+	shd	$elo,$ehi,$Sigma1[1],$t3
+	 ldw	-4($Tbl),$Xlo		; load K[i]
+	xor	$t2,$t0,$t0
+	xor	$t3,$t1,$t1
+	 and	$flo,$elo,$a0
+	 and	$fhi,$ehi,$a1
+	shd	$ehi,$elo,$Sigma1[2],$t2
+	 andcm	$glo,$elo,$a2
+	shd	$elo,$ehi,$Sigma1[2],$t3
+	 andcm	$ghi,$ehi,$a3
+	xor	$t2,$t0,$t0
+	xor	$t3,$t1,$t1		; Sigma1(e)
+	add	$Xlo,$hlo,$hlo
+	 xor	$a2,$a0,$a0
+	addc	$Xhi,$hhi,$hhi		; h += K[i]
+	 xor	$a3,$a1,$a1		; Ch(e,f,g)
+
+	 add	$t0,$hlo,$hlo
+	shd	$ahi,$alo,$Sigma0[0],$t0
+	 addc	$t1,$hhi,$hhi		; h += Sigma1(e)
+	shd	$alo,$ahi,$Sigma0[0],$t1	
+	 add	$a0,$hlo,$hlo
+	shd	$ahi,$alo,$Sigma0[1],$t2
+	 addc	$a1,$hhi,$hhi		; h += Ch(e,f,g)
+	shd	$alo,$ahi,$Sigma0[1],$t3
+
+	xor	$t2,$t0,$t0
+	xor	$t3,$t1,$t1
+	shd	$ahi,$alo,$Sigma0[2],$t2
+	and	$alo,$blo,$a0
+	shd	$alo,$ahi,$Sigma0[2],$t3
+	and	$ahi,$bhi,$a1
+	xor	$t2,$t0,$t0
+	xor	$t3,$t1,$t1		; Sigma0(a)
+
+	and	$alo,$clo,$a2
+	and	$ahi,$chi,$a3
+	xor	$a2,$a0,$a0
+	 add	$hlo,$dlo,$dlo
+	xor	$a3,$a1,$a1
+	 addc	$hhi,$dhi,$dhi		; d += h
+	and	$blo,$clo,$a2
+	 add	$t0,$hlo,$hlo
+	and	$bhi,$chi,$a3
+	 addc	$t1,$hhi,$hhi		; h += Sigma0(a)
+	xor	$a2,$a0,$a0
+	 add	$a0,$hlo,$hlo
+	xor	$a3,$a1,$a1		; Maj(a,b,c)
+	 addc	$a1,$hhi,$hhi		; h += Maj(a,b,c)
+
+___
+$code.=<<___ if ($i==15 && $flag);
+	extru	$Xlo,31,10,$Xlo
+	comiclr,= $LAST10BITS,$Xlo,%r0
+	b	L\$rounds_pa1
+	nop
+___
+push(@X,shift(@X)); push(@X,shift(@X));
+}
+
+sub ROUND_16_xx_pa1 {
+my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
+my ($i)=shift;
+$i-=16;
+$code.=<<___;
+	ldw	`-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
+	ldw	`-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo	; load X[i+1]
+	ldw	`-$XOFF+8*(($i+9)%16)`(%sp),$a1
+	ldw	`-$XOFF+8*(($i+9)%16)+4`(%sp),$a0	; load X[i+9]
+	ldw	`-$XOFF+8*(($i+14)%16)`(%sp),$a3
+	ldw	`-$XOFF+8*(($i+14)%16)+4`(%sp),$a2	; load X[i+14]
+	shd	$Xnhi,$Xnlo,$sigma0[0],$t0
+	shd	$Xnlo,$Xnhi,$sigma0[0],$t1
+	 add	$a0,$Xlo,$Xlo
+	shd	$Xnhi,$Xnlo,$sigma0[1],$t2
+	 addc	$a1,$Xhi,$Xhi
+	shd	$Xnlo,$Xnhi,$sigma0[1],$t3
+	xor	$t2,$t0,$t0
+	shd	$Xnhi,$Xnlo,$sigma0[2],$t2
+	xor	$t3,$t1,$t1
+	extru	$Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
+	xor	$t2,$t0,$t0
+	 shd	$a3,$a2,$sigma1[0],$a0
+	xor	$t3,$t1,$t1		; sigma0(X[i+1)&0x0f])
+	 shd	$a2,$a3,$sigma1[0],$a1
+	add	$t0,$Xlo,$Xlo
+	 shd	$a3,$a2,$sigma1[1],$t2
+	addc	$t1,$Xhi,$Xhi
+	 shd	$a2,$a3,$sigma1[1],$t3
+	xor	$t2,$a0,$a0
+	shd	$a3,$a2,$sigma1[2],$t2
+	xor	$t3,$a1,$a1
+	extru	$a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
+	xor	$t2,$a0,$a0
+	xor	$t3,$a1,$a1		; sigma0(X[i+14)&0x0f])
+	add	$a0,$Xlo,$Xlo
+	addc	$a1,$Xhi,$Xhi
+
+	stw	$Xhi,`-$XOFF+8*($i%16)`(%sp)
+	stw	$Xlo,`-$XOFF+8*($i%16)+4`(%sp)
+___
+&ROUND_00_15_pa1($i,@_,1);
+}
+$code.=<<___;
+	ldw	`0*4`($ctx),$Ahi		; load context
+	ldw	`1*4`($ctx),$Alo
+	ldw	`2*4`($ctx),$Bhi
+	ldw	`3*4`($ctx),$Blo
+	ldw	`4*4`($ctx),$Chi
+	ldw	`5*4`($ctx),$Clo
+	ldw	`6*4`($ctx),$Dhi
+	ldw	`7*4`($ctx),$Dlo
+	ldw	`8*4`($ctx),$Ehi
+	ldw	`9*4`($ctx),$Elo
+	ldw	`10*4`($ctx),$Fhi
+	ldw	`11*4`($ctx),$Flo
+	ldw	`12*4`($ctx),$Ghi
+	ldw	`13*4`($ctx),$Glo
+	ldw	`14*4`($ctx),$Hhi
+	ldw	`15*4`($ctx),$Hlo
+
+	extru	$inp,31,2,$t0
+	sh3addl	$t0,%r0,$t0
+	subi	32,$t0,$t0
+	mtctl	$t0,%cr11		; load %sar with align factor
+
+L\$oop_pa1
+	extru	$inp,31,2,$a3
+	comib,=	0,$a3,L\$aligned_pa1
+	sub	$inp,$a3,$inp
+
+	ldw	`0*4`($inp),$X[0]
+	ldw	`1*4`($inp),$X[1]
+	ldw	`2*4`($inp),$t2
+	ldw	`3*4`($inp),$t3
+	ldw	`4*4`($inp),$a0
+	ldw	`5*4`($inp),$a1
+	ldw	`6*4`($inp),$a2
+	ldw	`7*4`($inp),$a3
+	vshd	$X[0],$X[1],$X[0]
+	vshd	$X[1],$t2,$X[1]
+	stw	$X[0],`-$XOFF+0*4`(%sp)
+	ldw	`8*4`($inp),$t0
+	vshd	$t2,$t3,$t2
+	stw	$X[1],`-$XOFF+1*4`(%sp)
+	ldw	`9*4`($inp),$t1
+	vshd	$t3,$a0,$t3
+___
+{
+my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
+for ($i=2;$i<=(128/4-8);$i++) {
+$code.=<<___;
+	stw	$t[0],`-$XOFF+$i*4`(%sp)
+	ldw	`(8+$i)*4`($inp),$t[0]
+	vshd	$t[1],$t[2],$t[1]
+___
+push(@t,shift(@t));
+}
+for (;$i<(128/4-1);$i++) {
+$code.=<<___;
+	stw	$t[0],`-$XOFF+$i*4`(%sp)
+	vshd	$t[1],$t[2],$t[1]
+___
+push(@t,shift(@t));
+}
+$code.=<<___;
+	b	L\$collected_pa1
+	stw	$t[0],`-$XOFF+$i*4`(%sp)
+
+___
+}
+$code.=<<___;
+L\$aligned_pa1
+	ldw	`0*4`($inp),$X[0]
+	ldw	`1*4`($inp),$X[1]
+	ldw	`2*4`($inp),$t2
+	ldw	`3*4`($inp),$t3
+	ldw	`4*4`($inp),$a0
+	ldw	`5*4`($inp),$a1
+	ldw	`6*4`($inp),$a2
+	ldw	`7*4`($inp),$a3
+	stw	$X[0],`-$XOFF+0*4`(%sp)
+	ldw	`8*4`($inp),$t0
+	stw	$X[1],`-$XOFF+1*4`(%sp)
+	ldw	`9*4`($inp),$t1
+___
+{
+my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
+for ($i=2;$i<(128/4-8);$i++) {
+$code.=<<___;
+	stw	$t[0],`-$XOFF+$i*4`(%sp)
+	ldw	`(8+$i)*4`($inp),$t[0]
+___
+push(@t,shift(@t));
+}
+for (;$i<128/4;$i++) {
+$code.=<<___;
+	stw	$t[0],`-$XOFF+$i*4`(%sp)
+___
+push(@t,shift(@t));
+}
+$code.="L\$collected_pa1\n";
+}
+
+for($i=0;$i<16;$i++)	{ &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
+$code.="L\$rounds_pa1\n";
+for(;$i<32;$i++)	{ &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+	$POP	`-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx	; restore arguments
+	$POP	`-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
+	$POP	`-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
+	ldo	`-$rounds*$SZ`($Tbl),$Tbl		; rewind $Tbl
+
+	ldw	`0*4`($ctx),$t1		; update context
+	ldw	`1*4`($ctx),$t0
+	ldw	`2*4`($ctx),$t3
+	ldw	`3*4`($ctx),$t2
+	ldw	`4*4`($ctx),$a1
+	ldw	`5*4`($ctx),$a0
+	ldw	`6*4`($ctx),$a3
+	add	$t0,$Alo,$Alo
+	ldw	`7*4`($ctx),$a2
+	addc	$t1,$Ahi,$Ahi
+	ldw	`8*4`($ctx),$t1
+	add	$t2,$Blo,$Blo
+	ldw	`9*4`($ctx),$t0
+	addc	$t3,$Bhi,$Bhi
+	ldw	`10*4`($ctx),$t3
+	add	$a0,$Clo,$Clo
+	ldw	`11*4`($ctx),$t2
+	addc	$a1,$Chi,$Chi
+	ldw	`12*4`($ctx),$a1
+	add	$a2,$Dlo,$Dlo
+	ldw	`13*4`($ctx),$a0
+	addc	$a3,$Dhi,$Dhi
+	ldw	`14*4`($ctx),$a3
+	add	$t0,$Elo,$Elo
+	ldw	`15*4`($ctx),$a2
+	addc	$t1,$Ehi,$Ehi
+	stw	$Ahi,`0*4`($ctx)
+	add	$t2,$Flo,$Flo
+	stw	$Alo,`1*4`($ctx)
+	addc	$t3,$Fhi,$Fhi
+	stw	$Bhi,`2*4`($ctx)
+	add	$a0,$Glo,$Glo
+	stw	$Blo,`3*4`($ctx)
+	addc	$a1,$Ghi,$Ghi
+	stw	$Chi,`4*4`($ctx)
+	add	$a2,$Hlo,$Hlo
+	stw	$Clo,`5*4`($ctx)
+	addc	$a3,$Hhi,$Hhi
+	stw	$Dhi,`6*4`($ctx)
+	ldo	`16*$SZ`($inp),$inp	; advance $inp
+	stw	$Dlo,`7*4`($ctx)
+	stw	$Ehi,`8*4`($ctx)
+	stw	$Elo,`9*4`($ctx)
+	stw	$Fhi,`10*4`($ctx)
+	stw	$Flo,`11*4`($ctx)
+	stw	$Ghi,`12*4`($ctx)
+	stw	$Glo,`13*4`($ctx)
+	stw	$Hhi,`14*4`($ctx)
+	comb,=	$inp,$num,L\$done
+	stw	$Hlo,`15*4`($ctx)
+	b	L\$oop_pa1
+	$PUSH	$inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)	; save $inp
+L\$done
+___
+}}
+$code.=<<___;
+	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
+	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
+	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
+	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
+	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
+	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
+	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
+	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
+	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
+	$POP	`-$FRAME+9*$SIZE_T`(%sp),%r12
+	$POP	`-$FRAME+10*$SIZE_T`(%sp),%r13
+	$POP	`-$FRAME+11*$SIZE_T`(%sp),%r14
+	$POP	`-$FRAME+12*$SIZE_T`(%sp),%r15
+	$POP	`-$FRAME+13*$SIZE_T`(%sp),%r16
+	$POP	`-$FRAME+14*$SIZE_T`(%sp),%r17
+	$POP	`-$FRAME+15*$SIZE_T`(%sp),%r18
+	bv	(%r2)
+	.EXIT
+	$POPMB	-$FRAME(%sp),%r3
+	.PROCEND
+	.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Explicitly encode PA-RISC 2.0 instructions used in this module, so
+# that it can be compiled with .LEVEL 1.0. It should be noted that I
+# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
+# directive...
+
+my $ldd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "ldd$mod\t$args";
+
+    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices
+    {	my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1);
+	$opcode|=(1<<3) if ($mod =~ /^,m/);
+	$opcode|=(1<<2) if ($mod =~ /^,mb/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $std = sub {
+  my ($mod,$args) = @_;
+  my $orig = "std$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices
+    {	my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $extrd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "extrd$mod\t$args";
+
+    # I only have ",u" completer, it's implicitly encoded...
+    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
+    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
+	my $len=32-$3;
+	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
+	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
+    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
+	my $len=32-$2;
+	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
+	$opcode |= (1<<13) if ($mod =~ /,\**=/);
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+my $shrpd = sub {
+  my ($mod,$args) = @_;
+  my $orig = "shrpd$mod\t$args";
+
+    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
+    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
+	my $cpos=63-$3;
+	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
+	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
+    }
+    elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
+    {	sprintf "\t.WORD\t0x%08x\t; %s",
+		(0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig;
+    }
+    else { "\t".$orig; }
+};
+
+sub assemble {
+  my ($mnemonic,$mod,$args)=@_;
+  my $opcode = eval("\$$mnemonic");
+
+    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/ge;
+
+	s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
+		$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32)	# rotation for >=32
+		:       sprintf("shd\t%$1,%$2,%d",$3)/e			or
+	# translate made up instructons: _ror, _shr, _align, _shl
+	s/_ror(\s+)(%r[0-9]+),/
+		($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e			or
+
+	s/_shr(\s+%r[0-9]+),([0-9]+),/
+		$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
+		:        sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e	or
+
+	s/_align(\s+%r[0-9]+,%r[0-9]+),/
+		($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e		or
+
+	s/_shl(\s+%r[0-9]+),([0-9]+),/
+		$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
+		:            sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
+
+	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
+
+	s/cmpb,\*/comb,/ if ($SIZE_T==4);
+
+	print $_,"\n";
+}
+
+close STDOUT;
diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
index 768a6a6fad..6b44a68e59 100755
--- a/src/lib/libcrypto/sha/asm/sha512-ppc.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-ppc.pl
@@ -40,6 +40,7 @@ $output =shift;
 
 if ($flavour =~ /64/) {
 	$SIZE_T=8;
+	$LRSAVE=2*$SIZE_T;
 	$STU="stdu";
 	$UCMP="cmpld";
 	$SHL="sldi";
@@ -47,6 +48,7 @@ if ($flavour =~ /64/) {
 	$PUSH="std";
 } elsif ($flavour =~ /32/) {
 	$SIZE_T=4;
+	$LRSAVE=$SIZE_T;
 	$STU="stwu";
 	$UCMP="cmplw";
 	$SHL="slwi";
@@ -87,7 +89,8 @@ if ($output =~ /512/) {
 	$SHR="srwi";
 }
 
-$FRAME=32*$SIZE_T;
+$FRAME=32*$SIZE_T+16*$SZ;
+$LOCALS=6*$SIZE_T;
 
 $sp ="r1";
 $toc="r2";
@@ -179,13 +182,12 @@ $code=<<___;
 .globl	$func
 .align	6
 $func:
+	$STU	$sp,-$FRAME($sp)
 	mflr	r0
-	$STU	$sp,`-($FRAME+16*$SZ)`($sp)
 	$SHL	$num,$num,`log(16*$SZ)/log(2)`
 
 	$PUSH	$ctx,`$FRAME-$SIZE_T*22`($sp)
 
-	$PUSH	r0,`$FRAME-$SIZE_T*21`($sp)
 	$PUSH	$toc,`$FRAME-$SIZE_T*20`($sp)
 	$PUSH	r13,`$FRAME-$SIZE_T*19`($sp)
 	$PUSH	r14,`$FRAME-$SIZE_T*18`($sp)
@@ -206,6 +208,7 @@ $func:
 	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
 	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
 	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
+	$PUSH	r0,`$FRAME+$LRSAVE`($sp)
 
 	$LD	$A,`0*$SZ`($ctx)
 	mr	$inp,r4				; incarnate $inp
@@ -217,7 +220,7 @@ $func:
 	$LD	$G,`6*$SZ`($ctx)
 	$LD	$H,`7*$SZ`($ctx)
 
-	b	LPICmeup
+	bl	LPICmeup
 LPICedup:
 	andi.	r0,$inp,3
 	bne	Lunaligned
@@ -226,40 +229,14 @@ Laligned:
 	$PUSH	$num,`$FRAME-$SIZE_T*24`($sp)	; end pointer
 	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
 	bl	Lsha2_block_private
-Ldone:
-	$POP	r0,`$FRAME-$SIZE_T*21`($sp)
-	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
-	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
-	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
-	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
-	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
-	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
-	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
-	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
-	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
-	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
-	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
-	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
-	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
-	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
-	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
-	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
-	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
-	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
-	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
-	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
-	mtlr	r0
-	addi	$sp,$sp,`$FRAME+16*$SZ`
-	blr
-___
+	b	Ldone
 
-# PowerPC specification allows an implementation to be ill-behaved
-# upon unaligned access which crosses page boundary. "Better safe
-# than sorry" principle makes me treat it specially. But I don't
-# look for particular offending word, but rather for the input
-# block which crosses the boundary. Once found that block is aligned
-# and hashed separately...
-$code.=<<___;
+; PowerPC specification allows an implementation to be ill-behaved
+; upon unaligned access which crosses page boundary. "Better safe
+; than sorry" principle makes me treat it specially. But I don't
+; look for particular offending word, but rather for the input
+; block which crosses the boundary. Once found that block is aligned
+; and hashed separately...
 .align	4
 Lunaligned:
 	subfic	$t1,$inp,4096
@@ -278,7 +255,7 @@ Lunaligned:
 Lcross_page:
 	li	$t1,`16*$SZ/4`
 	mtctr	$t1
-	addi	r20,$sp,$FRAME			; aligned spot below the frame
+	addi	r20,$sp,$LOCALS			; aligned spot below the frame
 Lmemcpy:
 	lbz	r16,0($inp)
 	lbz	r17,1($inp)
@@ -293,8 +270,8 @@ Lmemcpy:
 	bdnz	Lmemcpy
 
 	$PUSH	$inp,`$FRAME-$SIZE_T*26`($sp)	; save real inp
-	addi	$t1,$sp,`$FRAME+16*$SZ`		; fictitious end pointer
-	addi	$inp,$sp,$FRAME			; fictitious inp pointer
+	addi	$t1,$sp,`$LOCALS+16*$SZ`	; fictitious end pointer
+	addi	$inp,$sp,$LOCALS		; fictitious inp pointer
 	$PUSH	$num,`$FRAME-$SIZE_T*25`($sp)	; save real num
 	$PUSH	$t1,`$FRAME-$SIZE_T*24`($sp)	; end pointer
 	$PUSH	$inp,`$FRAME-$SIZE_T*23`($sp)	; inp pointer
@@ -303,10 +280,36 @@ Lmemcpy:
 	$POP	$num,`$FRAME-$SIZE_T*25`($sp)	; restore real num
 	addic.	$num,$num,`-16*$SZ`		; num--
 	bne-	Lunaligned
-	b	Ldone
-___
 
-$code.=<<___;
+Ldone:
+	$POP	r0,`$FRAME+$LRSAVE`($sp)
+	$POP	$toc,`$FRAME-$SIZE_T*20`($sp)
+	$POP	r13,`$FRAME-$SIZE_T*19`($sp)
+	$POP	r14,`$FRAME-$SIZE_T*18`($sp)
+	$POP	r15,`$FRAME-$SIZE_T*17`($sp)
+	$POP	r16,`$FRAME-$SIZE_T*16`($sp)
+	$POP	r17,`$FRAME-$SIZE_T*15`($sp)
+	$POP	r18,`$FRAME-$SIZE_T*14`($sp)
+	$POP	r19,`$FRAME-$SIZE_T*13`($sp)
+	$POP	r20,`$FRAME-$SIZE_T*12`($sp)
+	$POP	r21,`$FRAME-$SIZE_T*11`($sp)
+	$POP	r22,`$FRAME-$SIZE_T*10`($sp)
+	$POP	r23,`$FRAME-$SIZE_T*9`($sp)
+	$POP	r24,`$FRAME-$SIZE_T*8`($sp)
+	$POP	r25,`$FRAME-$SIZE_T*7`($sp)
+	$POP	r26,`$FRAME-$SIZE_T*6`($sp)
+	$POP	r27,`$FRAME-$SIZE_T*5`($sp)
+	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
+	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
+	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
+	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
+	mtlr	r0
+	addi	$sp,$sp,$FRAME
+	blr
+	.long	0
+	.byte	0,12,4,1,0x80,18,3,0
+	.long	0
+
 .align	4
 Lsha2_block_private:
 ___
@@ -372,6 +375,8 @@ $code.=<<___;
 	$ST	$H,`7*$SZ`($ctx)
 	bne	Lsha2_block_private
 	blr
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
 ___
 
 # Ugly hack here, because PPC assembler syntax seem to vary too
@@ -379,22 +384,15 @@ ___
 $code.=<<___;
 .align	6
 LPICmeup:
-	bl	LPIC
-	addi	$Tbl,$Tbl,`64-4`	; "distance" between . and last nop
-	b	LPICedup
-	nop
-	nop
-	nop
-	nop
-	nop
-LPIC:	mflr	$Tbl
+	mflr	r0
+	bcl	20,31,\$+4
+	mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
+	addi	$Tbl,$Tbl,`64-8`
+	mtlr	r0
 	blr
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
+	.long	0
+	.byte	0,12,0x14,0,0,0,0,0
+	.space	`64-9*4`
 ___
 $code.=<<___ if ($SZ==8);
 	.long	0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
index e7ef2d5a9f..079a3fc78a 100644
--- a/src/lib/libcrypto/sha/asm/sha512-s390x.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-s390x.pl
@@ -26,6 +26,26 @@
 # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
 # than software.
 
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z900 SHA256 was measured to
+# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+	$SIZE_T=4;
+	$g="";
+} else {
+	$SIZE_T=8;
+	$g="g";
+}
+
 $t0="%r0";
 $t1="%r1";
 $ctx="%r2";	$t2="%r2";
@@ -44,7 +64,7 @@ $tbl="%r13";
 $T1="%r14";
 $sp="%r15";
 
-$output=shift;
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
 if ($output =~ /512/) {
@@ -78,7 +98,8 @@ if ($output =~ /512/) {
 }
 $Func="sha${label}_block_data_order";
 $Table="K${label}";
-$frame=160+16*$SZ;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*$SZ;
 
 sub BODY_00_15 {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
@@ -93,9 +114,9 @@ $code.=<<___;
 	xgr	$t0,$t1
 	$ROT	$t1,$t1,`$Sigma1[2]-$Sigma1[1]`
 	 xgr	$t2,$g
-	$ST	$T1,`160+$SZ*($i%16)`($sp)
+	$ST	$T1,`$stdframe+$SZ*($i%16)`($sp)
 	xgr	$t0,$t1			# Sigma1(e)
-	la	$T1,0($T1,$h)		# T1+=h
+	algr	$T1,$h			# T1+=h
 	 ngr	$t2,$e
 	 lgr	$t1,$a
 	algr	$T1,$t0			# T1+=Sigma1(e)
@@ -113,7 +134,7 @@ $code.=<<___;
 	 ngr	$t2,$b
 	algr	$h,$T1			# h+=T1
 	 ogr	$t2,$t1			# Maj(a,b,c)
-	la	$d,0($d,$T1)		# d+=T1
+	algr	$d,$T1			# d+=T1
 	algr	$h,$t2			# h+=Maj(a,b,c)
 ___
 }
@@ -122,19 +143,19 @@ sub BODY_16_XX {
 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	$LD	$T1,`160+$SZ*(($i+1)%16)`($sp)	### $i
-	$LD	$t1,`160+$SZ*(($i+14)%16)`($sp)
+	$LD	$T1,`$stdframe+$SZ*(($i+1)%16)`($sp)	### $i
+	$LD	$t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
 	$ROT	$t0,$T1,$sigma0[0]
 	$SHR	$T1,$sigma0[2]
 	$ROT	$t2,$t0,`$sigma0[1]-$sigma0[0]`
 	xgr	$T1,$t0
 	$ROT	$t0,$t1,$sigma1[0]
-	xgr	$T1,$t2				# sigma0(X[i+1])
+	xgr	$T1,$t2					# sigma0(X[i+1])
 	$SHR	$t1,$sigma1[2]
-	$ADD	$T1,`160+$SZ*($i%16)`($sp)	# +=X[i]
+	$ADD	$T1,`$stdframe+$SZ*($i%16)`($sp)	# +=X[i]
 	xgr	$t1,$t0
 	$ROT	$t0,$t0,`$sigma1[1]-$sigma1[0]`
-	$ADD	$T1,`160+$SZ*(($i+9)%16)`($sp)	# +=X[i+9]
+	$ADD	$T1,`$stdframe+$SZ*(($i+9)%16)`($sp)	# +=X[i+9]
 	xgr	$t1,$t0				# sigma1(X[i+14])
 	algr	$T1,$t1				# +=sigma1(X[i+14])
 ___
@@ -212,6 +233,7 @@ $code.=<<___;
 .globl	$Func
 .type	$Func,\@function
 $Func:
+	sllg	$len,$len,`log(16*$SZ)/log(2)`
 ___
 $code.=<<___ if ($kimdfunc);
 	larl	%r1,OPENSSL_s390xcap_P
@@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc);
 	tmhl	%r0,0x4000	# check for message-security assist
 	jz	.Lsoftware
 	lghi	%r0,0
-	la	%r1,16($sp)
+	la	%r1,`2*$SIZE_T`($sp)
 	.long	0xb93e0002	# kimd %r0,%r2
-	lg	%r0,16($sp)
+	lg	%r0,`2*$SIZE_T`($sp)
 	tmhh	%r0,`0x8000>>$kimdfunc`
 	jz	.Lsoftware
 	lghi	%r0,$kimdfunc
 	lgr	%r1,$ctx
 	lgr	%r2,$inp
-	sllg	%r3,$len,`log(16*$SZ)/log(2)`
+	lgr	%r3,$len
 	.long	0xb93e0002	# kimd %r0,%r2
 	brc	1,.-4		# pay attention to "partial completion"
 	br	%r14
@@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc);
 .Lsoftware:
 ___
 $code.=<<___;
-	sllg	$len,$len,`log(16*$SZ)/log(2)`
 	lghi	%r1,-$frame
-	agr	$len,$inp
-	stmg	$ctx,%r15,16($sp)
+	la	$len,0($len,$inp)
+	stm${g}	$ctx,%r15,`2*$SIZE_T`($sp)
 	lgr	%r0,$sp
 	la	$sp,0(%r1,$sp)
-	stg	%r0,0($sp)
+	st${g}	%r0,0($sp)
 
 	larl	$tbl,$Table
 	$LD	$A,`0*$SZ`($ctx)
@@ -265,7 +286,7 @@ $code.=<<___;
 	clgr	$len,$t0
 	jne	.Lrounds_16_xx
 
-	lg	$ctx,`$frame+16`($sp)
+	l${g}	$ctx,`$frame+2*$SIZE_T`($sp)
 	la	$inp,`16*$SZ`($inp)
 	$ADD	$A,`0*$SZ`($ctx)
 	$ADD	$B,`1*$SZ`($ctx)
@@ -283,14 +304,14 @@ $code.=<<___;
 	$ST	$F,`5*$SZ`($ctx)
 	$ST	$G,`6*$SZ`($ctx)
 	$ST	$H,`7*$SZ`($ctx)
-	clg	$inp,`$frame+32`($sp)
+	cl${g}	$inp,`$frame+4*$SIZE_T`($sp)
 	jne	.Lloop
 
-	lmg	%r6,%r15,`$frame+48`($sp)	
+	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)	
 	br	%r14
 .size	$Func,.-$Func
 .string	"SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.comm	OPENSSL_s390xcap_P,8,8
+.comm	OPENSSL_s390xcap_P,16,8
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
index ec5d78135e..585740789e 100644
--- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl
@@ -305,9 +305,9 @@ $code.=<<___;
 	srlx	@X[(($i+9)/2)%8],32,$tmp1	! X[i+9]
 	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
 	srl	@X[($i/2)%8],0,$tmp0
+	add	$tmp2,$tmp1,$tmp1
 	add	$xi,$T1,$T1			! +=X[i]
 	xor	$tmp0,@X[($i/2)%8],@X[($i/2)%8]
-	add	$tmp2,$T1,$T1
 	add	$tmp1,$T1,$T1
 
 	srl	$T1,0,$T1
@@ -318,9 +318,9 @@ ___
 $code.=<<___;
 	srlx	@X[($i/2)%8],32,$tmp1		! X[i]
 	xor	$tmp0,$tmp2,$tmp2		! sigma1(X[i+14])
-	srl	@X[($i/2)%8],0,@X[($i/2)%8]
 	add	$xi,$T1,$T1			! +=X[i+9]
-	add	$tmp2,$T1,$T1
+	add	$tmp2,$tmp1,$tmp1
+	srl	@X[($i/2)%8],0,@X[($i/2)%8]
 	add	$tmp1,$T1,$T1
 
 	sllx	$T1,32,$tmp0
diff --git a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
index e6643f8cf6..f611a2d898 100755
--- a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
+++ b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl
@@ -95,50 +95,44 @@ sub ROUND_00_15()
 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-	mov	$e,$a0
-	mov	$e,$a1
+	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
 	mov	$f,$a2
+	mov	$T1,`$SZ*($i&0xf)`(%rsp)
 
-	ror	\$$Sigma1[0],$a0
-	ror	\$$Sigma1[1],$a1
+	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
+	xor	$e,$a0
 	xor	$g,$a2			# f^g
 
-	xor	$a1,$a0
-	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a1
+	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
+	add	$h,$T1			# T1+=h
+	xor	$a,$a1
+
+	add	($Tbl,$round,$SZ),$T1	# T1+=K[round]
 	and	$e,$a2			# (f^g)&e
-	mov	$T1,`$SZ*($i&0xf)`(%rsp)
+	mov	$b,$h
 
-	xor	$a1,$a0			# Sigma1(e)
+	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
+	xor	$e,$a0
 	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
-	add	$h,$T1			# T1+=h
-
-	mov	$a,$h
-	add	$a0,$T1			# T1+=Sigma1(e)
 
+	xor	$c,$h			# b^c
+	xor	$a,$a1
 	add	$a2,$T1			# T1+=Ch(e,f,g)
-	mov	$a,$a0
-	mov	$a,$a1
+	mov	$b,$a2
 
-	ror	\$$Sigma0[0],$h
-	ror	\$$Sigma0[1],$a0
-	mov	$a,$a2
-	add	($Tbl,$round,$SZ),$T1	# T1+=K[round]
+	ror	\$$Sigma1[0],$a0	# Sigma1(e)
+	and	$a,$h			# h=(b^c)&a
+	and	$c,$a2			# b&c
 
-	xor	$a0,$h
-	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a0
-	or	$c,$a1			# a|c
+	ror	\$$Sigma0[0],$a1	# Sigma0(a)
+	add	$a0,$T1			# T1+=Sigma1(e)
+	add	$a2,$h			# h+=b&c (completes +=Maj(a,b,c)
 
-	xor	$a0,$h			# h=Sigma0(a)
-	and	$c,$a2			# a&c
 	add	$T1,$d			# d+=T1
-
-	and	$b,$a1			# (a|c)&b
 	add	$T1,$h			# h+=T1
-
-	or	$a2,$a1			# Maj(a,b,c)=((a|c)&b)|(a&c)
 	lea	1($round),$round	# round++
+	add	$a1,$h			# h+=Sigma0(a)
 
-	add	$a1,$h			# h+=Maj(a,b,c)
 ___
 }
 
@@ -147,32 +141,30 @@ sub ROUND_16_XX()
 
 $code.=<<___;
 	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
-	mov	`$SZ*(($i+14)&0xf)`(%rsp),$T1
-
-	mov	$a0,$a2
+	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a1
+	mov	$a0,$T1
+	mov	$a1,$a2
 
+	ror	\$`$sigma0[1]-$sigma0[0]`,$T1
+	xor	$a0,$T1
 	shr	\$$sigma0[2],$a0
-	ror	\$$sigma0[0],$a2
-
-	xor	$a2,$a0
-	ror	\$`$sigma0[1]-$sigma0[0]`,$a2
 
-	xor	$a2,$a0			# sigma0(X[(i+1)&0xf])
-	mov	$T1,$a1
+	ror	\$$sigma0[0],$T1
+	xor	$T1,$a0			# sigma0(X[(i+1)&0xf])
+	mov	`$SZ*(($i+9)&0xf)`(%rsp),$T1
 
-	shr	\$$sigma1[2],$T1
-	ror	\$$sigma1[0],$a1
-
-	xor	$a1,$T1
-	ror	\$`$sigma1[1]-$sigma1[0]`,$a1
-
-	xor	$a1,$T1			# sigma1(X[(i+14)&0xf])
+	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
+	xor	$a1,$a2
+	shr	\$$sigma1[2],$a1
 
+	ror	\$$sigma1[0],$a2
 	add	$a0,$T1
-
-	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
+	xor	$a2,$a1			# sigma1(X[(i+14)&0xf])
 
 	add	`$SZ*($i&0xf)`(%rsp),$T1
+	mov	$e,$a0
+	add	$a1,$T1
+	mov	$a,$a1
 ___
 	&ROUND_00_15(@_);
 }
@@ -219,6 +211,8 @@ $func:
 ___
 	for($i=0;$i<16;$i++) {
 		$code.="	mov	$SZ*$i($inp),$T1\n";
+		$code.="	mov	@ROT[4],$a0\n";
+		$code.="	mov	@ROT[0],$a1\n";
 		$code.="	bswap	$T1\n";
 		&ROUND_00_15($i,@ROT);
 		unshift(@ROT,pop(@ROT));
diff --git a/src/lib/libcrypto/sha/sha.h b/src/lib/libcrypto/sha/sha.h
index 16cacf9fc0..8a6bf4bbbb 100644
--- a/src/lib/libcrypto/sha/sha.h
+++ b/src/lib/libcrypto/sha/sha.h
@@ -106,6 +106,9 @@ typedef struct SHAstate_st
 	} SHA_CTX;
 
 #ifndef OPENSSL_NO_SHA0
+#ifdef OPENSSL_FIPS
+int private_SHA_Init(SHA_CTX *c);
+#endif
 int SHA_Init(SHA_CTX *c);
 int SHA_Update(SHA_CTX *c, const void *data, size_t len);
 int SHA_Final(unsigned char *md, SHA_CTX *c);
@@ -113,6 +116,9 @@ unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md);
 void SHA_Transform(SHA_CTX *c, const unsigned char *data);
 #endif
 #ifndef OPENSSL_NO_SHA1
+#ifdef OPENSSL_FIPS
+int private_SHA1_Init(SHA_CTX *c);
+#endif
 int SHA1_Init(SHA_CTX *c);
 int SHA1_Update(SHA_CTX *c, const void *data, size_t len);
 int SHA1_Final(unsigned char *md, SHA_CTX *c);
@@ -135,6 +141,10 @@ typedef struct SHA256state_st
 	} SHA256_CTX;
 
 #ifndef OPENSSL_NO_SHA256
+#ifdef OPENSSL_FIPS
+int private_SHA224_Init(SHA256_CTX *c);
+int private_SHA256_Init(SHA256_CTX *c);
+#endif
 int SHA224_Init(SHA256_CTX *c);
 int SHA224_Update(SHA256_CTX *c, const void *data, size_t len);
 int SHA224_Final(unsigned char *md, SHA256_CTX *c);
@@ -182,6 +192,10 @@ typedef struct SHA512state_st
 #endif
 
 #ifndef OPENSSL_NO_SHA512
+#ifdef OPENSSL_FIPS
+int private_SHA384_Init(SHA512_CTX *c);
+int private_SHA512_Init(SHA512_CTX *c);
+#endif
 int SHA384_Init(SHA512_CTX *c);
 int SHA384_Update(SHA512_CTX *c, const void *data, size_t len);
 int SHA384_Final(unsigned char *md, SHA512_CTX *c);
diff --git a/src/lib/libcrypto/sha/sha1dgst.c b/src/lib/libcrypto/sha/sha1dgst.c
index 50d1925cde..81219af088 100644
--- a/src/lib/libcrypto/sha/sha1dgst.c
+++ b/src/lib/libcrypto/sha/sha1dgst.c
@@ -57,6 +57,7 @@
  */
 
 #include <openssl/opensslconf.h>
+#include <openssl/crypto.h>
 #if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA)
 
 #undef  SHA_0
diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c
index 8952d87673..f88d3d6dad 100644
--- a/src/lib/libcrypto/sha/sha256.c
+++ b/src/lib/libcrypto/sha/sha256.c
@@ -16,7 +16,7 @@
 
 const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT;
 
-int SHA224_Init (SHA256_CTX *c)
+fips_md_init_ctx(SHA224, SHA256)
 	{
 	memset (c,0,sizeof(*c));
 	c->h[0]=0xc1059ed8UL;	c->h[1]=0x367cd507UL;
@@ -27,7 +27,7 @@ int SHA224_Init (SHA256_CTX *c)
 	return 1;
 	}
 
-int SHA256_Init (SHA256_CTX *c)
+fips_md_init(SHA256)
 	{
 	memset (c,0,sizeof(*c));
 	c->h[0]=0x6a09e667UL;	c->h[1]=0xbb67ae85UL;
diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c
index cbc0e58c48..50dd7dc744 100644
--- a/src/lib/libcrypto/sha/sha512.c
+++ b/src/lib/libcrypto/sha/sha512.c
@@ -59,21 +59,8 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
 #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA
 #endif
 
-int SHA384_Init (SHA512_CTX *c)
+fips_md_init_ctx(SHA384, SHA512)
 	{
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
-	/* maintain dword order required by assembler module */
-	unsigned int *h = (unsigned int *)c->h;
-
-	h[0]  = 0xcbbb9d5d; h[1]  = 0xc1059ed8;
-	h[2]  = 0x629a292a; h[3]  = 0x367cd507;
-	h[4]  = 0x9159015a; h[5]  = 0x3070dd17;
-	h[6]  = 0x152fecd8; h[7]  = 0xf70e5939;
-	h[8]  = 0x67332667; h[9]  = 0xffc00b31;
-	h[10] = 0x8eb44a87; h[11] = 0x68581511;
-	h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
-	h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
-#else
 	c->h[0]=U64(0xcbbb9d5dc1059ed8);
 	c->h[1]=U64(0x629a292a367cd507);
 	c->h[2]=U64(0x9159015a3070dd17);
@@ -82,27 +69,14 @@ int SHA384_Init (SHA512_CTX *c)
 	c->h[5]=U64(0x8eb44a8768581511);
 	c->h[6]=U64(0xdb0c2e0d64f98fa7);
 	c->h[7]=U64(0x47b5481dbefa4fa4);
-#endif
+
         c->Nl=0;        c->Nh=0;
         c->num=0;       c->md_len=SHA384_DIGEST_LENGTH;
         return 1;
 	}
 
-int SHA512_Init (SHA512_CTX *c)
+fips_md_init(SHA512)
 	{
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
-	/* maintain dword order required by assembler module */
-	unsigned int *h = (unsigned int *)c->h;
-
-	h[0]  = 0x6a09e667; h[1]  = 0xf3bcc908;
-	h[2]  = 0xbb67ae85; h[3]  = 0x84caa73b;
-	h[4]  = 0x3c6ef372; h[5]  = 0xfe94f82b;
-	h[6]  = 0xa54ff53a; h[7]  = 0x5f1d36f1;
-	h[8]  = 0x510e527f; h[9]  = 0xade682d1;
-	h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
-	h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
-	h[14] = 0x5be0cd19; h[15] = 0x137e2179;
-#else
 	c->h[0]=U64(0x6a09e667f3bcc908);
 	c->h[1]=U64(0xbb67ae8584caa73b);
 	c->h[2]=U64(0x3c6ef372fe94f82b);
@@ -111,7 +85,7 @@ int SHA512_Init (SHA512_CTX *c)
 	c->h[5]=U64(0x9b05688c2b3e6c1f);
 	c->h[6]=U64(0x1f83d9abfb41bd6b);
 	c->h[7]=U64(0x5be0cd19137e2179);
-#endif
+
         c->Nl=0;        c->Nh=0;
         c->num=0;       c->md_len=SHA512_DIGEST_LENGTH;
         return 1;
@@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
 
 	if (md==0) return 0;
 
-#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
-	/* recall assembler dword order... */
-	n = c->md_len;
-	if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
-		{
-		unsigned int *h = (unsigned int *)c->h, t;
-
-		for (n/=4;n;n--)
-			{
-			t = *(h++);
-			*(md++) = (unsigned char)(t>>24);
-			*(md++) = (unsigned char)(t>>16);
-			*(md++) = (unsigned char)(t>>8);
-			*(md++) = (unsigned char)(t);
-			}
-		}
-	else	return 0;
-#else
 	switch (c->md_len)
 		{
 		/* Let compiler decide if it's appropriate to unroll... */
@@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
 		/* ... as well as make sure md_len is not abused. */
 		default:	return 0;
 		}
-#endif
+
 	return 1;
 	}
 
diff --git a/src/lib/libcrypto/sha/sha_locl.h b/src/lib/libcrypto/sha/sha_locl.h
index 672c26eee1..7a0c3ca8d8 100644
--- a/src/lib/libcrypto/sha/sha_locl.h
+++ b/src/lib/libcrypto/sha/sha_locl.h
@@ -122,7 +122,11 @@ void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num);
 #define INIT_DATA_h3 0x10325476UL
 #define INIT_DATA_h4 0xc3d2e1f0UL
 
-int HASH_INIT (SHA_CTX *c)
+#ifdef SHA_0
+fips_md_init(SHA)
+#else
+fips_md_init_ctx(SHA1, SHA)
+#endif
 	{
 	memset (c,0,sizeof(*c));
 	c->h0=INIT_DATA_h0;
diff --git a/src/lib/libcrypto/sparcv9cap.c b/src/lib/libcrypto/sparcv9cap.c
index ed195ab402..43b3ac6f81 100644
--- a/src/lib/libcrypto/sparcv9cap.c
+++ b/src/lib/libcrypto/sparcv9cap.c
@@ -19,7 +19,8 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U
 	int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
 	int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
 
-	if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
+	if (num>=8 && !(num&1) &&
+	    (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) ==
 		(SPARCV9_PREFER_FPU|SPARCV9_VIS1))
 		return bn_mul_mont_fpu(rp,ap,bp,np,n0,num);
 	else
@@ -169,7 +170,6 @@ void OPENSSL_cpuid_setup(void)
 	char *e;
 	struct sigaction	common_act,ill_oact,bus_oact;
 	sigset_t		all_masked,oset;
-	int			sig;
 	static int trigger=0;
 
 	if (trigger) return;
diff --git a/src/lib/libcrypto/stack/safestack.h b/src/lib/libcrypto/stack/safestack.h
index 3e76aa58f5..ea3aa0d800 100644
--- a/src/lib/libcrypto/stack/safestack.h
+++ b/src/lib/libcrypto/stack/safestack.h
@@ -1459,6 +1459,94 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
 #define sk_POLICY_MAPPING_sort(st) SKM_sk_sort(POLICY_MAPPING, (st))
 #define sk_POLICY_MAPPING_is_sorted(st) SKM_sk_is_sorted(POLICY_MAPPING, (st))
 
+#define sk_SRP_gN_new(cmp) SKM_sk_new(SRP_gN, (cmp))
+#define sk_SRP_gN_new_null() SKM_sk_new_null(SRP_gN)
+#define sk_SRP_gN_free(st) SKM_sk_free(SRP_gN, (st))
+#define sk_SRP_gN_num(st) SKM_sk_num(SRP_gN, (st))
+#define sk_SRP_gN_value(st, i) SKM_sk_value(SRP_gN, (st), (i))
+#define sk_SRP_gN_set(st, i, val) SKM_sk_set(SRP_gN, (st), (i), (val))
+#define sk_SRP_gN_zero(st) SKM_sk_zero(SRP_gN, (st))
+#define sk_SRP_gN_push(st, val) SKM_sk_push(SRP_gN, (st), (val))
+#define sk_SRP_gN_unshift(st, val) SKM_sk_unshift(SRP_gN, (st), (val))
+#define sk_SRP_gN_find(st, val) SKM_sk_find(SRP_gN, (st), (val))
+#define sk_SRP_gN_find_ex(st, val) SKM_sk_find_ex(SRP_gN, (st), (val))
+#define sk_SRP_gN_delete(st, i) SKM_sk_delete(SRP_gN, (st), (i))
+#define sk_SRP_gN_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN, (st), (ptr))
+#define sk_SRP_gN_insert(st, val, i) SKM_sk_insert(SRP_gN, (st), (val), (i))
+#define sk_SRP_gN_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN, (st), (cmp))
+#define sk_SRP_gN_dup(st) SKM_sk_dup(SRP_gN, st)
+#define sk_SRP_gN_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN, (st), (free_func))
+#define sk_SRP_gN_shift(st) SKM_sk_shift(SRP_gN, (st))
+#define sk_SRP_gN_pop(st) SKM_sk_pop(SRP_gN, (st))
+#define sk_SRP_gN_sort(st) SKM_sk_sort(SRP_gN, (st))
+#define sk_SRP_gN_is_sorted(st) SKM_sk_is_sorted(SRP_gN, (st))
+
+#define sk_SRP_gN_cache_new(cmp) SKM_sk_new(SRP_gN_cache, (cmp))
+#define sk_SRP_gN_cache_new_null() SKM_sk_new_null(SRP_gN_cache)
+#define sk_SRP_gN_cache_free(st) SKM_sk_free(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_num(st) SKM_sk_num(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_value(st, i) SKM_sk_value(SRP_gN_cache, (st), (i))
+#define sk_SRP_gN_cache_set(st, i, val) SKM_sk_set(SRP_gN_cache, (st), (i), (val))
+#define sk_SRP_gN_cache_zero(st) SKM_sk_zero(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_push(st, val) SKM_sk_push(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_unshift(st, val) SKM_sk_unshift(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_find(st, val) SKM_sk_find(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_find_ex(st, val) SKM_sk_find_ex(SRP_gN_cache, (st), (val))
+#define sk_SRP_gN_cache_delete(st, i) SKM_sk_delete(SRP_gN_cache, (st), (i))
+#define sk_SRP_gN_cache_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN_cache, (st), (ptr))
+#define sk_SRP_gN_cache_insert(st, val, i) SKM_sk_insert(SRP_gN_cache, (st), (val), (i))
+#define sk_SRP_gN_cache_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN_cache, (st), (cmp))
+#define sk_SRP_gN_cache_dup(st) SKM_sk_dup(SRP_gN_cache, st)
+#define sk_SRP_gN_cache_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN_cache, (st), (free_func))
+#define sk_SRP_gN_cache_shift(st) SKM_sk_shift(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_pop(st) SKM_sk_pop(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_sort(st) SKM_sk_sort(SRP_gN_cache, (st))
+#define sk_SRP_gN_cache_is_sorted(st) SKM_sk_is_sorted(SRP_gN_cache, (st))
+
+#define sk_SRP_user_pwd_new(cmp) SKM_sk_new(SRP_user_pwd, (cmp))
+#define sk_SRP_user_pwd_new_null() SKM_sk_new_null(SRP_user_pwd)
+#define sk_SRP_user_pwd_free(st) SKM_sk_free(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_num(st) SKM_sk_num(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_value(st, i) SKM_sk_value(SRP_user_pwd, (st), (i))
+#define sk_SRP_user_pwd_set(st, i, val) SKM_sk_set(SRP_user_pwd, (st), (i), (val))
+#define sk_SRP_user_pwd_zero(st) SKM_sk_zero(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_push(st, val) SKM_sk_push(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_unshift(st, val) SKM_sk_unshift(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_find(st, val) SKM_sk_find(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_find_ex(st, val) SKM_sk_find_ex(SRP_user_pwd, (st), (val))
+#define sk_SRP_user_pwd_delete(st, i) SKM_sk_delete(SRP_user_pwd, (st), (i))
+#define sk_SRP_user_pwd_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_user_pwd, (st), (ptr))
+#define sk_SRP_user_pwd_insert(st, val, i) SKM_sk_insert(SRP_user_pwd, (st), (val), (i))
+#define sk_SRP_user_pwd_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_user_pwd, (st), (cmp))
+#define sk_SRP_user_pwd_dup(st) SKM_sk_dup(SRP_user_pwd, st)
+#define sk_SRP_user_pwd_pop_free(st, free_func) SKM_sk_pop_free(SRP_user_pwd, (st), (free_func))
+#define sk_SRP_user_pwd_shift(st) SKM_sk_shift(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_pop(st) SKM_sk_pop(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_sort(st) SKM_sk_sort(SRP_user_pwd, (st))
+#define sk_SRP_user_pwd_is_sorted(st) SKM_sk_is_sorted(SRP_user_pwd, (st))
+
+#define sk_SRTP_PROTECTION_PROFILE_new(cmp) SKM_sk_new(SRTP_PROTECTION_PROFILE, (cmp))
+#define sk_SRTP_PROTECTION_PROFILE_new_null() SKM_sk_new_null(SRTP_PROTECTION_PROFILE)
+#define sk_SRTP_PROTECTION_PROFILE_free(st) SKM_sk_free(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_num(st) SKM_sk_num(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_value(st, i) SKM_sk_value(SRTP_PROTECTION_PROFILE, (st), (i))
+#define sk_SRTP_PROTECTION_PROFILE_set(st, i, val) SKM_sk_set(SRTP_PROTECTION_PROFILE, (st), (i), (val))
+#define sk_SRTP_PROTECTION_PROFILE_zero(st) SKM_sk_zero(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_push(st, val) SKM_sk_push(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_unshift(st, val) SKM_sk_unshift(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_find(st, val) SKM_sk_find(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_find_ex(st, val) SKM_sk_find_ex(SRTP_PROTECTION_PROFILE, (st), (val))
+#define sk_SRTP_PROTECTION_PROFILE_delete(st, i) SKM_sk_delete(SRTP_PROTECTION_PROFILE, (st), (i))
+#define sk_SRTP_PROTECTION_PROFILE_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRTP_PROTECTION_PROFILE, (st), (ptr))
+#define sk_SRTP_PROTECTION_PROFILE_insert(st, val, i) SKM_sk_insert(SRTP_PROTECTION_PROFILE, (st), (val), (i))
+#define sk_SRTP_PROTECTION_PROFILE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRTP_PROTECTION_PROFILE, (st), (cmp))
+#define sk_SRTP_PROTECTION_PROFILE_dup(st) SKM_sk_dup(SRTP_PROTECTION_PROFILE, st)
+#define sk_SRTP_PROTECTION_PROFILE_pop_free(st, free_func) SKM_sk_pop_free(SRTP_PROTECTION_PROFILE, (st), (free_func))
+#define sk_SRTP_PROTECTION_PROFILE_shift(st) SKM_sk_shift(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_pop(st) SKM_sk_pop(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_sort(st) SKM_sk_sort(SRTP_PROTECTION_PROFILE, (st))
+#define sk_SRTP_PROTECTION_PROFILE_is_sorted(st) SKM_sk_is_sorted(SRTP_PROTECTION_PROFILE, (st))
+
 #define sk_SSL_CIPHER_new(cmp) SKM_sk_new(SSL_CIPHER, (cmp))
 #define sk_SSL_CIPHER_new_null() SKM_sk_new_null(SSL_CIPHER)
 #define sk_SSL_CIPHER_free(st) SKM_sk_free(SSL_CIPHER, (st))
@@ -2056,31 +2144,6 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
 #define sk_OPENSSL_STRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_STRING, (st))
 
 
-#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
-#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null())
-#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i))
-#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func))
-#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i)
-#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val))
-#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i))
-#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr))
-#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp)  \
-	((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \
-	sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
-#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st)
-#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st))
-#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st))
-#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st))
-
-
 #define sk_OPENSSL_BLOCK_new(cmp) ((STACK_OF(OPENSSL_BLOCK) *)sk_new(CHECKED_SK_CMP_FUNC(void, cmp)))
 #define sk_OPENSSL_BLOCK_new_null() ((STACK_OF(OPENSSL_BLOCK) *)sk_new_null())
 #define sk_OPENSSL_BLOCK_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val))
@@ -2106,6 +2169,31 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void)
 #define sk_OPENSSL_BLOCK_is_sorted(st) SKM_sk_is_sorted(OPENSSL_BLOCK, (st))
 
 
+#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
+#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null())
+#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i))
+#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func))
+#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i)
+#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val))
+#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i))
+#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr))
+#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp)  \
+	((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \
+	sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp)))
+#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st)
+#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st))
+#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st))
+#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st))
+
+
 #define d2i_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \
 	SKM_ASN1_SET_OF_d2i(ACCESS_DESCRIPTION, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) 
 #define i2d_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, i2d_func, ex_tag, ex_class, is_set) \
diff --git a/src/lib/libcrypto/ts/ts.h b/src/lib/libcrypto/ts/ts.h
index 190e8a1bf2..c2448e3c3b 100644
--- a/src/lib/libcrypto/ts/ts.h
+++ b/src/lib/libcrypto/ts/ts.h
@@ -86,9 +86,6 @@
 #include <openssl/dh.h>
 #endif
 
-#include <openssl/evp.h>
-
-
 #ifdef  __cplusplus
 extern "C" {
 #endif
diff --git a/src/lib/libcrypto/ts/ts_rsp_verify.c b/src/lib/libcrypto/ts/ts_rsp_verify.c
index e1f3b534af..afe16afbe4 100644
--- a/src/lib/libcrypto/ts/ts_rsp_verify.c
+++ b/src/lib/libcrypto/ts/ts_rsp_verify.c
@@ -614,12 +614,15 @@ static int TS_compute_imprint(BIO *data, TS_TST_INFO *tst_info,
 		goto err;
 		}
 
-	EVP_DigestInit(&md_ctx, md);
+	if (!EVP_DigestInit(&md_ctx, md))
+		goto err;
 	while ((length = BIO_read(data, buffer, sizeof(buffer))) > 0)
 		{
-		EVP_DigestUpdate(&md_ctx, buffer, length);
+		if (!EVP_DigestUpdate(&md_ctx, buffer, length))
+			goto err;
 		}
-	EVP_DigestFinal(&md_ctx, *imprint, NULL);
+	if (!EVP_DigestFinal(&md_ctx, *imprint, NULL))
+		goto err;
 
 	return 1;
  err:
diff --git a/src/lib/libcrypto/ui/ui.h b/src/lib/libcrypto/ui/ui.h
index 2b1cfa2289..bd78aa413f 100644
--- a/src/lib/libcrypto/ui/ui.h
+++ b/src/lib/libcrypto/ui/ui.h
@@ -316,7 +316,7 @@ int (*UI_method_get_writer(UI_METHOD *method))(UI*,UI_STRING*);
 int (*UI_method_get_flusher(UI_METHOD *method))(UI*);
 int (*UI_method_get_reader(UI_METHOD *method))(UI*,UI_STRING*);
 int (*UI_method_get_closer(UI_METHOD *method))(UI*);
-char* (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*);
+char * (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*);
 
 /* The following functions are helpers for method writers to access relevant
    data from a UI_STRING. */
diff --git a/src/lib/libcrypto/ui/ui_openssl.c b/src/lib/libcrypto/ui/ui_openssl.c
index 1bc25f48d5..5832a73cf5 100644
--- a/src/lib/libcrypto/ui/ui_openssl.c
+++ b/src/lib/libcrypto/ui/ui_openssl.c
@@ -122,7 +122,7 @@
  * sigaction and fileno included. -pedantic would be more appropriate for
  * the intended purposes, but we can't prevent users from adding -ansi.
  */
-#ifndef _POSIX_C_SOURCE
+#if !defined(_POSIX_C_SOURCE) && defined(OPENSSL_SYS_VMS)
 #define _POSIX_C_SOURCE 2
 #endif
 #include <signal.h>
diff --git a/src/lib/libcrypto/whrlpool/whrlpool.h b/src/lib/libcrypto/whrlpool/whrlpool.h
index 03c91da115..9e01f5b076 100644
--- a/src/lib/libcrypto/whrlpool/whrlpool.h
+++ b/src/lib/libcrypto/whrlpool/whrlpool.h
@@ -24,6 +24,9 @@ typedef struct	{
 	} WHIRLPOOL_CTX;
 
 #ifndef OPENSSL_NO_WHIRLPOOL
+#ifdef OPENSSL_FIPS
+int private_WHIRLPOOL_Init(WHIRLPOOL_CTX *c);
+#endif
 int WHIRLPOOL_Init	(WHIRLPOOL_CTX *c);
 int WHIRLPOOL_Update	(WHIRLPOOL_CTX *c,const void *inp,size_t bytes);
 void WHIRLPOOL_BitUpdate(WHIRLPOOL_CTX *c,const void *inp,size_t bits);
diff --git a/src/lib/libcrypto/whrlpool/wp_block.c b/src/lib/libcrypto/whrlpool/wp_block.c
index 221f6cc59f..824ed1827c 100644
--- a/src/lib/libcrypto/whrlpool/wp_block.c
+++ b/src/lib/libcrypto/whrlpool/wp_block.c
@@ -68,9 +68,9 @@ typedef unsigned long long	u64;
 					   CPUs this is actually faster! */
 #    endif
 #    define GO_FOR_MMX(ctx,inp,num)	do {			\
-	extern unsigned long OPENSSL_ia32cap_P;			\
+	extern unsigned int OPENSSL_ia32cap_P[];		\
 	void whirlpool_block_mmx(void *,const void *,size_t);	\
-	if (!(OPENSSL_ia32cap_P & (1<<23)))	break;		\
+	if (!(OPENSSL_ia32cap_P[0] & (1<<23)))	break;		\
         whirlpool_block_mmx(ctx->H.c,inp,num);	return;		\
 					} while (0)
 #  endif
diff --git a/src/lib/libcrypto/whrlpool/wp_dgst.c b/src/lib/libcrypto/whrlpool/wp_dgst.c
index ee5c5c1bf3..7e28bef51d 100644
--- a/src/lib/libcrypto/whrlpool/wp_dgst.c
+++ b/src/lib/libcrypto/whrlpool/wp_dgst.c
@@ -52,9 +52,10 @@
  */
 
 #include "wp_locl.h"
+#include <openssl/crypto.h>
 #include <string.h>
 
-int WHIRLPOOL_Init	(WHIRLPOOL_CTX *c)
+fips_md_init(WHIRLPOOL)
 	{
 	memset (c,0,sizeof(*c));
 	return(1);
diff --git a/src/lib/libcrypto/x509/x509.h b/src/lib/libcrypto/x509/x509.h
index e6f8a40395..092dd7450d 100644
--- a/src/lib/libcrypto/x509/x509.h
+++ b/src/lib/libcrypto/x509/x509.h
@@ -657,11 +657,15 @@ int NETSCAPE_SPKI_set_pubkey(NETSCAPE_SPKI *x, EVP_PKEY *pkey);
 
 int NETSCAPE_SPKI_print(BIO *out, NETSCAPE_SPKI *spki);
 
+int X509_signature_dump(BIO *bp,const ASN1_STRING *sig, int indent);
 int X509_signature_print(BIO *bp,X509_ALGOR *alg, ASN1_STRING *sig);
 
 int X509_sign(X509 *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx);
 int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx);
 int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md);
+int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx);
 int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md);
 
 int X509_pubkey_digest(const X509 *data,const EVP_MD *type,
@@ -763,6 +767,7 @@ X509_ALGOR *X509_ALGOR_dup(X509_ALGOR *xn);
 int X509_ALGOR_set0(X509_ALGOR *alg, ASN1_OBJECT *aobj, int ptype, void *pval);
 void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval,
 						X509_ALGOR *algor);
+void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md);
 
 X509_NAME *X509_NAME_dup(X509_NAME *xn);
 X509_NAME_ENTRY *X509_NAME_ENTRY_dup(X509_NAME_ENTRY *ne);
@@ -896,6 +901,9 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *algor1,
 int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2,
 	ASN1_BIT_STRING *signature,
 	void *data, EVP_PKEY *pkey, const EVP_MD *type);
+int ASN1_item_sign_ctx(const ASN1_ITEM *it,
+		X509_ALGOR *algor1, X509_ALGOR *algor2,
+	     	ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx);
 #endif
 
 int 		X509_set_version(X509 *x,long version);
@@ -1161,6 +1169,9 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter,
 				 unsigned char *salt, int saltlen,
 				 unsigned char *aiv, int prf_nid);
 
+X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen,
+				int prf_nid, int keylen);
+
 /* PKCS#8 utilities */
 
 DECLARE_ASN1_FUNCTIONS(PKCS8_PRIV_KEY_INFO)
diff --git a/src/lib/libcrypto/x509/x509_cmp.c b/src/lib/libcrypto/x509/x509_cmp.c
index 4bc9da07e0..7c2aaee2e9 100644
--- a/src/lib/libcrypto/x509/x509_cmp.c
+++ b/src/lib/libcrypto/x509/x509_cmp.c
@@ -87,15 +87,20 @@ unsigned long X509_issuer_and_serial_hash(X509 *a)
 	EVP_MD_CTX_init(&ctx);
 	f=X509_NAME_oneline(a->cert_info->issuer,NULL,0);
 	ret=strlen(f);
-	EVP_DigestInit_ex(&ctx, EVP_md5(), NULL);
-	EVP_DigestUpdate(&ctx,(unsigned char *)f,ret);
+	if (!EVP_DigestInit_ex(&ctx, EVP_md5(), NULL))
+		goto err;
+	if (!EVP_DigestUpdate(&ctx,(unsigned char *)f,ret))
+		goto err;
 	OPENSSL_free(f);
-	EVP_DigestUpdate(&ctx,(unsigned char *)a->cert_info->serialNumber->data,
-		(unsigned long)a->cert_info->serialNumber->length);
-	EVP_DigestFinal_ex(&ctx,&(md[0]),NULL);
+	if(!EVP_DigestUpdate(&ctx,(unsigned char *)a->cert_info->serialNumber->data,
+		(unsigned long)a->cert_info->serialNumber->length))
+		goto err;
+	if (!EVP_DigestFinal_ex(&ctx,&(md[0]),NULL))
+		goto err;
 	ret=(	((unsigned long)md[0]     )|((unsigned long)md[1]<<8L)|
 		((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
 		)&0xffffffffL;
+	err:
 	EVP_MD_CTX_cleanup(&ctx);
 	return(ret);
 	}
@@ -219,7 +224,9 @@ unsigned long X509_NAME_hash(X509_NAME *x)
 
 	/* Make sure X509_NAME structure contains valid cached encoding */
 	i2d_X509_NAME(x,NULL);
-	EVP_Digest(x->canon_enc, x->canon_enclen, md, NULL, EVP_sha1(), NULL);
+	if (!EVP_Digest(x->canon_enc, x->canon_enclen, md, NULL, EVP_sha1(),
+		NULL))
+		return 0;
 
 	ret=(	((unsigned long)md[0]     )|((unsigned long)md[1]<<8L)|
 		((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
@@ -234,12 +241,18 @@ unsigned long X509_NAME_hash(X509_NAME *x)
 
 unsigned long X509_NAME_hash_old(X509_NAME *x)
 	{
+	EVP_MD_CTX md_ctx;
 	unsigned long ret=0;
 	unsigned char md[16];
 
 	/* Make sure X509_NAME structure contains valid cached encoding */
 	i2d_X509_NAME(x,NULL);
-	EVP_Digest(x->bytes->data, x->bytes->length, md, NULL, EVP_md5(), NULL);
+	EVP_MD_CTX_init(&md_ctx);
+	EVP_MD_CTX_set_flags(&md_ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+	EVP_DigestInit_ex(&md_ctx, EVP_md5(), NULL);
+	EVP_DigestUpdate(&md_ctx, x->bytes->data, x->bytes->length);
+	EVP_DigestFinal_ex(&md_ctx,md,NULL);
+	EVP_MD_CTX_cleanup(&md_ctx);
 
 	ret=(	((unsigned long)md[0]     )|((unsigned long)md[1]<<8L)|
 		((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L)
diff --git a/src/lib/libcrypto/x509/x509_lu.c b/src/lib/libcrypto/x509/x509_lu.c
index 3a6e04a1de..38525a8cdd 100644
--- a/src/lib/libcrypto/x509/x509_lu.c
+++ b/src/lib/libcrypto/x509/x509_lu.c
@@ -87,7 +87,7 @@ void X509_LOOKUP_free(X509_LOOKUP *ctx)
 	if (ctx == NULL) return;
 	if (	(ctx->method != NULL) &&
 		(ctx->method->free != NULL))
-		ctx->method->free(ctx);
+		(*ctx->method->free)(ctx);
 	OPENSSL_free(ctx);
 	}
 
diff --git a/src/lib/libcrypto/x509/x509_vfy.c b/src/lib/libcrypto/x509/x509_vfy.c
index 701ec565e9..b0779db023 100644
--- a/src/lib/libcrypto/x509/x509_vfy.c
+++ b/src/lib/libcrypto/x509/x509_vfy.c
@@ -153,7 +153,6 @@ static int x509_subject_cmp(X509 **a, X509 **b)
 int X509_verify_cert(X509_STORE_CTX *ctx)
 	{
 	X509 *x,*xtmp,*chain_ss=NULL;
-	X509_NAME *xn;
 	int bad_chain = 0;
 	X509_VERIFY_PARAM *param = ctx->param;
 	int depth,i,ok=0;
@@ -205,7 +204,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
 		                         */
 
 		/* If we are self signed, we break */
-		xn=X509_get_issuer_name(x);
 		if (ctx->check_issued(ctx, x,x)) break;
 
 		/* If we were passed a cert chain, use it first */
@@ -242,7 +240,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
 
 	i=sk_X509_num(ctx->chain);
 	x=sk_X509_value(ctx->chain,i-1);
-	xn = X509_get_subject_name(x);
 	if (ctx->check_issued(ctx, x, x))
 		{
 		/* we have a self signed certificate */
@@ -291,7 +288,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
 		if (depth < num) break;
 
 		/* If we are self signed, we break */
-		xn=X509_get_issuer_name(x);
 		if (ctx->check_issued(ctx,x,x)) break;
 
 		ok = ctx->get_issuer(&xtmp, ctx, x);
@@ -310,7 +306,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx)
 		}
 
 	/* we now have our chain, lets check it... */
-	xn=X509_get_issuer_name(x);
 
 	/* Is last certificate looked up self signed? */
 	if (!ctx->check_issued(ctx,x,x))
diff --git a/src/lib/libcrypto/x509/x509type.c b/src/lib/libcrypto/x509/x509type.c
index 3385ad3f67..9702ec5310 100644
--- a/src/lib/libcrypto/x509/x509type.c
+++ b/src/lib/libcrypto/x509/x509type.c
@@ -100,20 +100,26 @@ int X509_certificate_type(X509 *x, EVP_PKEY *pkey)
 		break;
 		}
 
-	i=X509_get_signature_type(x);
-	switch (i)
+	i=OBJ_obj2nid(x->sig_alg->algorithm);
+	if (i && OBJ_find_sigid_algs(i, NULL, &i))
 		{
-	case EVP_PKEY_RSA:
-		ret|=EVP_PKS_RSA;
-		break;
-	case EVP_PKEY_DSA:
-		ret|=EVP_PKS_DSA;
-		break;
-	case EVP_PKEY_EC:
-		ret|=EVP_PKS_EC;
-		break;
-	default:
-		break;
+
+		switch (i)
+			{
+		case NID_rsaEncryption:
+		case NID_rsa:
+			ret|=EVP_PKS_RSA;
+			break;
+		case NID_dsa:
+		case NID_dsa_2:
+			ret|=EVP_PKS_DSA;
+			break;
+		case NID_X9_62_id_ecPublicKey:
+			ret|=EVP_PKS_EC;
+			break;
+		default:
+			break;
+			}
 		}
 
 	if (EVP_PKEY_size(pk) <= 1024/8)/* /8 because it's 1024 bits we look
diff --git a/src/lib/libcrypto/x509/x_all.c b/src/lib/libcrypto/x509/x_all.c
index 8ec88c215a..b94aeeb873 100644
--- a/src/lib/libcrypto/x509/x_all.c
+++ b/src/lib/libcrypto/x509/x_all.c
@@ -95,12 +95,25 @@ int X509_sign(X509 *x, EVP_PKEY *pkey, const EVP_MD *md)
 		x->sig_alg, x->signature, x->cert_info,pkey,md));
 	}
 
+int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx)
+	{
+	return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_CINF),
+		x->cert_info->signature,
+		x->sig_alg, x->signature, x->cert_info, ctx);
+	}
+
 int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md)
 	{
 	return(ASN1_item_sign(ASN1_ITEM_rptr(X509_REQ_INFO),x->sig_alg, NULL,
 		x->signature, x->req_info,pkey,md));
 	}
 
+int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx)
+	{
+	return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_REQ_INFO),
+		x->sig_alg, NULL, x->signature, x->req_info, ctx);
+	}
+
 int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md)
 	{
 	x->crl->enc.modified = 1;
@@ -108,6 +121,12 @@ int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md)
 		x->sig_alg, x->signature, x->crl,pkey,md));
 	}
 
+int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx)
+	{
+	return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_CRL_INFO),
+		x->crl->sig_alg, x->sig_alg, x->signature, x->crl, ctx);
+	}
+
 int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md)
 	{
 	return(ASN1_item_sign(ASN1_ITEM_rptr(NETSCAPE_SPKAC), x->sig_algor,NULL,
diff --git a/src/lib/libcrypto/x509v3/v3_skey.c b/src/lib/libcrypto/x509v3/v3_skey.c
index 202c9e4896..0a984fbaa8 100644
--- a/src/lib/libcrypto/x509v3/v3_skey.c
+++ b/src/lib/libcrypto/x509v3/v3_skey.c
@@ -129,7 +129,8 @@ static ASN1_OCTET_STRING *s2i_skey_id(X509V3_EXT_METHOD *method,
 		goto err;
 	}
 
-	EVP_Digest(pk->data, pk->length, pkey_dig, &diglen, EVP_sha1(), NULL);
+	if (!EVP_Digest(pk->data, pk->length, pkey_dig, &diglen, EVP_sha1(), NULL))
+		goto err;
 
 	if(!M_ASN1_OCTET_STRING_set(oct, pkey_dig, diglen)) {
 		X509V3err(X509V3_F_S2I_SKEY_ID,ERR_R_MALLOC_FAILURE);
diff --git a/src/lib/libcrypto/x86_64cpuid.pl b/src/lib/libcrypto/x86_64cpuid.pl
index c96821a3c8..7b7b93b223 100644
--- a/src/lib/libcrypto/x86_64cpuid.pl
+++ b/src/lib/libcrypto/x86_64cpuid.pl
@@ -7,15 +7,24 @@ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output";
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") :	# Win64 order
+				 ("%rdi","%rsi","%rdx","%rcx");	# Unix order
 
-if ($win64)	{ $arg1="%rcx"; $arg2="%rdx"; }
-else		{ $arg1="%rdi"; $arg2="%rsi"; }
 print<<___;
 .extern		OPENSSL_cpuid_setup
+.hidden		OPENSSL_cpuid_setup
 .section	.init
 	call	OPENSSL_cpuid_setup
 
+.hidden	OPENSSL_ia32cap_P
+.comm	OPENSSL_ia32cap_P,8,4
+
 .text
 
 .globl	OPENSSL_atomic_add
@@ -46,7 +55,7 @@ OPENSSL_rdtsc:
 .type	OPENSSL_ia32_cpuid,\@abi-omnipotent
 .align	16
 OPENSSL_ia32_cpuid:
-	mov	%rbx,%r8
+	mov	%rbx,%r8		# save %rbx
 
 	xor	%eax,%eax
 	cpuid
@@ -78,7 +87,15 @@ OPENSSL_ia32_cpuid:
 	# AMD specific
 	mov	\$0x80000000,%eax
 	cpuid
-	cmp	\$0x80000008,%eax
+	cmp	\$0x80000001,%eax
+	jb	.Lintel
+	mov	%eax,%r10d
+	mov	\$0x80000001,%eax
+	cpuid
+	or	%ecx,%r9d
+	and	\$0x00000801,%r9d	# isolate AMD XOP bit, 1<<11
+
+	cmp	\$0x80000008,%r10d
 	jb	.Lintel
 
 	mov	\$0x80000008,%eax
@@ -89,12 +106,12 @@ OPENSSL_ia32_cpuid:
 	mov	\$1,%eax
 	cpuid
 	bt	\$28,%edx		# test hyper-threading bit
-	jnc	.Ldone
+	jnc	.Lgeneric
 	shr	\$16,%ebx		# number of logical processors
 	cmp	%r10b,%bl
-	ja	.Ldone
+	ja	.Lgeneric
 	and	\$0xefffffff,%edx	# ~(1<<28)
-	jmp	.Ldone
+	jmp	.Lgeneric
 
 .Lintel:
 	cmp	\$4,%r11d
@@ -111,30 +128,47 @@ OPENSSL_ia32_cpuid:
 .Lnocacheinfo:
 	mov	\$1,%eax
 	cpuid
+	and	\$0xbfefffff,%edx	# force reserved bits to 0
 	cmp	\$0,%r9d
 	jne	.Lnotintel
-	or	\$0x00100000,%edx	# use reserved 20th bit to engage RC4_CHAR
+	or	\$0x40000000,%edx	# set reserved bit#30 on Intel CPUs
 	and	\$15,%ah
 	cmp	\$15,%ah		# examine Family ID
-	je	.Lnotintel
-	or	\$0x40000000,%edx	# use reserved bit to skip unrolled loop
+	jne	.Lnotintel
+	or	\$0x00100000,%edx	# set reserved bit#20 to engage RC4_CHAR
 .Lnotintel:
 	bt	\$28,%edx		# test hyper-threading bit
-	jnc	.Ldone
+	jnc	.Lgeneric
 	and	\$0xefffffff,%edx	# ~(1<<28)
 	cmp	\$0,%r10d
-	je	.Ldone
+	je	.Lgeneric
 
 	or	\$0x10000000,%edx	# 1<<28
 	shr	\$16,%ebx
 	cmp	\$1,%bl			# see if cache is shared
-	ja	.Ldone
+	ja	.Lgeneric
 	and	\$0xefffffff,%edx	# ~(1<<28)
+.Lgeneric:
+	and	\$0x00000800,%r9d	# isolate AMD XOP flag
+	and	\$0xfffff7ff,%ecx
+	or	%ecx,%r9d		# merge AMD XOP flag
+
+	mov	%edx,%r10d		# %r9d:%r10d is copy of %ecx:%edx
+	bt	\$27,%r9d		# check OSXSAVE bit
+	jnc	.Lclear_avx
+	xor	%ecx,%ecx		# XCR0
+	.byte	0x0f,0x01,0xd0		# xgetbv
+	and	\$6,%eax		# isolate XMM and YMM state support
+	cmp	\$6,%eax
+	je	.Ldone
+.Lclear_avx:
+	mov	\$0xefffe7ff,%eax	# ~(1<<28|1<<12|1<<11)
+	and	%eax,%r9d		# clear AVX, FMA and AMD XOP bits
 .Ldone:
-	shl	\$32,%rcx
-	mov	%edx,%eax
-	mov	%r8,%rbx
-	or	%rcx,%rax
+	shl	\$32,%r9
+	mov	%r10d,%eax
+	mov	%r8,%rbx		# restore %rbx
+	or	%r9,%rax
 	ret
 .size	OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
 
@@ -229,4 +263,21 @@ OPENSSL_wipe_cpu:
 .size	OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
 ___
 
+print<<___;
+.globl	OPENSSL_ia32_rdrand
+.type	OPENSSL_ia32_rdrand,\@abi-omnipotent
+.align	16
+OPENSSL_ia32_rdrand:
+	mov	\$8,%ecx
+.Loop_rdrand:
+	rdrand	%rax
+	jc	.Lbreak_rdrand
+	loop	.Loop_rdrand
+.Lbreak_rdrand:
+	cmp	\$0,%rax
+	cmove	%rcx,%rax
+	ret
+.size	OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand
+___
+
 close STDOUT;	# flush
diff --git a/src/lib/libcrypto/x86cpuid.pl b/src/lib/libcrypto/x86cpuid.pl
index a7464af19b..39fd8f2293 100644
--- a/src/lib/libcrypto/x86cpuid.pl
+++ b/src/lib/libcrypto/x86cpuid.pl
@@ -19,9 +19,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	&pushf	();
 	&pop	("eax");
 	&xor	("ecx","eax");
-	&bt	("ecx",21);
-	&jnc	(&label("done"));
 	&xor	("eax","eax");
+	&bt	("ecx",21);
+	&jnc	(&label("nocpuid"));
 	&cpuid	();
 	&mov	("edi","eax");		# max value for standard query level
 
@@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	# AMD specific
 	&mov	("eax",0x80000000);
 	&cpuid	();
-	&cmp	("eax",0x80000008);
+	&cmp	("eax",0x80000001);
+	&jb	(&label("intel"));
+	&mov	("esi","eax");
+	&mov	("eax",0x80000001);
+	&cpuid	();
+	&or	("ebp","ecx");
+	&and	("ebp",1<<11|1);	# isolate XOP bit
+	&cmp	("esi",0x80000008);
 	&jb	(&label("intel"));
 
 	&mov	("eax",0x80000008);
@@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	&mov	("eax",1);
 	&cpuid	();
 	&bt	("edx",28);
-	&jnc	(&label("done"));
+	&jnc	(&label("generic"));
 	&shr	("ebx",16);
 	&and	("ebx",0xff);
 	&cmp	("ebx","esi");
-	&ja	(&label("done"));
+	&ja	(&label("generic"));
 	&and	("edx",0xefffffff);	# clear hyper-threading bit
-	&jmp	(&label("done"));
+	&jmp	(&label("generic"));
 	
 &set_label("intel");
 	&cmp	("edi",4);
@@ -85,27 +92,51 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 &set_label("nocacheinfo");
 	&mov	("eax",1);
 	&cpuid	();
+	&and	("edx",0xbfefffff);	# force reserved bits #20, #30 to 0
 	&cmp	("ebp",0);
-	&jne	(&label("notP4"));
+	&jne	(&label("notintel"));
+	&or	("edx",1<<30);		# set reserved bit#30 on Intel CPUs
 	&and	(&HB("eax"),15);	# familiy ID
 	&cmp	(&HB("eax"),15);	# P4?
-	&jne	(&label("notP4"));
-	&or	("edx",1<<20);		# use reserved bit to engage RC4_CHAR
-&set_label("notP4");
+	&jne	(&label("notintel"));
+	&or	("edx",1<<20);		# set reserved bit#20 to engage RC4_CHAR
+&set_label("notintel");
 	&bt	("edx",28);		# test hyper-threading bit
-	&jnc	(&label("done"));
+	&jnc	(&label("generic"));
 	&and	("edx",0xefffffff);
 	&cmp	("edi",0);
-	&je	(&label("done"));
+	&je	(&label("generic"));
 
 	&or	("edx",0x10000000);
 	&shr	("ebx",16);
 	&cmp	(&LB("ebx"),1);
-	&ja	(&label("done"));
+	&ja	(&label("generic"));
 	&and	("edx",0xefffffff);	# clear hyper-threading bit if not
+
+&set_label("generic");
+	&and	("ebp",1<<11);		# isolate AMD XOP flag
+	&and	("ecx",0xfffff7ff);	# force 11th bit to 0
+	&mov	("esi","edx");
+	&or	("ebp","ecx");		# merge AMD XOP flag
+
+	&bt	("ecx",27);		# check OSXSAVE bit
+	&jnc	(&label("clear_avx"));
+	&xor	("ecx","ecx");
+	&data_byte(0x0f,0x01,0xd0);	# xgetbv
+	&and	("eax",6);
+	&cmp	("eax",6);
+	&je	(&label("done"));
+	&cmp	("eax",2);
+	&je	(&label("clear_avx"));
+&set_label("clear_xmm");
+	&and	("ebp",0xfdfffffd);	# clear AESNI and PCLMULQDQ bits
+	&and	("esi",0xfeffffff);	# clear FXSR
+&set_label("clear_avx");
+	&and	("ebp",0xefffe7ff);	# clear AVX, FMA and AMD XOP bits
 &set_label("done");
-	&mov	("eax","edx");
-	&mov	("edx","ecx");
+	&mov	("eax","esi");
+	&mov	("edx","ebp");
+&set_label("nocpuid");
 &function_end("OPENSSL_ia32_cpuid");
 
 &external_label("OPENSSL_ia32cap_P");
@@ -199,8 +230,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	&bt	(&DWP(0,"ecx"),1);
 	&jnc	(&label("no_x87"));
 	if ($sse2) {
-		&bt	(&DWP(0,"ecx"),26);
-		&jnc	(&label("no_sse2"));
+		&and	("ecx",1<<26|1<<24);	# check SSE2 and FXSR bits
+		&cmp	("ecx",1<<26|1<<24);
+		&jne	(&label("no_sse2"));
 		&pxor	("xmm0","xmm0");
 		&pxor	("xmm1","xmm1");
 		&pxor	("xmm2","xmm2");
@@ -307,6 +339,18 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	&ret	();
 &function_end_B("OPENSSL_cleanse");
 
+&function_begin_B("OPENSSL_ia32_rdrand");
+	&mov	("ecx",8);
+&set_label("loop");
+	&rdrand	("eax");
+	&jc	(&label("break"));
+	&loop	(&label("loop"));
+&set_label("break");
+	&cmp	("eax",0);
+	&cmove	("eax","ecx");
+	&ret	();
+&function_end_B("OPENSSL_ia32_rdrand");
+
 &initseg("OPENSSL_cpuid_setup");
 
 &asm_finish();
diff --git a/src/lib/libssl/bio_ssl.c b/src/lib/libssl/bio_ssl.c
index eedac8a3fc..e9552caee2 100644
--- a/src/lib/libssl/bio_ssl.c
+++ b/src/lib/libssl/bio_ssl.c
@@ -538,6 +538,7 @@ err:
 
 BIO *BIO_new_ssl_connect(SSL_CTX *ctx)
 	{
+#ifndef OPENSSL_NO_SOCK
 	BIO *ret=NULL,*con=NULL,*ssl=NULL;
 
 	if ((con=BIO_new(BIO_s_connect())) == NULL)
@@ -549,6 +550,7 @@ BIO *BIO_new_ssl_connect(SSL_CTX *ctx)
 	return(ret);
 err:
 	if (con != NULL) BIO_free(con);
+#endif
 	return(NULL);
 	}
 
diff --git a/src/lib/libssl/d1_both.c b/src/lib/libssl/d1_both.c
index 9f898d6997..de8bab873f 100644
--- a/src/lib/libssl/d1_both.c
+++ b/src/lib/libssl/d1_both.c
@@ -227,14 +227,14 @@ int dtls1_do_write(SSL *s, int type)
 	unsigned int len, frag_off, mac_size, blocksize;
 
 	/* AHA!  Figure out the MTU, and stick to the right size */
-	if ( ! (SSL_get_options(s) & SSL_OP_NO_QUERY_MTU))
+	if (s->d1->mtu < dtls1_min_mtu() && !(SSL_get_options(s) & SSL_OP_NO_QUERY_MTU))
 		{
 		s->d1->mtu = 
 			BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_QUERY_MTU, 0, NULL);
 
 		/* I've seen the kernel return bogus numbers when it doesn't know
 		 * (initial write), so just make sure we have a reasonable number */
-		if ( s->d1->mtu < dtls1_min_mtu())
+		if (s->d1->mtu < dtls1_min_mtu())
 			{
 			s->d1->mtu = 0;
 			s->d1->mtu = dtls1_guess_mtu(s->d1->mtu);
@@ -1084,7 +1084,11 @@ int dtls1_read_failed(SSL *s, int code)
 		return code;
 		}
 
-	if ( ! SSL_in_init(s))  /* done, no need to send a retransmit */
+#ifndef OPENSSL_NO_HEARTBEATS
+	if (!SSL_in_init(s) && !s->tlsext_hb_pending)  /* done, no need to send a retransmit */
+#else
+	if (!SSL_in_init(s))  /* done, no need to send a retransmit */
+#endif
 		{
 		BIO_set_flags(SSL_get_rbio(s), BIO_FLAGS_READ);
 		return code;
@@ -1417,3 +1421,171 @@ dtls1_get_ccs_header(unsigned char *data, struct ccs_header_st *ccs_hdr)
 
 	ccs_hdr->type = *(data++);
 	}
+
+int dtls1_shutdown(SSL *s)
+	{
+	int ret;
+#ifndef OPENSSL_NO_SCTP
+	if (BIO_dgram_is_sctp(SSL_get_wbio(s)) &&
+	    !(s->shutdown & SSL_SENT_SHUTDOWN))
+		{
+		ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s));
+		if (ret < 0) return -1;
+
+		if (ret == 0)
+			BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 1, NULL);
+		}
+#endif
+	ret = ssl3_shutdown(s);
+#ifndef OPENSSL_NO_SCTP
+	BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 0, NULL);
+#endif
+	return ret;
+	}
+
+#ifndef OPENSSL_NO_HEARTBEATS
+int
+dtls1_process_heartbeat(SSL *s)
+	{
+	unsigned char *p = &s->s3->rrec.data[0], *pl;
+	unsigned short hbtype;
+	unsigned int payload;
+	unsigned int padding = 16; /* Use minimum padding */
+
+	/* Read type and payload length first */
+	hbtype = *p++;
+	n2s(p, payload);
+	pl = p;
+
+	if (s->msg_callback)
+		s->msg_callback(0, s->version, TLS1_RT_HEARTBEAT,
+			&s->s3->rrec.data[0], s->s3->rrec.length,
+			s, s->msg_callback_arg);
+
+	if (hbtype == TLS1_HB_REQUEST)
+		{
+		unsigned char *buffer, *bp;
+		int r;
+
+		/* Allocate memory for the response, size is 1 byte
+		 * message type, plus 2 bytes payload length, plus
+		 * payload, plus padding
+		 */
+		buffer = OPENSSL_malloc(1 + 2 + payload + padding);
+		bp = buffer;
+
+		/* Enter response type, length and copy payload */
+		*bp++ = TLS1_HB_RESPONSE;
+		s2n(payload, bp);
+		memcpy(bp, pl, payload);
+		bp += payload;
+		/* Random padding */
+		RAND_pseudo_bytes(bp, padding);
+
+		r = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buffer, 3 + payload + padding);
+
+		if (r >= 0 && s->msg_callback)
+			s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT,
+				buffer, 3 + payload + padding,
+				s, s->msg_callback_arg);
+
+		OPENSSL_free(buffer);
+
+		if (r < 0)
+			return r;
+		}
+	else if (hbtype == TLS1_HB_RESPONSE)
+		{
+		unsigned int seq;
+
+		/* We only send sequence numbers (2 bytes unsigned int),
+		 * and 16 random bytes, so we just try to read the
+		 * sequence number */
+		n2s(pl, seq);
+
+		if (payload == 18 && seq == s->tlsext_hb_seq)
+			{
+			dtls1_stop_timer(s);
+			s->tlsext_hb_seq++;
+			s->tlsext_hb_pending = 0;
+			}
+		}
+
+	return 0;
+	}
+
+int
+dtls1_heartbeat(SSL *s)
+	{
+	unsigned char *buf, *p;
+	int ret;
+	unsigned int payload = 18; /* Sequence number + random bytes */
+	unsigned int padding = 16; /* Use minimum padding */
+
+	/* Only send if peer supports and accepts HB requests... */
+	if (!(s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED) ||
+	    s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_SEND_REQUESTS)
+		{
+		SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT);
+		return -1;
+		}
+
+	/* ...and there is none in flight yet... */
+	if (s->tlsext_hb_pending)
+		{
+		SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PENDING);
+		return -1;
+		}
+
+	/* ...and no handshake in progress. */
+	if (SSL_in_init(s) || s->in_handshake)
+		{
+		SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_UNEXPECTED_MESSAGE);
+		return -1;
+		}
+
+	/* Check if padding is too long, payload and padding
+	 * must not exceed 2^14 - 3 = 16381 bytes in total.
+	 */
+	OPENSSL_assert(payload + padding <= 16381);
+
+	/* Create HeartBeat message, we just use a sequence number
+	 * as payload to distuingish different messages and add
+	 * some random stuff.
+	 *  - Message Type, 1 byte
+	 *  - Payload Length, 2 bytes (unsigned int)
+	 *  - Payload, the sequence number (2 bytes uint)
+	 *  - Payload, random bytes (16 bytes uint)
+	 *  - Padding
+	 */
+	buf = OPENSSL_malloc(1 + 2 + payload + padding);
+	p = buf;
+	/* Message Type */
+	*p++ = TLS1_HB_REQUEST;
+	/* Payload length (18 bytes here) */
+	s2n(payload, p);
+	/* Sequence number */
+	s2n(s->tlsext_hb_seq, p);
+	/* 16 random bytes */
+	RAND_pseudo_bytes(p, 16);
+	p += 16;
+	/* Random padding */
+	RAND_pseudo_bytes(p, padding);
+
+	ret = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buf, 3 + payload + padding);
+	if (ret >= 0)
+		{
+		if (s->msg_callback)
+			s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT,
+				buf, 3 + payload + padding,
+				s, s->msg_callback_arg);
+
+		dtls1_start_timer(s);
+		s->tlsext_hb_pending = 1;
+		}
+
+	OPENSSL_free(buf);
+
+	return ret;
+	}
+#endif
diff --git a/src/lib/libssl/d1_clnt.c b/src/lib/libssl/d1_clnt.c
index 089fa4c7f8..a6ed09c51d 100644
--- a/src/lib/libssl/d1_clnt.c
+++ b/src/lib/libssl/d1_clnt.c
@@ -150,7 +150,11 @@ int dtls1_connect(SSL *s)
 	unsigned long Time=(unsigned long)time(NULL);
 	void (*cb)(const SSL *ssl,int type,int val)=NULL;
 	int ret= -1;
-	int new_state,state,skip=0;;
+	int new_state,state,skip=0;
+#ifndef OPENSSL_NO_SCTP
+	unsigned char sctpauthkey[64];
+	char labelbuffer[sizeof(DTLS1_SCTP_AUTH_LABEL)];
+#endif
 
 	RAND_add(&Time,sizeof(Time),0);
 	ERR_clear_error();
@@ -164,6 +168,27 @@ int dtls1_connect(SSL *s)
 	s->in_handshake++;
 	if (!SSL_in_init(s) || SSL_in_before(s)) SSL_clear(s); 
 
+#ifndef OPENSSL_NO_SCTP
+	/* Notify SCTP BIO socket to enter handshake
+	 * mode and prevent stream identifier other
+	 * than 0. Will be ignored if no SCTP is used.
+	 */
+	BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL);
+#endif
+
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* If we're awaiting a HeartbeatResponse, pretend we
+	 * already got and don't await it anymore, because
+	 * Heartbeats don't make sense during handshakes anyway.
+	 */
+	if (s->tlsext_hb_pending)
+		{
+		dtls1_stop_timer(s);
+		s->tlsext_hb_pending = 0;
+		s->tlsext_hb_seq++;
+		}
+#endif
+
 	for (;;)
 		{
 		state=s->state;
@@ -171,7 +196,7 @@ int dtls1_connect(SSL *s)
 		switch(s->state)
 			{
 		case SSL_ST_RENEGOTIATE:
-			s->new_session=1;
+			s->renegotiate=1;
 			s->state=SSL_ST_CONNECT;
 			s->ctx->stats.sess_connect_renegotiate++;
 			/* break */
@@ -226,6 +251,42 @@ int dtls1_connect(SSL *s)
 			s->hit = 0;
 			break;
 
+#ifndef OPENSSL_NO_SCTP
+		case DTLS1_SCTP_ST_CR_READ_SOCK:
+
+			if (BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s)))
+			{
+				s->s3->in_read_app_data=2;
+				s->rwstate=SSL_READING;
+				BIO_clear_retry_flags(SSL_get_rbio(s));
+				BIO_set_retry_read(SSL_get_rbio(s));
+				ret = -1;
+				goto end;
+			}
+
+			s->state=s->s3->tmp.next_state;
+			break;
+
+		case DTLS1_SCTP_ST_CW_WRITE_SOCK:
+			/* read app data until dry event */
+
+			ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s));
+			if (ret < 0) goto end;
+
+			if (ret == 0)
+			{
+				s->s3->in_read_app_data=2;
+				s->rwstate=SSL_READING;
+				BIO_clear_retry_flags(SSL_get_rbio(s));
+				BIO_set_retry_read(SSL_get_rbio(s));
+				ret = -1;
+				goto end;
+			}
+
+			s->state=s->d1->next_state;
+			break;
+#endif
+
 		case SSL3_ST_CW_CLNT_HELLO_A:
 		case SSL3_ST_CW_CLNT_HELLO_B:
 
@@ -248,9 +309,17 @@ int dtls1_connect(SSL *s)
 
 			s->init_num=0;
 
-			/* turn on buffering for the next lot of output */
-			if (s->bbio != s->wbio)
-				s->wbio=BIO_push(s->bbio,s->wbio);
+#ifndef OPENSSL_NO_SCTP
+			/* Disable buffering for SCTP */
+			if (!BIO_dgram_is_sctp(SSL_get_wbio(s)))
+				{
+#endif
+				/* turn on buffering for the next lot of output */
+				if (s->bbio != s->wbio)
+					s->wbio=BIO_push(s->bbio,s->wbio);
+#ifndef OPENSSL_NO_SCTP
+				}
+#endif
 
 			break;
 
@@ -260,9 +329,25 @@ int dtls1_connect(SSL *s)
 			if (ret <= 0) goto end;
 			else
 				{
-				dtls1_stop_timer(s);
 				if (s->hit)
+					{
+#ifndef OPENSSL_NO_SCTP
+					/* Add new shared key for SCTP-Auth,
+					 * will be ignored if no SCTP used.
+					 */
+					snprintf((char*) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL),
+					         DTLS1_SCTP_AUTH_LABEL);
+
+					SSL_export_keying_material(s, sctpauthkey,
+					                           sizeof(sctpauthkey), labelbuffer,
+					                           sizeof(labelbuffer), NULL, 0, 0);
+
+					BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY,
+							 sizeof(sctpauthkey), sctpauthkey);
+#endif
+
 					s->state=SSL3_ST_CR_FINISHED_A;
+					}
 				else
 					s->state=DTLS1_ST_CR_HELLO_VERIFY_REQUEST_A;
 				}
@@ -354,12 +439,20 @@ int dtls1_connect(SSL *s)
 		case SSL3_ST_CR_SRVR_DONE_B:
 			ret=ssl3_get_server_done(s);
 			if (ret <= 0) goto end;
+			dtls1_stop_timer(s);
 			if (s->s3->tmp.cert_req)
-				s->state=SSL3_ST_CW_CERT_A;
+				s->s3->tmp.next_state=SSL3_ST_CW_CERT_A;
 			else
-				s->state=SSL3_ST_CW_KEY_EXCH_A;
+				s->s3->tmp.next_state=SSL3_ST_CW_KEY_EXCH_A;
 			s->init_num=0;
 
+#ifndef OPENSSL_NO_SCTP			
+			if (BIO_dgram_is_sctp(SSL_get_wbio(s)) &&
+			    state == SSL_ST_RENEGOTIATE)
+				s->state=DTLS1_SCTP_ST_CR_READ_SOCK;
+			else
+#endif			
+			s->state=s->s3->tmp.next_state;
 			break;
 
 		case SSL3_ST_CW_CERT_A:
@@ -378,6 +471,22 @@ int dtls1_connect(SSL *s)
 			dtls1_start_timer(s);
 			ret=dtls1_send_client_key_exchange(s);
 			if (ret <= 0) goto end;
+
+#ifndef OPENSSL_NO_SCTP
+			/* Add new shared key for SCTP-Auth,
+			 * will be ignored if no SCTP used.
+			 */
+			snprintf((char*) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL),
+			         DTLS1_SCTP_AUTH_LABEL);
+
+			SSL_export_keying_material(s, sctpauthkey,
+			                           sizeof(sctpauthkey), labelbuffer,
+			                           sizeof(labelbuffer), NULL, 0, 0);
+
+			BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY,
+					 sizeof(sctpauthkey), sctpauthkey);
+#endif
+
 			/* EAY EAY EAY need to check for DH fix cert
 			 * sent back */
 			/* For TLS, cert_req is set to 2, so a cert chain
@@ -388,7 +497,15 @@ int dtls1_connect(SSL *s)
 				}
 			else
 				{
-				s->state=SSL3_ST_CW_CHANGE_A;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+					s->d1->next_state=SSL3_ST_CW_CHANGE_A;
+					s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK;
+					}
+				else
+#endif
+					s->state=SSL3_ST_CW_CHANGE_A;
 				s->s3->change_cipher_spec=0;
 				}
 
@@ -400,7 +517,15 @@ int dtls1_connect(SSL *s)
 			dtls1_start_timer(s);
 			ret=dtls1_send_client_verify(s);
 			if (ret <= 0) goto end;
-			s->state=SSL3_ST_CW_CHANGE_A;
+#ifndef OPENSSL_NO_SCTP
+			if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+			{
+				s->d1->next_state=SSL3_ST_CW_CHANGE_A;
+				s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK;
+			}
+			else
+#endif
+				s->state=SSL3_ST_CW_CHANGE_A;
 			s->init_num=0;
 			s->s3->change_cipher_spec=0;
 			break;
@@ -412,6 +537,14 @@ int dtls1_connect(SSL *s)
 			ret=dtls1_send_change_cipher_spec(s,
 				SSL3_ST_CW_CHANGE_A,SSL3_ST_CW_CHANGE_B);
 			if (ret <= 0) goto end;
+
+#ifndef OPENSSL_NO_SCTP
+			/* Change to new shared key of SCTP-Auth,
+			 * will be ignored if no SCTP used.
+			 */
+			BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY, 0, NULL);
+#endif
+
 			s->state=SSL3_ST_CW_FINISHED_A;
 			s->init_num=0;
 
@@ -457,9 +590,23 @@ int dtls1_connect(SSL *s)
 			if (s->hit)
 				{
 				s->s3->tmp.next_state=SSL_ST_OK;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+						s->d1->next_state = s->s3->tmp.next_state;
+						s->s3->tmp.next_state=DTLS1_SCTP_ST_CW_WRITE_SOCK;
+					}
+#endif
 				if (s->s3->flags & SSL3_FLAGS_DELAY_CLIENT_FINISHED)
 					{
 					s->state=SSL_ST_OK;
+#ifndef OPENSSL_NO_SCTP
+					if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+						{
+							s->d1->next_state = SSL_ST_OK;
+							s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK;
+						}
+#endif
 					s->s3->flags|=SSL3_FLAGS_POP_BUFFER;
 					s->s3->delay_buf_pop_ret=0;
 					}
@@ -508,6 +655,16 @@ int dtls1_connect(SSL *s)
 				s->state=SSL3_ST_CW_CHANGE_A;
 			else
 				s->state=SSL_ST_OK;
+
+#ifndef OPENSSL_NO_SCTP
+			if (BIO_dgram_is_sctp(SSL_get_wbio(s)) &&
+				state == SSL_ST_RENEGOTIATE)
+				{
+				s->d1->next_state=s->state;
+				s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK;
+				}
+#endif
+
 			s->init_num=0;
 			break;
 
@@ -515,6 +672,13 @@ int dtls1_connect(SSL *s)
 			s->rwstate=SSL_WRITING;
 			if (BIO_flush(s->wbio) <= 0)
 				{
+				/* If the write error was fatal, stop trying */
+				if (!BIO_should_retry(s->wbio))
+					{
+					s->rwstate=SSL_NOTHING;
+					s->state=s->s3->tmp.next_state;
+					}
+				
 				ret= -1;
 				goto end;
 				}
@@ -541,6 +705,7 @@ int dtls1_connect(SSL *s)
 			/* else do it later in ssl3_write */
 
 			s->init_num=0;
+			s->renegotiate=0;
 			s->new_session=0;
 
 			ssl_update_cache(s,SSL_SESS_CACHE_CLIENT);
@@ -587,6 +752,15 @@ int dtls1_connect(SSL *s)
 		}
 end:
 	s->in_handshake--;
+	
+#ifndef OPENSSL_NO_SCTP
+	/* Notify SCTP BIO socket to leave handshake
+	 * mode and allow stream identifier other
+	 * than 0. Will be ignored if no SCTP is used.
+	 */
+	BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL);
+#endif
+
 	if (buf != NULL)
 		BUF_MEM_free(buf);
 	if (cb != NULL)
diff --git a/src/lib/libssl/d1_enc.c b/src/lib/libssl/d1_enc.c
index becbab91c2..07a5e97ce5 100644
--- a/src/lib/libssl/d1_enc.c
+++ b/src/lib/libssl/d1_enc.c
@@ -260,7 +260,7 @@ int dtls1_enc(SSL *s, int send)
 				}
 			/* TLS 1.0 does not bound the number of padding bytes by the block size.
 			 * All of them must have value 'padding_length'. */
-			if (i > (int)rec->length)
+			if (i + bs > (int)rec->length)
 				{
 				/* Incorrect padding. SSLerr() and ssl3_alert are done
 				 * by caller: we don't want to reveal whether this is
diff --git a/src/lib/libssl/d1_lib.c b/src/lib/libssl/d1_lib.c
index c3b77c889b..f61f718183 100644
--- a/src/lib/libssl/d1_lib.c
+++ b/src/lib/libssl/d1_lib.c
@@ -82,6 +82,7 @@ SSL3_ENC_METHOD DTLSv1_enc_data={
 	TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE,
 	TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE,
 	tls1_alert_code,
+	tls1_export_keying_material,
 	};
 
 long dtls1_default_timeout(void)
@@ -291,6 +292,15 @@ const SSL_CIPHER *dtls1_get_cipher(unsigned int u)
 
 void dtls1_start_timer(SSL *s)
 	{
+#ifndef OPENSSL_NO_SCTP
+	/* Disable timer for SCTP */
+	if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+		{
+		memset(&(s->d1->next_timeout), 0, sizeof(struct timeval));
+		return;
+		}
+#endif
+
 	/* If timer is not set, initialize duration with 1 second */
 	if (s->d1->next_timeout.tv_sec == 0 && s->d1->next_timeout.tv_usec == 0)
 		{
@@ -381,6 +391,7 @@ void dtls1_double_timeout(SSL *s)
 void dtls1_stop_timer(SSL *s)
 	{
 	/* Reset everything */
+	memset(&(s->d1->timeout), 0, sizeof(struct dtls1_timeout_st));
 	memset(&(s->d1->next_timeout), 0, sizeof(struct timeval));
 	s->d1->timeout_duration = 1;
 	BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT, 0, &(s->d1->next_timeout));
@@ -388,10 +399,28 @@ void dtls1_stop_timer(SSL *s)
 	dtls1_clear_record_buffer(s);
 	}
 
-int dtls1_handle_timeout(SSL *s)
+int dtls1_check_timeout_num(SSL *s)
 	{
-	DTLS1_STATE *state;
+	s->d1->timeout.num_alerts++;
+
+	/* Reduce MTU after 2 unsuccessful retransmissions */
+	if (s->d1->timeout.num_alerts > 2)
+		{
+		s->d1->mtu = BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_GET_FALLBACK_MTU, 0, NULL);		
+		}
 
+	if (s->d1->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT)
+		{
+		/* fail the connection, enough alerts have been sent */
+		SSLerr(SSL_F_DTLS1_CHECK_TIMEOUT_NUM,SSL_R_READ_TIMEOUT_EXPIRED);
+		return -1;
+		}
+
+	return 0;
+	}
+
+int dtls1_handle_timeout(SSL *s)
+	{
 	/* if no timer is expired, don't do anything */
 	if (!dtls1_is_timer_expired(s))
 		{
@@ -399,20 +428,23 @@ int dtls1_handle_timeout(SSL *s)
 		}
 
 	dtls1_double_timeout(s);
-	state = s->d1;
-	state->timeout.num_alerts++;
-	if ( state->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT)
-		{
-		/* fail the connection, enough alerts have been sent */
-		SSLerr(SSL_F_DTLS1_HANDLE_TIMEOUT,SSL_R_READ_TIMEOUT_EXPIRED);
+
+	if (dtls1_check_timeout_num(s) < 0)
 		return -1;
+
+	s->d1->timeout.read_timeouts++;
+	if (s->d1->timeout.read_timeouts > DTLS1_TMO_READ_COUNT)
+		{
+		s->d1->timeout.read_timeouts = 1;
 		}
 
-	state->timeout.read_timeouts++;
-	if ( state->timeout.read_timeouts > DTLS1_TMO_READ_COUNT)
+#ifndef OPENSSL_NO_HEARTBEATS
+	if (s->tlsext_hb_pending)
 		{
-		state->timeout.read_timeouts = 1;
+		s->tlsext_hb_pending = 0;
+		return dtls1_heartbeat(s);
 		}
+#endif
 
 	dtls1_start_timer(s);
 	return dtls1_retransmit_buffered_messages(s);
diff --git a/src/lib/libssl/d1_pkt.c b/src/lib/libssl/d1_pkt.c
index e0c0f0cc9a..987af60835 100644
--- a/src/lib/libssl/d1_pkt.c
+++ b/src/lib/libssl/d1_pkt.c
@@ -179,7 +179,6 @@ static int dtls1_record_needs_buffering(SSL *s, SSL3_RECORD *rr,
 static int dtls1_buffer_record(SSL *s, record_pqueue *q,
 	unsigned char *priority);
 static int dtls1_process_record(SSL *s);
-static void dtls1_clear_timeouts(SSL *s);
 
 /* copy buffered record into SSL structure */
 static int
@@ -232,6 +231,14 @@ dtls1_buffer_record(SSL *s, record_pqueue *queue, unsigned char *priority)
 
 	item->data = rdata;
 
+#ifndef OPENSSL_NO_SCTP
+	/* Store bio_dgram_sctp_rcvinfo struct */
+	if (BIO_dgram_is_sctp(SSL_get_rbio(s)) &&
+	    (s->state == SSL3_ST_SR_FINISHED_A || s->state == SSL3_ST_CR_FINISHED_A)) {
+		BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SCTP_GET_RCVINFO, sizeof(rdata->recordinfo), &rdata->recordinfo);
+	}
+#endif
+
 	/* insert should not fail, since duplicates are dropped */
 	if (pqueue_insert(queue->q, item) == NULL)
 		{
@@ -376,6 +383,7 @@ dtls1_process_record(SSL *s)
 	unsigned int mac_size;
 	unsigned char md[EVP_MAX_MD_SIZE];
 	int decryption_failed_or_bad_record_mac = 0;
+	unsigned char *mac = NULL;
 
 
 	rr= &(s->s3->rrec);
@@ -447,19 +455,15 @@ printf("\n");
 #endif			
 			}
 		/* check the MAC for rr->input (it's in mac_size bytes at the tail) */
-		if (rr->length < mac_size)
+		if (rr->length >= mac_size)
 			{
-#if 0 /* OK only for stream ciphers */
-			al=SSL_AD_DECODE_ERROR;
-			SSLerr(SSL_F_DTLS1_PROCESS_RECORD,SSL_R_LENGTH_TOO_SHORT);
-			goto f_err;
-#else
-			decryption_failed_or_bad_record_mac = 1;
-#endif
+			rr->length -= mac_size;
+			mac = &rr->data[rr->length];
 			}
-		rr->length-=mac_size;
+		else
+			rr->length = 0;
 		i=s->method->ssl3_enc->mac(s,md,0);
-		if (i < 0 || memcmp(md,&(rr->data[rr->length]),mac_size) != 0)
+		if (i < 0 || mac == NULL || memcmp(md, mac, mac_size) != 0)
 			{
 			decryption_failed_or_bad_record_mac = 1;
 			}
@@ -644,20 +648,28 @@ again:
 		goto again;   /* get another record */
 		}
 
-	/* Check whether this is a repeat, or aged record.
-	 * Don't check if we're listening and this message is
-	 * a ClientHello. They can look as if they're replayed,
-	 * since they arrive from different connections and
-	 * would be dropped unnecessarily.
-	 */
-	if (!(s->d1->listen && rr->type == SSL3_RT_HANDSHAKE &&
-		*p == SSL3_MT_CLIENT_HELLO) &&
-		!dtls1_record_replay_check(s, bitmap))
-		{
-		rr->length = 0;
-		s->packet_length=0; /* dump this record */
-		goto again;     /* get another record */
-		}
+#ifndef OPENSSL_NO_SCTP
+	/* Only do replay check if no SCTP bio */
+	if (!BIO_dgram_is_sctp(SSL_get_rbio(s)))
+  		{
+#endif
+		/* Check whether this is a repeat, or aged record.
+		 * Don't check if we're listening and this message is
+		 * a ClientHello. They can look as if they're replayed,
+		 * since they arrive from different connections and
+		 * would be dropped unnecessarily.
+		 */
+		if (!(s->d1->listen && rr->type == SSL3_RT_HANDSHAKE &&
+		    *p == SSL3_MT_CLIENT_HELLO) &&
+		    !dtls1_record_replay_check(s, bitmap))
+			{
+			rr->length = 0;
+			s->packet_length=0; /* dump this record */
+			goto again;     /* get another record */
+			}
+#ifndef OPENSSL_NO_SCTP
+  		}
+#endif
 
 	/* just read a 0 length packet */
 	if (rr->length == 0) goto again;
@@ -685,7 +697,6 @@ again:
 		goto again;   /* get another record */
 		}
 
-	dtls1_clear_timeouts(s);  /* done waiting */
 	return(1);
 
 	}
@@ -743,7 +754,17 @@ int dtls1_read_bytes(SSL *s, int type, unsigned char *buf, int len, int peek)
 
 	/* Now s->d1->handshake_fragment_len == 0 if type == SSL3_RT_HANDSHAKE. */
 
+#ifndef OPENSSL_NO_SCTP
+	/* Continue handshake if it had to be interrupted to read
+	 * app data with SCTP.
+	 */
+	if ((!s->in_handshake && SSL_in_init(s)) ||
+	    (BIO_dgram_is_sctp(SSL_get_rbio(s)) &&
+	     (s->state == DTLS1_SCTP_ST_SR_READ_SOCK || s->state == DTLS1_SCTP_ST_CR_READ_SOCK) &&
+	     s->s3->in_read_app_data != 2))
+#else
 	if (!s->in_handshake && SSL_in_init(s))
+#endif
 		{
 		/* type == SSL3_RT_APPLICATION_DATA */
 		i=s->handshake_func(s);
@@ -774,6 +795,15 @@ start:
 		item = pqueue_pop(s->d1->buffered_app_data.q);
 		if (item)
 			{
+#ifndef OPENSSL_NO_SCTP
+			/* Restore bio_dgram_sctp_rcvinfo struct */
+			if (BIO_dgram_is_sctp(SSL_get_rbio(s)))
+				{
+				DTLS1_RECORD_DATA *rdata = (DTLS1_RECORD_DATA *) item->data;
+				BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SCTP_SET_RCVINFO, sizeof(rdata->recordinfo), &rdata->recordinfo);
+				}
+#endif
+
 			dtls1_copy_record(s, item);
 
 			OPENSSL_free(item->data);
@@ -856,6 +886,31 @@ start:
 				rr->off=0;
 				}
 			}
+
+#ifndef OPENSSL_NO_SCTP
+			/* We were about to renegotiate but had to read
+			 * belated application data first, so retry.
+			 */
+			if (BIO_dgram_is_sctp(SSL_get_rbio(s)) &&
+			    rr->type == SSL3_RT_APPLICATION_DATA &&
+			    (s->state == DTLS1_SCTP_ST_SR_READ_SOCK || s->state == DTLS1_SCTP_ST_CR_READ_SOCK))
+				{
+				s->rwstate=SSL_READING;
+				BIO_clear_retry_flags(SSL_get_rbio(s));
+				BIO_set_retry_read(SSL_get_rbio(s));
+				}
+
+			/* We might had to delay a close_notify alert because
+			 * of reordered app data. If there was an alert and there
+			 * is no message to read anymore, finally set shutdown.
+			 */
+			if (BIO_dgram_is_sctp(SSL_get_rbio(s)) &&
+			    s->d1->shutdown_received && !BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s)))
+				{
+				s->shutdown |= SSL_RECEIVED_SHUTDOWN;
+				return(0);
+				}
+#endif			
 		return(n);
 		}
 
@@ -883,6 +938,19 @@ start:
 			dest = s->d1->alert_fragment;
 			dest_len = &s->d1->alert_fragment_len;
 			}
+#ifndef OPENSSL_NO_HEARTBEATS
+		else if (rr->type == TLS1_RT_HEARTBEAT)
+			{
+			dtls1_process_heartbeat(s);
+
+			/* Exit and notify application to read again */
+			rr->length = 0;
+			s->rwstate=SSL_READING;
+			BIO_clear_retry_flags(SSL_get_rbio(s));
+			BIO_set_retry_read(SSL_get_rbio(s));
+			return(-1);
+			}
+#endif
 		/* else it's a CCS message, or application data or wrong */
 		else if (rr->type != SSL3_RT_CHANGE_CIPHER_SPEC)
 			{
@@ -966,6 +1034,7 @@ start:
 			!(s->s3->flags & SSL3_FLAGS_NO_RENEGOTIATE_CIPHERS) &&
 			!s->s3->renegotiate)
 			{
+			s->new_session = 1;
 			ssl3_renegotiate(s);
 			if (ssl3_renegotiate_check(s))
 				{
@@ -1027,6 +1096,21 @@ start:
 			s->s3->warn_alert = alert_descr;
 			if (alert_descr == SSL_AD_CLOSE_NOTIFY)
 				{
+#ifndef OPENSSL_NO_SCTP
+				/* With SCTP and streams the socket may deliver app data
+				 * after a close_notify alert. We have to check this
+				 * first so that nothing gets discarded.
+				 */
+				if (BIO_dgram_is_sctp(SSL_get_rbio(s)) &&
+					BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s)))
+					{
+					s->d1->shutdown_received = 1;
+					s->rwstate=SSL_READING;
+					BIO_clear_retry_flags(SSL_get_rbio(s));
+					BIO_set_retry_read(SSL_get_rbio(s));
+					return -1;
+					}
+#endif
 				s->shutdown |= SSL_RECEIVED_SHUTDOWN;
 				return(0);
 				}
@@ -1133,6 +1217,15 @@ start:
 		if (s->version == DTLS1_BAD_VER)
 			s->d1->handshake_read_seq++;
 
+#ifndef OPENSSL_NO_SCTP
+		/* Remember that a CCS has been received,
+		 * so that an old key of SCTP-Auth can be
+		 * deleted when a CCS is sent. Will be ignored
+		 * if no SCTP is used
+		 */
+		BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD, 1, NULL);
+#endif
+
 		goto start;
 		}
 
@@ -1155,6 +1248,9 @@ start:
 		 */
 		if (msg_hdr.type == SSL3_MT_FINISHED)
 			{
+			if (dtls1_check_timeout_num(s) < 0)
+				return -1;
+
 			dtls1_retransmit_buffered_messages(s);
 			rr->length = 0;
 			goto start;
@@ -1172,6 +1268,7 @@ start:
 #else
 			s->state = s->server ? SSL_ST_ACCEPT : SSL_ST_CONNECT;
 #endif
+			s->renegotiate=1;
 			s->new_session=1;
 			}
 		i=s->handshake_func(s);
@@ -1268,7 +1365,16 @@ dtls1_write_app_data_bytes(SSL *s, int type, const void *buf_, int len)
 	{
 	int i;
 
-	if (SSL_in_init(s) && !s->in_handshake)
+#ifndef OPENSSL_NO_SCTP
+		/* Check if we have to continue an interrupted handshake
+		 * for reading belated app data with SCTP.
+		 */
+		if ((SSL_in_init(s) && !s->in_handshake) ||
+		    (BIO_dgram_is_sctp(SSL_get_wbio(s)) &&
+		     (s->state == DTLS1_SCTP_ST_SR_READ_SOCK || s->state == DTLS1_SCTP_ST_CR_READ_SOCK)))
+#else
+		if (SSL_in_init(s) && !s->in_handshake)
+#endif
 		{
 		i=s->handshake_func(s);
 		if (i < 0) return(i);
@@ -1768,10 +1874,3 @@ dtls1_reset_seq_numbers(SSL *s, int rw)
 
 	memset(seq, 0x00, seq_bytes);
 	}
-
-
-static void
-dtls1_clear_timeouts(SSL *s)
-	{
-	memset(&(s->d1->timeout), 0x00, sizeof(struct dtls1_timeout_st));
-	}
diff --git a/src/lib/libssl/d1_srtp.c b/src/lib/libssl/d1_srtp.c
new file mode 100644
index 0000000000..928935bd8b
--- /dev/null
+++ b/src/lib/libssl/d1_srtp.c
@@ -0,0 +1,493 @@
+/* ssl/t1_lib.c */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/*
+  DTLS code by Eric Rescorla <ekr@rtfm.com>
+
+  Copyright (C) 2006, Network Resonance, Inc.
+  Copyright (C) 2011, RTFM, Inc.
+*/
+
+#ifndef OPENSSL_NO_SRTP
+
+#include <stdio.h>
+#include <openssl/objects.h>
+#include "ssl_locl.h"
+#include "srtp.h"
+
+
+static SRTP_PROTECTION_PROFILE srtp_known_profiles[]=
+    {
+    {
+    "SRTP_AES128_CM_SHA1_80",
+    SRTP_AES128_CM_SHA1_80,
+    },
+    {
+    "SRTP_AES128_CM_SHA1_32",
+    SRTP_AES128_CM_SHA1_32,
+    },
+#if 0
+    {
+    "SRTP_NULL_SHA1_80",
+    SRTP_NULL_SHA1_80,
+    },
+    {
+    "SRTP_NULL_SHA1_32",
+    SRTP_NULL_SHA1_32,
+    },
+#endif
+    {0}
+    };
+
+static int find_profile_by_name(char *profile_name,
+				SRTP_PROTECTION_PROFILE **pptr,unsigned len)
+	{
+	SRTP_PROTECTION_PROFILE *p;
+
+	p=srtp_known_profiles;
+	while(p->name)
+		{
+		if((len == strlen(p->name)) && !strncmp(p->name,profile_name,
+							len))
+			{
+			*pptr=p;
+			return 0;
+			}
+
+		p++;
+		}
+
+	return 1;
+	}
+
+static int find_profile_by_num(unsigned profile_num,
+			       SRTP_PROTECTION_PROFILE **pptr)
+	{
+	SRTP_PROTECTION_PROFILE *p;
+
+	p=srtp_known_profiles;
+	while(p->name)
+		{
+		if(p->id == profile_num)
+			{
+			*pptr=p;
+			return 0;
+			}
+		p++;
+		}
+
+	return 1;
+	}
+
+static int ssl_ctx_make_profiles(const char *profiles_string,STACK_OF(SRTP_PROTECTION_PROFILE) **out)
+	{
+	STACK_OF(SRTP_PROTECTION_PROFILE) *profiles;
+
+	char *col;
+	char *ptr=(char *)profiles_string;
+    
+	SRTP_PROTECTION_PROFILE *p;
+
+	if(!(profiles=sk_SRTP_PROTECTION_PROFILE_new_null()))
+		{
+		SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES, SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES);
+		return 1;
+		}
+    
+	do
+		{
+		col=strchr(ptr,':');
+
+		if(!find_profile_by_name(ptr,&p,
+					 col ? col-ptr : (int)strlen(ptr)))
+			{
+			sk_SRTP_PROTECTION_PROFILE_push(profiles,p);
+			}
+		else
+			{
+			SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES,SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE);
+			return 1;
+			}
+
+		if(col) ptr=col+1;
+		} while (col);
+
+	*out=profiles;
+    
+	return 0;
+	}
+    
+int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx,const char *profiles)
+	{
+	return ssl_ctx_make_profiles(profiles,&ctx->srtp_profiles);
+	}
+
+int SSL_set_tlsext_use_srtp(SSL *s,const char *profiles)
+	{
+	return ssl_ctx_make_profiles(profiles,&s->srtp_profiles);
+	}
+
+
+STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *s)
+	{
+	if(s != NULL)
+		{
+		if(s->srtp_profiles != NULL)
+			{
+			return s->srtp_profiles;
+			}
+		else if((s->ctx != NULL) &&
+			(s->ctx->srtp_profiles != NULL))
+			{
+			return s->ctx->srtp_profiles;
+			}
+		}
+
+	return NULL;
+	}
+
+SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s)
+	{
+	return s->srtp_profile;
+	}
+
+/* Note: this function returns 0 length if there are no 
+   profiles specified */
+int ssl_add_clienthello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen)
+	{
+	int ct=0;
+	int i;
+	STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0;
+	SRTP_PROTECTION_PROFILE *prof;
+    
+	clnt=SSL_get_srtp_profiles(s);    
+	ct=sk_SRTP_PROTECTION_PROFILE_num(clnt); /* -1 if clnt == 0 */
+
+	if(p)
+		{
+		if(ct==0)
+			{
+			SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST);
+			return 1;
+			}
+
+		if((2 + ct*2 + 1) > maxlen)
+			{
+			SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG);
+			return 1;
+			}
+
+                /* Add the length */
+                s2n(ct * 2, p);
+		for(i=0;i<ct;i++)
+			{
+			prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i);
+			s2n(prof->id,p);
+			}
+
+                /* Add an empty use_mki value */
+                *p++ = 0;
+		}
+
+	*len=2 + ct*2 + 1;
+    
+	return 0;
+	}
+
+
+int ssl_parse_clienthello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al)
+	{
+	SRTP_PROTECTION_PROFILE *cprof,*sprof;
+	STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0,*srvr;
+        int ct;
+        int mki_len;
+	int i,j;
+	int id;
+	int ret;
+
+         /* Length value + the MKI length */
+        if(len < 3)
+		{            
+		SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+                }
+
+        /* Pull off the length of the cipher suite list */
+        n2s(d, ct);
+        len -= 2;
+        
+        /* Check that it is even */
+	if(ct%2)
+		{
+		SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+        
+        /* Check that lengths are consistent */
+	if(len < (ct + 1)) 
+		{
+		SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+
+        
+	clnt=sk_SRTP_PROTECTION_PROFILE_new_null();
+
+	while(ct)
+		{
+		n2s(d,id);
+		ct-=2;
+                len-=2;
+
+		if(!find_profile_by_num(id,&cprof))
+			{
+			sk_SRTP_PROTECTION_PROFILE_push(clnt,cprof);
+			}
+		else
+			{
+			; /* Ignore */
+			}
+		}
+
+        /* Now extract the MKI value as a sanity check, but discard it for now */
+        mki_len = *d;
+        d++; len--;
+
+        if (mki_len != len)
+		{
+		SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+
+	srvr=SSL_get_srtp_profiles(s);
+
+	/* Pick our most preferred profile. If no profiles have been
+	 configured then the outer loop doesn't run 
+	 (sk_SRTP_PROTECTION_PROFILE_num() = -1)
+	 and so we just return without doing anything */
+	for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(srvr);i++)
+		{
+		sprof=sk_SRTP_PROTECTION_PROFILE_value(srvr,i);
+
+		for(j=0;j<sk_SRTP_PROTECTION_PROFILE_num(clnt);j++)
+			{
+			cprof=sk_SRTP_PROTECTION_PROFILE_value(clnt,j);
+            
+			if(cprof->id==sprof->id)
+				{
+				s->srtp_profile=sprof;
+				*al=0;
+				ret=0;
+				goto done;
+				}
+			}
+		}
+
+	ret=0;
+    
+done:
+	if(clnt) sk_SRTP_PROTECTION_PROFILE_free(clnt);
+
+	return ret;
+	}
+
+int ssl_add_serverhello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen)
+	{
+	if(p)
+		{
+		if(maxlen < 5)
+			{
+			SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG);
+			return 1;
+			}
+
+		if(s->srtp_profile==0)
+			{
+			SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_USE_SRTP_NOT_NEGOTIATED);
+			return 1;
+			}
+                s2n(2, p);
+		s2n(s->srtp_profile->id,p);
+                *p++ = 0;
+		}
+	*len=5;
+    
+	return 0;
+	}
+    
+
+int ssl_parse_serverhello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al)
+	{
+	unsigned id;
+	int i;
+        int ct;
+
+	STACK_OF(SRTP_PROTECTION_PROFILE) *clnt;
+	SRTP_PROTECTION_PROFILE *prof;
+
+	if(len!=5)
+		{
+		SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+
+        n2s(d, ct);
+	if(ct!=2)
+		{
+		SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+
+	n2s(d,id);
+        if (*d)  /* Must be no MKI, since we never offer one */
+		{
+		SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE);
+		*al=SSL_AD_ILLEGAL_PARAMETER;
+		return 1;
+		}
+
+	clnt=SSL_get_srtp_profiles(s);
+
+	/* Throw an error if the server gave us an unsolicited extension */
+	if (clnt == NULL)
+		{
+		SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_NO_SRTP_PROFILES);
+		*al=SSL_AD_DECODE_ERROR;
+		return 1;
+		}
+    
+	/* Check to see if the server gave us something we support
+	   (and presumably offered)
+	*/
+	for(i=0;i<sk_SRTP_PROTECTION_PROFILE_num(clnt);i++)
+		{
+		prof=sk_SRTP_PROTECTION_PROFILE_value(clnt,i);
+	    
+		if(prof->id == id)
+			{
+			s->srtp_profile=prof;
+			*al=0;
+			return 0;
+			}
+		}
+
+	SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST);
+	*al=SSL_AD_DECODE_ERROR;
+	return 1;
+	}
+
+
+#endif
diff --git a/src/lib/libssl/d1_srvr.c b/src/lib/libssl/d1_srvr.c
index 149983be30..29421da9aa 100644
--- a/src/lib/libssl/d1_srvr.c
+++ b/src/lib/libssl/d1_srvr.c
@@ -151,6 +151,10 @@ int dtls1_accept(SSL *s)
 	int ret= -1;
 	int new_state,state,skip=0;
 	int listen;
+#ifndef OPENSSL_NO_SCTP
+	unsigned char sctpauthkey[64];
+	char labelbuffer[sizeof(DTLS1_SCTP_AUTH_LABEL)];
+#endif
 
 	RAND_add(&Time,sizeof(Time),0);
 	ERR_clear_error();
@@ -168,6 +172,13 @@ int dtls1_accept(SSL *s)
 	if (!SSL_in_init(s) || SSL_in_before(s)) SSL_clear(s);
 
 	s->d1->listen = listen;
+#ifndef OPENSSL_NO_SCTP
+	/* Notify SCTP BIO socket to enter handshake
+	 * mode and prevent stream identifier other
+	 * than 0. Will be ignored if no SCTP is used.
+	 */
+	BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL);
+#endif
 
 	if (s->cert == NULL)
 		{
@@ -175,6 +186,19 @@ int dtls1_accept(SSL *s)
 		return(-1);
 		}
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* If we're awaiting a HeartbeatResponse, pretend we
+	 * already got and don't await it anymore, because
+	 * Heartbeats don't make sense during handshakes anyway.
+	 */
+	if (s->tlsext_hb_pending)
+		{
+		dtls1_stop_timer(s);
+		s->tlsext_hb_pending = 0;
+		s->tlsext_hb_seq++;
+		}
+#endif
+
 	for (;;)
 		{
 		state=s->state;
@@ -182,7 +206,7 @@ int dtls1_accept(SSL *s)
 		switch (s->state)
 			{
 		case SSL_ST_RENEGOTIATE:
-			s->new_session=1;
+			s->renegotiate=1;
 			/* s->state=SSL_ST_ACCEPT; */
 
 		case SSL_ST_BEFORE:
@@ -227,8 +251,12 @@ int dtls1_accept(SSL *s)
 				{
 				/* Ok, we now need to push on a buffering BIO so that
 				 * the output is sent in a way that TCP likes :-)
+				 * ...but not with SCTP :-)
 				 */
-				if (!ssl_init_wbio_buffer(s,1)) { ret= -1; goto end; }
+#ifndef OPENSSL_NO_SCTP
+				if (!BIO_dgram_is_sctp(SSL_get_wbio(s)))
+#endif
+					if (!ssl_init_wbio_buffer(s,1)) { ret= -1; goto end; }
 
 				ssl3_init_finished_mac(s);
 				s->state=SSL3_ST_SR_CLNT_HELLO_A;
@@ -313,25 +341,75 @@ int dtls1_accept(SSL *s)
 				ssl3_init_finished_mac(s);
 			break;
 			
+#ifndef OPENSSL_NO_SCTP
+		case DTLS1_SCTP_ST_SR_READ_SOCK:
+			
+			if (BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s)))		
+				{
+				s->s3->in_read_app_data=2;
+				s->rwstate=SSL_READING;
+				BIO_clear_retry_flags(SSL_get_rbio(s));
+				BIO_set_retry_read(SSL_get_rbio(s));
+				ret = -1;
+				goto end;
+				}
+			
+			s->state=SSL3_ST_SR_FINISHED_A;
+			break;
+			
+		case DTLS1_SCTP_ST_SW_WRITE_SOCK:
+			ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s));
+			if (ret < 0) goto end;
+			
+			if (ret == 0)
+				{
+				if (s->d1->next_state != SSL_ST_OK)
+					{
+					s->s3->in_read_app_data=2;
+					s->rwstate=SSL_READING;
+					BIO_clear_retry_flags(SSL_get_rbio(s));
+					BIO_set_retry_read(SSL_get_rbio(s));
+					ret = -1;
+					goto end;
+					}
+				}
+
+			s->state=s->d1->next_state;
+			break;
+#endif
+
 		case SSL3_ST_SW_SRVR_HELLO_A:
 		case SSL3_ST_SW_SRVR_HELLO_B:
-			s->new_session = 2;
+			s->renegotiate = 2;
 			dtls1_start_timer(s);
 			ret=dtls1_send_server_hello(s);
 			if (ret <= 0) goto end;
 
-#ifndef OPENSSL_NO_TLSEXT
 			if (s->hit)
 				{
+#ifndef OPENSSL_NO_SCTP
+				/* Add new shared key for SCTP-Auth,
+				 * will be ignored if no SCTP used.
+				 */
+				snprintf((char*) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL),
+				         DTLS1_SCTP_AUTH_LABEL);
+
+				SSL_export_keying_material(s, sctpauthkey,
+				                           sizeof(sctpauthkey), labelbuffer,
+				                           sizeof(labelbuffer), NULL, 0, 0);
+				
+				BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY,
+                         sizeof(sctpauthkey), sctpauthkey);
+#endif
+#ifndef OPENSSL_NO_TLSEXT
 				if (s->tlsext_ticket_expected)
 					s->state=SSL3_ST_SW_SESSION_TICKET_A;
 				else
 					s->state=SSL3_ST_SW_CHANGE_A;
-				}
 #else
-			if (s->hit)
-					s->state=SSL3_ST_SW_CHANGE_A;
+				s->state=SSL3_ST_SW_CHANGE_A;
 #endif
+				}
 			else
 				s->state=SSL3_ST_SW_CERT_A;
 			s->init_num=0;
@@ -441,6 +519,13 @@ int dtls1_accept(SSL *s)
 				skip=1;
 				s->s3->tmp.cert_request=0;
 				s->state=SSL3_ST_SW_SRVR_DONE_A;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+					s->d1->next_state = SSL3_ST_SW_SRVR_DONE_A;
+					s->state = DTLS1_SCTP_ST_SW_WRITE_SOCK;
+					}
+#endif
 				}
 			else
 				{
@@ -450,9 +535,23 @@ int dtls1_accept(SSL *s)
 				if (ret <= 0) goto end;
 #ifndef NETSCAPE_HANG_BUG
 				s->state=SSL3_ST_SW_SRVR_DONE_A;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+					s->d1->next_state = SSL3_ST_SW_SRVR_DONE_A;
+					s->state = DTLS1_SCTP_ST_SW_WRITE_SOCK;
+					}
+#endif
 #else
 				s->state=SSL3_ST_SW_FLUSH;
 				s->s3->tmp.next_state=SSL3_ST_SR_CERT_A;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+					s->d1->next_state = s->s3->tmp.next_state;
+					s->s3->tmp.next_state=DTLS1_SCTP_ST_SW_WRITE_SOCK;
+					}
+#endif
 #endif
 				s->init_num=0;
 				}
@@ -472,6 +571,13 @@ int dtls1_accept(SSL *s)
 			s->rwstate=SSL_WRITING;
 			if (BIO_flush(s->wbio) <= 0)
 				{
+				/* If the write error was fatal, stop trying */
+				if (!BIO_should_retry(s->wbio))
+					{
+					s->rwstate=SSL_NOTHING;
+					s->state=s->s3->tmp.next_state;
+					}
+				
 				ret= -1;
 				goto end;
 				}
@@ -485,15 +591,16 @@ int dtls1_accept(SSL *s)
 			ret = ssl3_check_client_hello(s);
 			if (ret <= 0)
 				goto end;
-			dtls1_stop_timer(s);
 			if (ret == 2)
+				{
+				dtls1_stop_timer(s);
 				s->state = SSL3_ST_SR_CLNT_HELLO_C;
+				}
 			else {
 				/* could be sent for a DH cert, even if we
 				 * have not asked for it :-) */
 				ret=ssl3_get_client_certificate(s);
 				if (ret <= 0) goto end;
-				dtls1_stop_timer(s);
 				s->init_num=0;
 				s->state=SSL3_ST_SR_KEY_EXCH_A;
 			}
@@ -503,7 +610,21 @@ int dtls1_accept(SSL *s)
 		case SSL3_ST_SR_KEY_EXCH_B:
 			ret=ssl3_get_client_key_exchange(s);
 			if (ret <= 0) goto end;
-			dtls1_stop_timer(s);
+#ifndef OPENSSL_NO_SCTP
+			/* Add new shared key for SCTP-Auth,
+			 * will be ignored if no SCTP used.
+			 */
+			snprintf((char *) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL),
+			         DTLS1_SCTP_AUTH_LABEL);
+
+			SSL_export_keying_material(s, sctpauthkey,
+			                           sizeof(sctpauthkey), labelbuffer,
+			                           sizeof(labelbuffer), NULL, 0, 0);
+
+			BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY,
+			         sizeof(sctpauthkey), sctpauthkey);
+#endif
+
 			s->state=SSL3_ST_SR_CERT_VRFY_A;
 			s->init_num=0;
 
@@ -540,9 +661,13 @@ int dtls1_accept(SSL *s)
 			/* we should decide if we expected this one */
 			ret=ssl3_get_cert_verify(s);
 			if (ret <= 0) goto end;
-			dtls1_stop_timer(s);
-
-			s->state=SSL3_ST_SR_FINISHED_A;
+#ifndef OPENSSL_NO_SCTP
+			if (BIO_dgram_is_sctp(SSL_get_wbio(s)) &&
+			    state == SSL_ST_RENEGOTIATE)
+				s->state=DTLS1_SCTP_ST_SR_READ_SOCK;
+			else
+#endif			
+				s->state=SSL3_ST_SR_FINISHED_A;
 			s->init_num=0;
 			break;
 
@@ -594,6 +719,14 @@ int dtls1_accept(SSL *s)
 				SSL3_ST_SW_CHANGE_A,SSL3_ST_SW_CHANGE_B);
 
 			if (ret <= 0) goto end;
+
+#ifndef OPENSSL_NO_SCTP
+			/* Change to new shared key of SCTP-Auth,
+			 * will be ignored if no SCTP used.
+			 */
+			BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY, 0, NULL);
+#endif
+
 			s->state=SSL3_ST_SW_FINISHED_A;
 			s->init_num=0;
 
@@ -618,7 +751,16 @@ int dtls1_accept(SSL *s)
 			if (s->hit)
 				s->s3->tmp.next_state=SSL3_ST_SR_FINISHED_A;
 			else
+				{
 				s->s3->tmp.next_state=SSL_ST_OK;
+#ifndef OPENSSL_NO_SCTP
+				if (BIO_dgram_is_sctp(SSL_get_wbio(s)))
+					{
+					s->d1->next_state = s->s3->tmp.next_state;
+					s->s3->tmp.next_state=DTLS1_SCTP_ST_SW_WRITE_SOCK;
+					}
+#endif
+				}
 			s->init_num=0;
 			break;
 
@@ -636,11 +778,9 @@ int dtls1_accept(SSL *s)
 
 			s->init_num=0;
 
-			if (s->new_session == 2) /* skipped if we just sent a HelloRequest */
+			if (s->renegotiate == 2) /* skipped if we just sent a HelloRequest */
 				{
-				/* actually not necessarily a 'new' session unless
-				 * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is set */
-				
+				s->renegotiate=0;
 				s->new_session=0;
 				
 				ssl_update_cache(s,SSL_SESS_CACHE_SERVER);
@@ -692,6 +832,14 @@ end:
 	/* BIO_flush(s->wbio); */
 
 	s->in_handshake--;
+#ifndef OPENSSL_NO_SCTP
+		/* Notify SCTP BIO socket to leave handshake
+		 * mode and prevent stream identifier other
+		 * than 0. Will be ignored if no SCTP is used.
+		 */
+		BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL);
+#endif
+
 	if (cb != NULL)
 		cb(s,SSL_CB_ACCEPT_EXIT,ret);
 	return(ret);
@@ -772,7 +920,7 @@ int dtls1_send_server_hello(SSL *s)
 		p=s->s3->server_random;
 		Time=(unsigned long)time(NULL);			/* Time */
 		l2n(Time,p);
-		RAND_pseudo_bytes(p,SSL3_RANDOM_SIZE-sizeof(Time));
+		RAND_pseudo_bytes(p,SSL3_RANDOM_SIZE-4);
 		/* Do the message type and length last */
 		d=p= &(buf[DTLS1_HM_HEADER_LENGTH]);
 
@@ -1147,7 +1295,7 @@ int dtls1_send_server_key_exchange(SSL *s)
 		if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL)
 			&& !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK))
 			{
-			if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher))
+			if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher, NULL))
 				== NULL)
 				{
 				al=SSL_AD_DECODE_ERROR;
diff --git a/src/lib/libssl/dtls1.h b/src/lib/libssl/dtls1.h
index 2900d1d8ae..5008bf6081 100644
--- a/src/lib/libssl/dtls1.h
+++ b/src/lib/libssl/dtls1.h
@@ -105,6 +105,11 @@ extern "C" {
 #define DTLS1_AL_HEADER_LENGTH                   2
 #endif
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
+#ifndef OPENSSL_NO_SCTP
+#define DTLS1_SCTP_AUTH_LABEL	"EXPORTER_DTLS_OVER_SCTP"
+#endif
 
 typedef struct dtls1_bitmap_st
 	{
@@ -227,7 +232,7 @@ typedef struct dtls1_state_st
 
 	struct dtls1_timeout_st timeout;
 
-	/* Indicates when the last handshake msg sent will timeout */
+	/* Indicates when the last handshake msg or heartbeat sent will timeout */
 	struct timeval next_timeout;
 
 	/* Timeout duration */
@@ -243,6 +248,13 @@ typedef struct dtls1_state_st
 	unsigned int retransmitting;
 	unsigned int change_cipher_spec_ok;
 
+#ifndef OPENSSL_NO_SCTP
+	/* used when SSL_ST_XX_FLUSH is entered */
+	int next_state;
+
+	int shutdown_received;
+#endif
+
 	} DTLS1_STATE;
 
 typedef struct dtls1_record_data_st
@@ -251,8 +263,12 @@ typedef struct dtls1_record_data_st
 	unsigned int   packet_length;
 	SSL3_BUFFER    rbuf;
 	SSL3_RECORD    rrec;
+#ifndef OPENSSL_NO_SCTP
+	struct bio_dgram_sctp_rcvinfo recordinfo;
+#endif
 	} DTLS1_RECORD_DATA;
 
+#endif
 
 /* Timeout multipliers (timeout slice is defined in apps/timeouts.h */
 #define DTLS1_TMO_READ_COUNT                      2
diff --git a/src/lib/libssl/s23_clnt.c b/src/lib/libssl/s23_clnt.c
index c4d8bf2eb3..47673e740a 100644
--- a/src/lib/libssl/s23_clnt.c
+++ b/src/lib/libssl/s23_clnt.c
@@ -129,6 +129,10 @@ static const SSL_METHOD *ssl23_get_client_method(int ver)
 		return(SSLv3_client_method());
 	else if (ver == TLS1_VERSION)
 		return(TLSv1_client_method());
+	else if (ver == TLS1_1_VERSION)
+		return(TLSv1_1_client_method());
+	else if (ver == TLS1_2_VERSION)
+		return(TLSv1_2_client_method());
 	else
 		return(NULL);
 	}
@@ -278,24 +282,51 @@ static int ssl23_client_hello(SSL *s)
 	SSL_COMP *comp;
 #endif
 	int ret;
+	unsigned long mask, options = s->options;
 
-	ssl2_compat = (s->options & SSL_OP_NO_SSLv2) ? 0 : 1;
+	ssl2_compat = (options & SSL_OP_NO_SSLv2) ? 0 : 1;
 
 	if (ssl2_compat && ssl23_no_ssl2_ciphers(s))
 		ssl2_compat = 0;
 
-	if (!(s->options & SSL_OP_NO_TLSv1))
-		{
+	/*
+	 * SSL_OP_NO_X disables all protocols above X *if* there are
+	 * some protocols below X enabled. This is required in order
+	 * to maintain "version capability" vector contiguous. So
+	 * that if application wants to disable TLS1.0 in favour of
+	 * TLS1>=1, it would be insufficient to pass SSL_NO_TLSv1, the
+	 * answer is SSL_OP_NO_TLSv1|SSL_OP_NO_SSLv3|SSL_OP_NO_SSLv2.
+	 */
+	mask =	SSL_OP_NO_TLSv1_1|SSL_OP_NO_TLSv1
+#if !defined(OPENSSL_NO_SSL3)
+		|SSL_OP_NO_SSLv3
+#endif
+#if !defined(OPENSSL_NO_SSL2)
+		|(ssl2_compat?SSL_OP_NO_SSLv2:0)
+#endif
+		;
+#if !defined(OPENSSL_NO_TLS1_2_CLIENT)
+	version = TLS1_2_VERSION;
+
+	if ((options & SSL_OP_NO_TLSv1_2) && (options & mask) != mask)
+		version = TLS1_1_VERSION;
+#else
+	version = TLS1_1_VERSION;
+#endif
+	mask &= ~SSL_OP_NO_TLSv1_1;
+	if ((options & SSL_OP_NO_TLSv1_1) && (options & mask) != mask)
 		version = TLS1_VERSION;
-		}
-	else if (!(s->options & SSL_OP_NO_SSLv3))
-		{
+	mask &= ~SSL_OP_NO_TLSv1;
+#if !defined(OPENSSL_NO_SSL3)
+	if ((options & SSL_OP_NO_TLSv1) && (options & mask) != mask)
 		version = SSL3_VERSION;
-		}
-	else if (!(s->options & SSL_OP_NO_SSLv2))
-		{
+	mask &= ~SSL_OP_NO_SSLv3;
+#endif
+#if !defined(OPENSSL_NO_SSL2)
+	if ((options & SSL_OP_NO_SSLv3) && (options & mask) != mask)
 		version = SSL2_VERSION;
-		}
+#endif
+
 #ifndef OPENSSL_NO_TLSEXT
 	if (version != SSL2_VERSION)
 		{
@@ -329,11 +360,29 @@ static int ssl23_client_hello(SSL *s)
 		if (RAND_pseudo_bytes(p,SSL3_RANDOM_SIZE-4) <= 0)
 			return -1;
 
-		if (version == TLS1_VERSION)
+		if (version == TLS1_2_VERSION)
+			{
+			version_major = TLS1_2_VERSION_MAJOR;
+			version_minor = TLS1_2_VERSION_MINOR;
+			}
+		else if (version == TLS1_1_VERSION)
+			{
+			version_major = TLS1_1_VERSION_MAJOR;
+			version_minor = TLS1_1_VERSION_MINOR;
+			}
+		else if (version == TLS1_VERSION)
 			{
 			version_major = TLS1_VERSION_MAJOR;
 			version_minor = TLS1_VERSION_MINOR;
 			}
+#ifdef OPENSSL_FIPS
+		else if(FIPS_mode())
+			{
+			SSLerr(SSL_F_SSL23_CLIENT_HELLO,
+					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
+			return -1;
+			}
+#endif
 		else if (version == SSL3_VERSION)
 			{
 			version_major = SSL3_VERSION_MAJOR;
@@ -437,6 +486,15 @@ static int ssl23_client_hello(SSL *s)
 				SSLerr(SSL_F_SSL23_CLIENT_HELLO,SSL_R_NO_CIPHERS_AVAILABLE);
 				return -1;
 				}
+#ifdef OPENSSL_MAX_TLS1_2_CIPHER_LENGTH
+			/* Some servers hang if client hello > 256 bytes
+			 * as hack workaround chop number of supported ciphers
+			 * to keep it well below this if we use TLS v1.2
+			 */
+			if (TLS1_get_version(s) >= TLS1_2_VERSION
+				&& i > OPENSSL_MAX_TLS1_2_CIPHER_LENGTH)
+				i = OPENSSL_MAX_TLS1_2_CIPHER_LENGTH & ~1;
+#endif
 			s2n(i,p);
 			p+=i;
 
@@ -491,8 +549,13 @@ static int ssl23_client_hello(SSL *s)
 			d=buf;
 			*(d++) = SSL3_RT_HANDSHAKE;
 			*(d++) = version_major;
-			*(d++) = version_minor; /* arguably we should send the *lowest* suported version here
-			                         * (indicating, e.g., TLS 1.0 in "SSL 3.0 format") */
+			/* Some servers hang if we use long client hellos
+			 * and a record number > TLS 1.0.
+			 */
+			if (TLS1_get_client_version(s) > TLS1_VERSION)
+				*(d++) = 1;
+			else
+				*(d++) = version_minor;
 			s2n((int)l,d);
 
 			/* number of bytes to write */
@@ -608,7 +671,7 @@ static int ssl23_get_server_hello(SSL *s)
 #endif
 		}
 	else if (p[1] == SSL3_VERSION_MAJOR &&
-	         (p[2] == SSL3_VERSION_MINOR || p[2] == TLS1_VERSION_MINOR) &&
+	         p[2] <= TLS1_2_VERSION_MINOR &&
 	         ((p[0] == SSL3_RT_HANDSHAKE && p[5] == SSL3_MT_SERVER_HELLO) ||
 	          (p[0] == SSL3_RT_ALERT && p[3] == 0 && p[4] == 2)))
 		{
@@ -617,6 +680,14 @@ static int ssl23_get_server_hello(SSL *s)
 		if ((p[2] == SSL3_VERSION_MINOR) &&
 			!(s->options & SSL_OP_NO_SSLv3))
 			{
+#ifdef OPENSSL_FIPS
+			if(FIPS_mode())
+				{
+				SSLerr(SSL_F_SSL23_GET_SERVER_HELLO,
+					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
+				goto err;
+				}
+#endif
 			s->version=SSL3_VERSION;
 			s->method=SSLv3_client_method();
 			}
@@ -626,6 +697,18 @@ static int ssl23_get_server_hello(SSL *s)
 			s->version=TLS1_VERSION;
 			s->method=TLSv1_client_method();
 			}
+		else if ((p[2] == TLS1_1_VERSION_MINOR) &&
+			!(s->options & SSL_OP_NO_TLSv1_1))
+			{
+			s->version=TLS1_1_VERSION;
+			s->method=TLSv1_1_client_method();
+			}
+		else if ((p[2] == TLS1_2_VERSION_MINOR) &&
+			!(s->options & SSL_OP_NO_TLSv1_2))
+			{
+			s->version=TLS1_2_VERSION;
+			s->method=TLSv1_2_client_method();
+			}
 		else
 			{
 			SSLerr(SSL_F_SSL23_GET_SERVER_HELLO,SSL_R_UNSUPPORTED_PROTOCOL);
diff --git a/src/lib/libssl/s23_srvr.c b/src/lib/libssl/s23_srvr.c
index 836dd1f1cf..4877849013 100644
--- a/src/lib/libssl/s23_srvr.c
+++ b/src/lib/libssl/s23_srvr.c
@@ -115,6 +115,9 @@
 #include <openssl/rand.h>
 #include <openssl/objects.h>
 #include <openssl/evp.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 
 static const SSL_METHOD *ssl23_get_server_method(int ver);
 int ssl23_get_client_hello(SSL *s);
@@ -128,6 +131,10 @@ static const SSL_METHOD *ssl23_get_server_method(int ver)
 		return(SSLv3_server_method());
 	else if (ver == TLS1_VERSION)
 		return(TLSv1_server_method());
+	else if (ver == TLS1_1_VERSION)
+		return(TLSv1_1_server_method());
+	else if (ver == TLS1_2_VERSION)
+		return(TLSv1_2_server_method());
 	else
 		return(NULL);
 	}
@@ -283,7 +290,20 @@ int ssl23_get_client_hello(SSL *s)
 				/* SSLv3/TLSv1 */
 				if (p[4] >= TLS1_VERSION_MINOR)
 					{
-					if (!(s->options & SSL_OP_NO_TLSv1))
+					if (p[4] >= TLS1_2_VERSION_MINOR &&
+					   !(s->options & SSL_OP_NO_TLSv1_2))
+						{
+						s->version=TLS1_2_VERSION;
+						s->state=SSL23_ST_SR_CLNT_HELLO_B;
+						}
+					else if (p[4] >= TLS1_1_VERSION_MINOR &&
+					   !(s->options & SSL_OP_NO_TLSv1_1))
+						{
+						s->version=TLS1_1_VERSION;
+						/* type=2; */ /* done later to survive restarts */
+						s->state=SSL23_ST_SR_CLNT_HELLO_B;
+						}
+					else if (!(s->options & SSL_OP_NO_TLSv1))
 						{
 						s->version=TLS1_VERSION;
 						/* type=2; */ /* done later to survive restarts */
@@ -350,7 +370,19 @@ int ssl23_get_client_hello(SSL *s)
 				v[1]=p[10]; /* minor version according to client_version */
 			if (v[1] >= TLS1_VERSION_MINOR)
 				{
-				if (!(s->options & SSL_OP_NO_TLSv1))
+				if (v[1] >= TLS1_2_VERSION_MINOR &&
+					!(s->options & SSL_OP_NO_TLSv1_2))
+					{
+					s->version=TLS1_2_VERSION;
+					type=3;
+					}
+				else if (v[1] >= TLS1_1_VERSION_MINOR &&
+					!(s->options & SSL_OP_NO_TLSv1_1))
+					{
+					s->version=TLS1_1_VERSION;
+					type=3;
+					}
+				else if (!(s->options & SSL_OP_NO_TLSv1))
 					{
 					s->version=TLS1_VERSION;
 					type=3;
@@ -393,6 +425,15 @@ int ssl23_get_client_hello(SSL *s)
 			}
 		}
 
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && (s->version < TLS1_VERSION))
+		{
+		SSLerr(SSL_F_SSL23_GET_CLIENT_HELLO,
+					SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
+		goto err;
+		}
+#endif
+
 	if (s->state == SSL23_ST_SR_CLNT_HELLO_B)
 		{
 		/* we have SSLv3/TLSv1 in an SSLv2 header
@@ -567,8 +608,11 @@ int ssl23_get_client_hello(SSL *s)
 			s->s3->rbuf.left=0;
 			s->s3->rbuf.offset=0;
 			}
-
-		if (s->version == TLS1_VERSION)
+		if (s->version == TLS1_2_VERSION)
+			s->method = TLSv1_2_server_method();
+		else if (s->version == TLS1_1_VERSION)
+			s->method = TLSv1_1_server_method();
+		else if (s->version == TLS1_VERSION)
 			s->method = TLSv1_server_method();
 		else
 			s->method = SSLv3_server_method();
diff --git a/src/lib/libssl/s3_both.c b/src/lib/libssl/s3_both.c
index a6d869df59..b63460a56d 100644
--- a/src/lib/libssl/s3_both.c
+++ b/src/lib/libssl/s3_both.c
@@ -202,15 +202,38 @@ int ssl3_send_finished(SSL *s, int a, int b, const char *sender, int slen)
 	return(ssl3_do_write(s,SSL3_RT_HANDSHAKE));
 	}
 
+#ifndef OPENSSL_NO_NEXTPROTONEG
+/* ssl3_take_mac calculates the Finished MAC for the handshakes messages seen to far. */
+static void ssl3_take_mac(SSL *s) {
+	const char *sender;
+	int slen;
+
+	if (s->state & SSL_ST_CONNECT)
+		{
+		sender=s->method->ssl3_enc->server_finished_label;
+		slen=s->method->ssl3_enc->server_finished_label_len;
+		}
+	else
+		{
+		sender=s->method->ssl3_enc->client_finished_label;
+		slen=s->method->ssl3_enc->client_finished_label_len;
+		}
+
+	s->s3->tmp.peer_finish_md_len = s->method->ssl3_enc->final_finish_mac(s,
+		sender,slen,s->s3->tmp.peer_finish_md);
+}
+#endif
+
 int ssl3_get_finished(SSL *s, int a, int b)
 	{
 	int al,i,ok;
 	long n;
 	unsigned char *p;
 
-	/* the mac has already been generated when we received the
-	 * change cipher spec message and is in s->s3->tmp.peer_finish_md
-	 */ 
+#ifdef OPENSSL_NO_NEXTPROTONEG
+	/* the mac has already been generated when we received the change
+	 * cipher spec message and is in s->s3->tmp.peer_finish_md. */
+#endif
 
 	n=s->method->ssl_get_message(s,
 		a,
@@ -514,6 +537,13 @@ long ssl3_get_message(SSL *s, int st1, int stn, int mt, long max, int *ok)
 		s->init_num += i;
 		n -= i;
 		}
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	/* If receiving Finished, record MAC of prior handshake messages for
+	 * Finished verification. */
+	if (*s->init_buf->data == SSL3_MT_FINISHED)
+		ssl3_take_mac(s);
+#endif
+	/* Feed this message into MAC computation. */
 	ssl3_finish_mac(s, (unsigned char *)s->init_buf->data, s->init_num + 4);
 	if (s->msg_callback)
 		s->msg_callback(0, s->version, SSL3_RT_HANDSHAKE, s->init_buf->data, (size_t)s->init_num + 4, s, s->msg_callback_arg);
diff --git a/src/lib/libssl/s3_clnt.c b/src/lib/libssl/s3_clnt.c
index 53223bd38d..b80d052e1f 100644
--- a/src/lib/libssl/s3_clnt.c
+++ b/src/lib/libssl/s3_clnt.c
@@ -156,6 +156,9 @@
 #include <openssl/objects.h>
 #include <openssl/evp.h>
 #include <openssl/md5.h>
+#ifdef OPENSSL_FIPS
+#include <openssl/fips.h>
+#endif
 #ifndef OPENSSL_NO_DH
 #include <openssl/dh.h>
 #endif
@@ -200,6 +203,18 @@ int ssl3_connect(SSL *s)
 	s->in_handshake++;
 	if (!SSL_in_init(s) || SSL_in_before(s)) SSL_clear(s); 
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* If we're awaiting a HeartbeatResponse, pretend we
+	 * already got and don't await it anymore, because
+	 * Heartbeats don't make sense during handshakes anyway.
+	 */
+	if (s->tlsext_hb_pending)
+		{
+		s->tlsext_hb_pending = 0;
+		s->tlsext_hb_seq++;
+		}
+#endif
+
 	for (;;)
 		{
 		state=s->state;
@@ -207,7 +222,7 @@ int ssl3_connect(SSL *s)
 		switch(s->state)
 			{
 		case SSL_ST_RENEGOTIATE:
-			s->new_session=1;
+			s->renegotiate=1;
 			s->state=SSL_ST_CONNECT;
 			s->ctx->stats.sess_connect_renegotiate++;
 			/* break */
@@ -280,7 +295,16 @@ int ssl3_connect(SSL *s)
 			if (ret <= 0) goto end;
 
 			if (s->hit)
+				{
 				s->state=SSL3_ST_CR_FINISHED_A;
+#ifndef OPENSSL_NO_TLSEXT
+				if (s->tlsext_ticket_expected)
+					{
+					/* receive renewed session ticket */
+					s->state=SSL3_ST_CR_SESSION_TICKET_A;
+					}
+#endif
+				}
 			else
 				s->state=SSL3_ST_CR_CERT_A;
 			s->init_num=0;
@@ -358,6 +382,17 @@ int ssl3_connect(SSL *s)
 		case SSL3_ST_CR_SRVR_DONE_B:
 			ret=ssl3_get_server_done(s);
 			if (ret <= 0) goto end;
+#ifndef OPENSSL_NO_SRP
+			if (s->s3->tmp.new_cipher->algorithm_mkey & SSL_kSRP)
+				{
+				if ((ret = SRP_Calc_A_param(s))<=0)
+					{
+					SSLerr(SSL_F_SSL3_CONNECT,SSL_R_SRP_A_CALC);
+					ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_INTERNAL_ERROR);
+					goto end;
+					}
+				}
+#endif
 			if (s->s3->tmp.cert_req)
 				s->state=SSL3_ST_CW_CERT_A;
 			else
@@ -423,7 +458,16 @@ int ssl3_connect(SSL *s)
 			ret=ssl3_send_change_cipher_spec(s,
 				SSL3_ST_CW_CHANGE_A,SSL3_ST_CW_CHANGE_B);
 			if (ret <= 0) goto end;
+
+
+#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG)
 			s->state=SSL3_ST_CW_FINISHED_A;
+#else
+			if (s->s3->next_proto_neg_seen)
+				s->state=SSL3_ST_CW_NEXT_PROTO_A;
+			else
+				s->state=SSL3_ST_CW_FINISHED_A;
+#endif
 			s->init_num=0;
 
 			s->session->cipher=s->s3->tmp.new_cipher;
@@ -451,6 +495,15 @@ int ssl3_connect(SSL *s)
 
 			break;
 
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+		case SSL3_ST_CW_NEXT_PROTO_A:
+		case SSL3_ST_CW_NEXT_PROTO_B:
+			ret=ssl3_send_next_proto(s);
+			if (ret <= 0) goto end;
+			s->state=SSL3_ST_CW_FINISHED_A;
+			break;
+#endif
+
 		case SSL3_ST_CW_FINISHED_A:
 		case SSL3_ST_CW_FINISHED_B:
 			ret=ssl3_send_finished(s,
@@ -546,6 +599,7 @@ int ssl3_connect(SSL *s)
 			/* else do it later in ssl3_write */
 
 			s->init_num=0;
+			s->renegotiate=0;
 			s->new_session=0;
 
 			ssl_update_cache(s,SSL_SESS_CACHE_CLIENT);
@@ -635,9 +689,43 @@ int ssl3_client_hello(SSL *s)
 		/* Do the message type and length last */
 		d=p= &(buf[4]);
 
+		/* version indicates the negotiated version: for example from
+		 * an SSLv2/v3 compatible client hello). The client_version
+		 * field is the maximum version we permit and it is also
+		 * used in RSA encrypted premaster secrets. Some servers can
+		 * choke if we initially report a higher version then
+		 * renegotiate to a lower one in the premaster secret. This
+		 * didn't happen with TLS 1.0 as most servers supported it
+		 * but it can with TLS 1.1 or later if the server only supports
+		 * 1.0.
+		 *
+		 * Possible scenario with previous logic:
+		 * 	1. Client hello indicates TLS 1.2
+		 * 	2. Server hello says TLS 1.0
+		 *	3. RSA encrypted premaster secret uses 1.2.
+		 * 	4. Handhaked proceeds using TLS 1.0.
+		 *	5. Server sends hello request to renegotiate.
+		 *	6. Client hello indicates TLS v1.0 as we now
+		 *	   know that is maximum server supports.
+		 *	7. Server chokes on RSA encrypted premaster secret
+		 *	   containing version 1.0.
+		 *
+		 * For interoperability it should be OK to always use the
+		 * maximum version we support in client hello and then rely
+		 * on the checking of version to ensure the servers isn't
+		 * being inconsistent: for example initially negotiating with
+		 * TLS 1.0 and renegotiating with TLS 1.2. We do this by using
+		 * client_version in client hello and not resetting it to
+		 * the negotiated version.
+		 */
+#if 0
 		*(p++)=s->version>>8;
 		*(p++)=s->version&0xff;
 		s->client_version=s->version;
+#else
+		*(p++)=s->client_version>>8;
+		*(p++)=s->client_version&0xff;
+#endif
 
 		/* Random stuff */
 		memcpy(p,s->s3->client_random,SSL3_RANDOM_SIZE);
@@ -667,6 +755,15 @@ int ssl3_client_hello(SSL *s)
 			SSLerr(SSL_F_SSL3_CLIENT_HELLO,SSL_R_NO_CIPHERS_AVAILABLE);
 			goto err;
 			}
+#ifdef OPENSSL_MAX_TLS1_2_CIPHER_LENGTH
+			/* Some servers hang if client hello > 256 bytes
+			 * as hack workaround chop number of supported ciphers
+			 * to keep it well below this if we use TLS v1.2
+			 */
+			if (TLS1_get_version(s) >= TLS1_2_VERSION
+				&& i > OPENSSL_MAX_TLS1_2_CIPHER_LENGTH)
+				i = OPENSSL_MAX_TLS1_2_CIPHER_LENGTH & ~1;
+#endif
 		s2n(i,p);
 		p+=i;
 
@@ -847,6 +944,14 @@ int ssl3_get_server_hello(SSL *s)
 		SSLerr(SSL_F_SSL3_GET_SERVER_HELLO,SSL_R_UNKNOWN_CIPHER_RETURNED);
 		goto f_err;
 		}
+	/* TLS v1.2 only ciphersuites require v1.2 or later */
+	if ((c->algorithm_ssl & SSL_TLSV1_2) && 
+		(TLS1_get_version(s) < TLS1_2_VERSION))
+		{
+		al=SSL_AD_ILLEGAL_PARAMETER;
+		SSLerr(SSL_F_SSL3_GET_SERVER_HELLO,SSL_R_WRONG_CIPHER_RETURNED);
+		goto f_err;
+		}
 	p+=ssl_put_cipher_by_char(s,NULL,NULL);
 
 	sk=ssl_get_ciphers_by_id(s);
@@ -878,9 +983,11 @@ int ssl3_get_server_hello(SSL *s)
 			}
 		}
 	s->s3->tmp.new_cipher=c;
-	if (!ssl3_digest_cached_records(s))
+	/* Don't digest cached records if TLS v1.2: we may need them for
+	 * client authentication.
+	 */
+	if (TLS1_get_version(s) < TLS1_2_VERSION && !ssl3_digest_cached_records(s))
 		goto f_err;
-
 	/* lets get the compression algorithm */
 	/* COMPRESSION */
 #ifdef OPENSSL_NO_COMP
@@ -1159,6 +1266,7 @@ int ssl3_get_key_exchange(SSL *s)
 	int al,i,j,param_len,ok;
 	long n,alg_k,alg_a;
 	EVP_PKEY *pkey=NULL;
+	const EVP_MD *md = NULL;
 #ifndef OPENSSL_NO_RSA
 	RSA *rsa=NULL;
 #endif
@@ -1282,6 +1390,86 @@ int ssl3_get_key_exchange(SSL *s)
 		}
 	else
 #endif /* !OPENSSL_NO_PSK */
+#ifndef OPENSSL_NO_SRP
+	if (alg_k & SSL_kSRP)
+		{
+		n2s(p,i);
+		param_len=i+2;
+		if (param_len > n)
+			{
+			al=SSL_AD_DECODE_ERROR;
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_N_LENGTH);
+			goto f_err;
+			}
+		if (!(s->srp_ctx.N=BN_bin2bn(p,i,NULL)))
+			{
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB);
+			goto err;
+			}
+		p+=i;
+
+		n2s(p,i);
+		param_len+=i+2;
+		if (param_len > n)
+			{
+			al=SSL_AD_DECODE_ERROR;
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_G_LENGTH);
+			goto f_err;
+			}
+		if (!(s->srp_ctx.g=BN_bin2bn(p,i,NULL)))
+			{
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB);
+			goto err;
+			}
+		p+=i;
+
+		i = (unsigned int)(p[0]);
+		p++;
+		param_len+=i+1;
+		if (param_len > n)
+			{
+			al=SSL_AD_DECODE_ERROR;
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_S_LENGTH);
+			goto f_err;
+			}
+		if (!(s->srp_ctx.s=BN_bin2bn(p,i,NULL)))
+			{
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB);
+			goto err;
+			}
+		p+=i;
+
+		n2s(p,i);
+		param_len+=i+2;
+		if (param_len > n)
+			{
+			al=SSL_AD_DECODE_ERROR;
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_B_LENGTH);
+			goto f_err;
+			}
+		if (!(s->srp_ctx.B=BN_bin2bn(p,i,NULL)))
+			{
+			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB);
+			goto err;
+			}
+		p+=i;
+		n-=param_len;
+
+/* We must check if there is a certificate */
+#ifndef OPENSSL_NO_RSA
+		if (alg_a & SSL_aRSA)
+			pkey=X509_get_pubkey(s->session->sess_cert->peer_pkeys[SSL_PKEY_RSA_ENC].x509);
+#else
+		if (0)
+			;
+#endif
+#ifndef OPENSSL_NO_DSA
+		else if (alg_a & SSL_aDSS)
+			pkey=X509_get_pubkey(s->session->sess_cert->peer_pkeys[SSL_PKEY_DSA_SIGN].x509);
+#endif
+		}
+	else
+#endif /* !OPENSSL_NO_SRP */
 #ifndef OPENSSL_NO_RSA
 	if (alg_k & SSL_kRSA)
 		{
@@ -1529,6 +1717,38 @@ int ssl3_get_key_exchange(SSL *s)
 	/* if it was signed, check the signature */
 	if (pkey != NULL)
 		{
+		if (TLS1_get_version(s) >= TLS1_2_VERSION)
+			{
+			int sigalg = tls12_get_sigid(pkey);
+			/* Should never happen */
+			if (sigalg == -1)
+				{
+				SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			/* Check key type is consistent with signature */
+			if (sigalg != (int)p[1])
+				{
+				SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_WRONG_SIGNATURE_TYPE);
+				al=SSL_AD_DECODE_ERROR;
+				goto f_err;
+				}
+			md = tls12_get_hash(p[0]);
+			if (md == NULL)
+				{
+				SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_UNKNOWN_DIGEST);
+				al=SSL_AD_DECODE_ERROR;
+				goto f_err;
+				}
+#ifdef SSL_DEBUG
+fprintf(stderr, "USING TLSv1.2 HASH %s\n", EVP_MD_name(md));
+#endif
+			p += 2;
+			n -= 2;
+			}
+		else
+			md = EVP_sha1();
+			
 		n2s(p,i);
 		n-=2;
 		j=EVP_PKEY_size(pkey);
@@ -1542,7 +1762,7 @@ int ssl3_get_key_exchange(SSL *s)
 			}
 
 #ifndef OPENSSL_NO_RSA
-		if (pkey->type == EVP_PKEY_RSA)
+		if (pkey->type == EVP_PKEY_RSA && TLS1_get_version(s) < TLS1_2_VERSION)
 			{
 			int num;
 
@@ -1550,6 +1770,8 @@ int ssl3_get_key_exchange(SSL *s)
 			q=md_buf;
 			for (num=2; num > 0; num--)
 				{
+				EVP_MD_CTX_set_flags(&md_ctx,
+					EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
 				EVP_DigestInit_ex(&md_ctx,(num == 2)
 					?s->ctx->md5:s->ctx->sha1, NULL);
 				EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
@@ -1577,29 +1799,8 @@ int ssl3_get_key_exchange(SSL *s)
 			}
 		else
 #endif
-#ifndef OPENSSL_NO_DSA
-			if (pkey->type == EVP_PKEY_DSA)
-			{
-			/* lets do DSS */
-			EVP_VerifyInit_ex(&md_ctx,EVP_dss1(), NULL);
-			EVP_VerifyUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
-			EVP_VerifyUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE);
-			EVP_VerifyUpdate(&md_ctx,param,param_len);
-			if (EVP_VerifyFinal(&md_ctx,p,(int)n,pkey) <= 0)
-				{
-				/* bad signature */
-				al=SSL_AD_DECRYPT_ERROR;
-				SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SIGNATURE);
-				goto f_err;
-				}
-			}
-		else
-#endif
-#ifndef OPENSSL_NO_ECDSA
-			if (pkey->type == EVP_PKEY_EC)
 			{
-			/* let's do ECDSA */
-			EVP_VerifyInit_ex(&md_ctx,EVP_ecdsa(), NULL);
+			EVP_VerifyInit_ex(&md_ctx, md, NULL);
 			EVP_VerifyUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
 			EVP_VerifyUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE);
 			EVP_VerifyUpdate(&md_ctx,param,param_len);
@@ -1611,12 +1812,6 @@ int ssl3_get_key_exchange(SSL *s)
 				goto f_err;
 				}
 			}
-		else
-#endif
-			{
-			SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
-			goto err;
-			}
 		}
 	else
 		{
@@ -1663,7 +1858,7 @@ int ssl3_get_certificate_request(SSL *s)
 	{
 	int ok,ret=0;
 	unsigned long n,nc,l;
-	unsigned int llen,ctype_num,i;
+	unsigned int llen, ctype_num,i;
 	X509_NAME *xn=NULL;
 	const unsigned char *p,*q;
 	unsigned char *d;
@@ -1683,6 +1878,14 @@ int ssl3_get_certificate_request(SSL *s)
 	if (s->s3->tmp.message_type == SSL3_MT_SERVER_DONE)
 		{
 		s->s3->tmp.reuse_message=1;
+		/* If we get here we don't need any cached handshake records
+		 * as we wont be doing client auth.
+		 */
+		if (s->s3->handshake_buffer)
+			{
+			if (!ssl3_digest_cached_records(s))
+				goto err;
+			}
 		return(1);
 		}
 
@@ -1719,6 +1922,26 @@ int ssl3_get_certificate_request(SSL *s)
 	for (i=0; i<ctype_num; i++)
 		s->s3->tmp.ctype[i]= p[i];
 	p+=ctype_num;
+	if (TLS1_get_version(s) >= TLS1_2_VERSION)
+		{
+		n2s(p, llen);
+		/* Check we have enough room for signature algorithms and
+		 * following length value.
+		 */
+		if ((unsigned long)(p - d + llen + 2) > n)
+			{
+			ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECODE_ERROR);
+			SSLerr(SSL_F_SSL3_GET_CERTIFICATE_REQUEST,SSL_R_DATA_LENGTH_TOO_LONG);
+			goto err;
+			}
+		if ((llen & 1) || !tls1_process_sigalgs(s, p, llen))
+			{
+			ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECODE_ERROR);
+			SSLerr(SSL_F_SSL3_GET_CERTIFICATE_REQUEST,SSL_R_SIGNATURE_ALGORITHMS_ERROR);
+			goto err;
+			}
+		p += llen;
+		}
 
 	/* get the CA RDNs */
 	n2s(p,llen);
@@ -1731,7 +1954,7 @@ fclose(out);
 }
 #endif
 
-	if ((llen+ctype_num+2+1) != n)
+	if ((unsigned long)(p - d + llen) != n)
 		{
 		ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECODE_ERROR);
 		SSLerr(SSL_F_SSL3_GET_CERTIFICATE_REQUEST,SSL_R_LENGTH_MISMATCH);
@@ -2553,6 +2776,39 @@ int ssl3_send_client_key_exchange(SSL *s)
 			EVP_PKEY_free(pub_key);
 
 			}
+#ifndef OPENSSL_NO_SRP
+		else if (alg_k & SSL_kSRP)
+			{
+			if (s->srp_ctx.A != NULL)
+				{
+				/* send off the data */
+				n=BN_num_bytes(s->srp_ctx.A);
+				s2n(n,p);
+				BN_bn2bin(s->srp_ctx.A,p);
+				n+=2;
+				}
+			else
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			if (s->session->srp_username != NULL)
+				OPENSSL_free(s->session->srp_username);
+			s->session->srp_username = BUF_strdup(s->srp_ctx.login);
+			if (s->session->srp_username == NULL)
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,
+					ERR_R_MALLOC_FAILURE);
+				goto err;
+				}
+
+			if ((s->session->master_key_length = SRP_generate_client_master_secret(s,s->session->master_key))<0)
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			}
+#endif
 #ifndef OPENSSL_NO_PSK
 		else if (alg_k & SSL_kPSK)
 			{
@@ -2672,12 +2928,13 @@ int ssl3_send_client_verify(SSL *s)
 	unsigned char data[MD5_DIGEST_LENGTH+SHA_DIGEST_LENGTH];
 	EVP_PKEY *pkey;
 	EVP_PKEY_CTX *pctx=NULL;
-#ifndef OPENSSL_NO_RSA
+	EVP_MD_CTX mctx;
 	unsigned u=0;
-#endif
 	unsigned long n;
 	int j;
 
+	EVP_MD_CTX_init(&mctx);
+
 	if (s->state == SSL3_ST_CW_CERT_VRFY_A)
 		{
 		d=(unsigned char *)s->init_buf->data;
@@ -2688,7 +2945,8 @@ int ssl3_send_client_verify(SSL *s)
 		EVP_PKEY_sign_init(pctx);
 		if (EVP_PKEY_CTX_set_signature_md(pctx, EVP_sha1())>0)
 			{
-			s->method->ssl3_enc->cert_verify_mac(s,
+			if (TLS1_get_version(s) < TLS1_2_VERSION)
+				s->method->ssl3_enc->cert_verify_mac(s,
 						NID_sha1,
 						&(data[MD5_DIGEST_LENGTH]));
 			}
@@ -2696,6 +2954,41 @@ int ssl3_send_client_verify(SSL *s)
 			{
 			ERR_clear_error();
 			}
+		/* For TLS v1.2 send signature algorithm and signature
+		 * using agreed digest and cached handshake records.
+		 */
+		if (TLS1_get_version(s) >= TLS1_2_VERSION)
+			{
+			long hdatalen = 0;
+			void *hdata;
+			const EVP_MD *md = s->cert->key->digest;
+			hdatalen = BIO_get_mem_data(s->s3->handshake_buffer,
+								&hdata);
+			if (hdatalen <= 0 || !tls12_get_sigandhash(p, pkey, md))
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_VERIFY,
+						ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+			p += 2;
+#ifdef SSL_DEBUG
+			fprintf(stderr, "Using TLS 1.2 with client alg %s\n",
+							EVP_MD_name(md));
+#endif
+			if (!EVP_SignInit_ex(&mctx, md, NULL)
+				|| !EVP_SignUpdate(&mctx, hdata, hdatalen)
+				|| !EVP_SignFinal(&mctx, p + 2, &u, pkey))
+				{
+				SSLerr(SSL_F_SSL3_SEND_CLIENT_VERIFY,
+						ERR_R_EVP_LIB);
+				goto err;
+				}
+			s2n(u,p);
+			n = u + 4;
+			if (!ssl3_digest_cached_records(s))
+				goto err;
+			}
+		else
 #ifndef OPENSSL_NO_RSA
 		if (pkey->type == EVP_PKEY_RSA)
 			{
@@ -2778,9 +3071,11 @@ int ssl3_send_client_verify(SSL *s)
 		s->init_num=(int)n+4;
 		s->init_off=0;
 		}
+	EVP_MD_CTX_cleanup(&mctx);
 	EVP_PKEY_CTX_free(pctx);
 	return(ssl3_do_write(s,SSL3_RT_HANDSHAKE));
 err:
+	EVP_MD_CTX_cleanup(&mctx);
 	EVP_PKEY_CTX_free(pctx);
 	return(-1);
 	}
@@ -2904,7 +3199,7 @@ int ssl3_check_cert_and_algorithm(SSL *s)
 	if (idx == SSL_PKEY_ECC)
 		{
 		if (ssl_check_srvr_ecc_cert_and_alg(sc->peer_pkeys[idx].x509,
-		    s->s3->tmp.new_cipher) == 0) 
+		    						s) == 0) 
 			{ /* check failed */
 			SSLerr(SSL_F_SSL3_CHECK_CERT_AND_ALGORITHM,SSL_R_BAD_ECC_CERT);
 			goto f_err;
@@ -3000,6 +3295,32 @@ err:
 	return(0);
 	}
 
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+int ssl3_send_next_proto(SSL *s)
+	{
+	unsigned int len, padding_len;
+	unsigned char *d;
+
+	if (s->state == SSL3_ST_CW_NEXT_PROTO_A)
+		{
+		len = s->next_proto_negotiated_len;
+		padding_len = 32 - ((len + 2) % 32);
+		d = (unsigned char *)s->init_buf->data;
+		d[4] = len;
+		memcpy(d + 5, s->next_proto_negotiated, len);
+		d[5 + len] = padding_len;
+		memset(d + 6 + len, 0, padding_len);
+		*(d++)=SSL3_MT_NEXT_PROTO;
+		l2n3(2 + len + padding_len, d);
+		s->state = SSL3_ST_CW_NEXT_PROTO_B;
+		s->init_num = 4 + 2 + len + padding_len;
+		s->init_off = 0;
+		}
+
+	return ssl3_do_write(s, SSL3_RT_HANDSHAKE);
+}
+#endif  /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */
+
 /* Check to see if handshake is full or resumed. Usually this is just a
  * case of checking to see if a cache hit has occurred. In the case of
  * session tickets we have to check the next message to be sure.
diff --git a/src/lib/libssl/s3_lib.c b/src/lib/libssl/s3_lib.c
index 1130244aeb..fb60cde8ee 100644
--- a/src/lib/libssl/s3_lib.c
+++ b/src/lib/libssl/s3_lib.c
@@ -1071,6 +1071,103 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	256,
 	},
 
+	/* TLS v1.2 ciphersuites */
+	/* Cipher 3B */
+	{
+	1,
+	TLS1_TXT_RSA_WITH_NULL_SHA256,
+	TLS1_CK_RSA_WITH_NULL_SHA256,
+	SSL_kRSA,
+	SSL_aRSA,
+	SSL_eNULL,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	0,
+	0,
+	},
+
+	/* Cipher 3C */
+	{
+	1,
+	TLS1_TXT_RSA_WITH_AES_128_SHA256,
+	TLS1_CK_RSA_WITH_AES_128_SHA256,
+	SSL_kRSA,
+	SSL_aRSA,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher 3D */
+	{
+	1,
+	TLS1_TXT_RSA_WITH_AES_256_SHA256,
+	TLS1_CK_RSA_WITH_AES_256_SHA256,
+	SSL_kRSA,
+	SSL_aRSA,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher 3E */
+	{
+	0, /* not implemented (non-ephemeral DH) */
+	TLS1_TXT_DH_DSS_WITH_AES_128_SHA256,
+	TLS1_CK_DH_DSS_WITH_AES_128_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher 3F */
+	{
+	0, /* not implemented (non-ephemeral DH) */
+	TLS1_TXT_DH_RSA_WITH_AES_128_SHA256,
+	TLS1_CK_DH_RSA_WITH_AES_128_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher 40 */
+	{
+	1,
+	TLS1_TXT_DHE_DSS_WITH_AES_128_SHA256,
+	TLS1_CK_DHE_DSS_WITH_AES_128_SHA256,
+	SSL_kEDH,
+	SSL_aDSS,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
 #ifndef OPENSSL_NO_CAMELLIA
 	/* Camellia ciphersuites from RFC4132 (128-bit portion) */
 
@@ -1287,6 +1384,122 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	128,
 	},
 #endif
+
+	/* TLS v1.2 ciphersuites */
+	/* Cipher 67 */
+	{
+	1,
+	TLS1_TXT_DHE_RSA_WITH_AES_128_SHA256,
+	TLS1_CK_DHE_RSA_WITH_AES_128_SHA256,
+	SSL_kEDH,
+	SSL_aRSA,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher 68 */
+	{
+	0, /* not implemented (non-ephemeral DH) */
+	TLS1_TXT_DH_DSS_WITH_AES_256_SHA256,
+	TLS1_CK_DH_DSS_WITH_AES_256_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher 69 */
+	{
+	0, /* not implemented (non-ephemeral DH) */
+	TLS1_TXT_DH_RSA_WITH_AES_256_SHA256,
+	TLS1_CK_DH_RSA_WITH_AES_256_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher 6A */
+	{
+	1,
+	TLS1_TXT_DHE_DSS_WITH_AES_256_SHA256,
+	TLS1_CK_DHE_DSS_WITH_AES_256_SHA256,
+	SSL_kEDH,
+	SSL_aDSS,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher 6B */
+	{
+	1,
+	TLS1_TXT_DHE_RSA_WITH_AES_256_SHA256,
+	TLS1_CK_DHE_RSA_WITH_AES_256_SHA256,
+	SSL_kEDH,
+	SSL_aRSA,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher 6C */
+	{
+	1,
+	TLS1_TXT_ADH_WITH_AES_128_SHA256,
+	TLS1_CK_ADH_WITH_AES_128_SHA256,
+	SSL_kEDH,
+	SSL_aNULL,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher 6D */
+	{
+	1,
+	TLS1_TXT_ADH_WITH_AES_256_SHA256,
+	TLS1_CK_ADH_WITH_AES_256_SHA256,
+	SSL_kEDH,
+	SSL_aNULL,
+	SSL_AES256,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* GOST Ciphersuites */
+
 	{
 	1,
 	"GOST94-GOST89-GOST89",
@@ -1610,6 +1823,200 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 
 #endif /* OPENSSL_NO_SEED */
 
+	/* GCM ciphersuites from RFC5288 */
+
+	/* Cipher 9C */
+	{
+	1,
+	TLS1_TXT_RSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_RSA_WITH_AES_128_GCM_SHA256,
+	SSL_kRSA,
+	SSL_aRSA,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher 9D */
+	{
+	1,
+	TLS1_TXT_RSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_RSA_WITH_AES_256_GCM_SHA384,
+	SSL_kRSA,
+	SSL_aRSA,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher 9E */
+	{
+	1,
+	TLS1_TXT_DHE_RSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_DHE_RSA_WITH_AES_128_GCM_SHA256,
+	SSL_kEDH,
+	SSL_aRSA,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher 9F */
+	{
+	1,
+	TLS1_TXT_DHE_RSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_DHE_RSA_WITH_AES_256_GCM_SHA384,
+	SSL_kEDH,
+	SSL_aRSA,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher A0 */
+	{
+	0,
+	TLS1_TXT_DH_RSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_DH_RSA_WITH_AES_128_GCM_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher A1 */
+	{
+	0,
+	TLS1_TXT_DH_RSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_DH_RSA_WITH_AES_256_GCM_SHA384,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher A2 */
+	{
+	1,
+	TLS1_TXT_DHE_DSS_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_DHE_DSS_WITH_AES_128_GCM_SHA256,
+	SSL_kEDH,
+	SSL_aDSS,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher A3 */
+	{
+	1,
+	TLS1_TXT_DHE_DSS_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_DHE_DSS_WITH_AES_256_GCM_SHA384,
+	SSL_kEDH,
+	SSL_aDSS,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher A4 */
+	{
+	0,
+	TLS1_TXT_DH_DSS_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_DH_DSS_WITH_AES_128_GCM_SHA256,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher A5 */
+	{
+	0,
+	TLS1_TXT_DH_DSS_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_DH_DSS_WITH_AES_256_GCM_SHA384,
+	SSL_kDHr,
+	SSL_aDH,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher A6 */
+	{
+	1,
+	TLS1_TXT_ADH_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_ADH_WITH_AES_128_GCM_SHA256,
+	SSL_kEDH,
+	SSL_aNULL,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher A7 */
+	{
+	1,
+	TLS1_TXT_ADH_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_ADH_WITH_AES_256_GCM_SHA384,
+	SSL_kEDH,
+	SSL_aNULL,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
 #ifndef OPENSSL_NO_ECDH
 	/* Cipher C001 */
 	{
@@ -1621,7 +2028,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	SSL_eNULL,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_STRONG_NONE,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	0,
 	0,
@@ -1653,7 +2060,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	SSL_3DES,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	168,
 	168,
@@ -1669,7 +2076,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	SSL_AES128,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	128,
 	128,
@@ -1685,7 +2092,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	SSL_AES256,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	256,
 	256,
@@ -1701,7 +2108,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	SSL_eNULL,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_STRONG_NONE,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	0,
 	0,
@@ -1733,7 +2140,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	SSL_3DES,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	168,
 	168,
@@ -1749,7 +2156,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	SSL_AES128,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	128,
 	128,
@@ -1765,7 +2172,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	SSL_AES256,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	256,
 	256,
@@ -1781,7 +2188,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	SSL_eNULL,
 	SSL_SHA1,
 	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_STRONG_NONE,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
 	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
 	0,
 	0,
@@ -1803,214 +2210,624 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={
 	128,
 	},
 
-	/* Cipher C00D */
+	/* Cipher C00D */
+	{
+	1,
+	TLS1_TXT_ECDH_RSA_WITH_DES_192_CBC3_SHA,
+	TLS1_CK_ECDH_RSA_WITH_DES_192_CBC3_SHA,
+	SSL_kECDHr,
+	SSL_aECDH,
+	SSL_3DES,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	168,
+	168,
+	},
+
+	/* Cipher C00E */
+	{
+	1,
+	TLS1_TXT_ECDH_RSA_WITH_AES_128_CBC_SHA,
+	TLS1_CK_ECDH_RSA_WITH_AES_128_CBC_SHA,
+	SSL_kECDHr,
+	SSL_aECDH,
+	SSL_AES128,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher C00F */
+	{
+	1,
+	TLS1_TXT_ECDH_RSA_WITH_AES_256_CBC_SHA,
+	TLS1_CK_ECDH_RSA_WITH_AES_256_CBC_SHA,
+	SSL_kECDHr,
+	SSL_aECDH,
+	SSL_AES256,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher C010 */
+	{
+	1,
+	TLS1_TXT_ECDHE_RSA_WITH_NULL_SHA,
+	TLS1_CK_ECDHE_RSA_WITH_NULL_SHA,
+	SSL_kEECDH,
+	SSL_aRSA,
+	SSL_eNULL,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	0,
+	0,
+	},
+
+	/* Cipher C011 */
+	{
+	1,
+	TLS1_TXT_ECDHE_RSA_WITH_RC4_128_SHA,
+	TLS1_CK_ECDHE_RSA_WITH_RC4_128_SHA,
+	SSL_kEECDH,
+	SSL_aRSA,
+	SSL_RC4,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_MEDIUM,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher C012 */
+	{
+	1,
+	TLS1_TXT_ECDHE_RSA_WITH_DES_192_CBC3_SHA,
+	TLS1_CK_ECDHE_RSA_WITH_DES_192_CBC3_SHA,
+	SSL_kEECDH,
+	SSL_aRSA,
+	SSL_3DES,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	168,
+	168,
+	},
+
+	/* Cipher C013 */
+	{
+	1,
+	TLS1_TXT_ECDHE_RSA_WITH_AES_128_CBC_SHA,
+	TLS1_CK_ECDHE_RSA_WITH_AES_128_CBC_SHA,
+	SSL_kEECDH,
+	SSL_aRSA,
+	SSL_AES128,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher C014 */
+	{
+	1,
+	TLS1_TXT_ECDHE_RSA_WITH_AES_256_CBC_SHA,
+	TLS1_CK_ECDHE_RSA_WITH_AES_256_CBC_SHA,
+	SSL_kEECDH,
+	SSL_aRSA,
+	SSL_AES256,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher C015 */
+	{
+	1,
+	TLS1_TXT_ECDH_anon_WITH_NULL_SHA,
+	TLS1_CK_ECDH_anon_WITH_NULL_SHA,
+	SSL_kEECDH,
+	SSL_aNULL,
+	SSL_eNULL,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	0,
+	0,
+	},
+
+	/* Cipher C016 */
+	{
+	1,
+	TLS1_TXT_ECDH_anon_WITH_RC4_128_SHA,
+	TLS1_CK_ECDH_anon_WITH_RC4_128_SHA,
+	SSL_kEECDH,
+	SSL_aNULL,
+	SSL_RC4,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_MEDIUM,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher C017 */
+	{
+	1,
+	TLS1_TXT_ECDH_anon_WITH_DES_192_CBC3_SHA,
+	TLS1_CK_ECDH_anon_WITH_DES_192_CBC3_SHA,
+	SSL_kEECDH,
+	SSL_aNULL,
+	SSL_3DES,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	168,
+	168,
+	},
+
+	/* Cipher C018 */
+	{
+	1,
+	TLS1_TXT_ECDH_anon_WITH_AES_128_CBC_SHA,
+	TLS1_CK_ECDH_anon_WITH_AES_128_CBC_SHA,
+	SSL_kEECDH,
+	SSL_aNULL,
+	SSL_AES128,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher C019 */
+	{
+	1,
+	TLS1_TXT_ECDH_anon_WITH_AES_256_CBC_SHA,
+	TLS1_CK_ECDH_anon_WITH_AES_256_CBC_SHA,
+	SSL_kEECDH,
+	SSL_aNULL,
+	SSL_AES256,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+#endif	/* OPENSSL_NO_ECDH */
+
+#ifndef OPENSSL_NO_SRP
+	/* Cipher C01A */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_WITH_3DES_EDE_CBC_SHA,
+	TLS1_CK_SRP_SHA_WITH_3DES_EDE_CBC_SHA,
+	SSL_kSRP,
+	SSL_aNULL,
+	SSL_3DES,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	168,
+	168,
+	},
+
+	/* Cipher C01B */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA,
+	TLS1_CK_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA,
+	SSL_kSRP,
+	SSL_aRSA,
+	SSL_3DES,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	168,
+	168,
+	},
+
+	/* Cipher C01C */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA,
+	TLS1_CK_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA,
+	SSL_kSRP,
+	SSL_aDSS,
+	SSL_3DES,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	168,
+	168,
+	},
+
+	/* Cipher C01D */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_WITH_AES_128_CBC_SHA,
+	TLS1_CK_SRP_SHA_WITH_AES_128_CBC_SHA,
+	SSL_kSRP,
+	SSL_aNULL,
+	SSL_AES128,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher C01E */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_RSA_WITH_AES_128_CBC_SHA,
+	TLS1_CK_SRP_SHA_RSA_WITH_AES_128_CBC_SHA,
+	SSL_kSRP,
+	SSL_aRSA,
+	SSL_AES128,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher C01F */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_DSS_WITH_AES_128_CBC_SHA,
+	TLS1_CK_SRP_SHA_DSS_WITH_AES_128_CBC_SHA,
+	SSL_kSRP,
+	SSL_aDSS,
+	SSL_AES128,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	128,
+	128,
+	},
+
+	/* Cipher C020 */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_WITH_AES_256_CBC_SHA,
+	TLS1_CK_SRP_SHA_WITH_AES_256_CBC_SHA,
+	SSL_kSRP,
+	SSL_aNULL,
+	SSL_AES256,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher C021 */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_RSA_WITH_AES_256_CBC_SHA,
+	TLS1_CK_SRP_SHA_RSA_WITH_AES_256_CBC_SHA,
+	SSL_kSRP,
+	SSL_aRSA,
+	SSL_AES256,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+
+	/* Cipher C022 */
+	{
+	1,
+	TLS1_TXT_SRP_SHA_DSS_WITH_AES_256_CBC_SHA,
+	TLS1_CK_SRP_SHA_DSS_WITH_AES_256_CBC_SHA,
+	SSL_kSRP,
+	SSL_aDSS,
+	SSL_AES256,
+	SSL_SHA1,
+	SSL_TLSV1,
+	SSL_NOT_EXP|SSL_HIGH,
+	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	256,
+	256,
+	},
+#endif  /* OPENSSL_NO_SRP */
+#ifndef OPENSSL_NO_ECDH
+
+	/* HMAC based TLS v1.2 ciphersuites from RFC5289 */
+
+	/* Cipher C023 */
+	{
+	1,
+	TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_SHA256,
+	TLS1_CK_ECDHE_ECDSA_WITH_AES_128_SHA256,
+	SSL_kEECDH,
+	SSL_aECDSA,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
+	},
+
+	/* Cipher C024 */
 	{
 	1,
-	TLS1_TXT_ECDH_RSA_WITH_DES_192_CBC3_SHA,
-	TLS1_CK_ECDH_RSA_WITH_DES_192_CBC3_SHA,
-	SSL_kECDHr,
-	SSL_aECDH,
-	SSL_3DES,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
-	168,
-	168,
+	TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_SHA384,
+	TLS1_CK_ECDHE_ECDSA_WITH_AES_256_SHA384,
+	SSL_kEECDH,
+	SSL_aECDSA,
+	SSL_AES256,
+	SSL_SHA384,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
 	},
 
-	/* Cipher C00E */
+	/* Cipher C025 */
 	{
 	1,
-	TLS1_TXT_ECDH_RSA_WITH_AES_128_CBC_SHA,
-	TLS1_CK_ECDH_RSA_WITH_AES_128_CBC_SHA,
-	SSL_kECDHr,
+	TLS1_TXT_ECDH_ECDSA_WITH_AES_128_SHA256,
+	TLS1_CK_ECDH_ECDSA_WITH_AES_128_SHA256,
+	SSL_kECDHe,
 	SSL_aECDH,
 	SSL_AES128,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
 	128,
 	128,
 	},
 
-	/* Cipher C00F */
+	/* Cipher C026 */
 	{
 	1,
-	TLS1_TXT_ECDH_RSA_WITH_AES_256_CBC_SHA,
-	TLS1_CK_ECDH_RSA_WITH_AES_256_CBC_SHA,
-	SSL_kECDHr,
+	TLS1_TXT_ECDH_ECDSA_WITH_AES_256_SHA384,
+	TLS1_CK_ECDH_ECDSA_WITH_AES_256_SHA384,
+	SSL_kECDHe,
 	SSL_aECDH,
 	SSL_AES256,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	SSL_SHA384,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
 	256,
 	256,
 	},
 
-	/* Cipher C010 */
+	/* Cipher C027 */
 	{
 	1,
-	TLS1_TXT_ECDHE_RSA_WITH_NULL_SHA,
-	TLS1_CK_ECDHE_RSA_WITH_NULL_SHA,
+	TLS1_TXT_ECDHE_RSA_WITH_AES_128_SHA256,
+	TLS1_CK_ECDHE_RSA_WITH_AES_128_SHA256,
 	SSL_kEECDH,
 	SSL_aRSA,
-	SSL_eNULL,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_STRONG_NONE,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
-	0,
-	0,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
 	},
 
-	/* Cipher C011 */
+	/* Cipher C028 */
 	{
 	1,
-	TLS1_TXT_ECDHE_RSA_WITH_RC4_128_SHA,
-	TLS1_CK_ECDHE_RSA_WITH_RC4_128_SHA,
+	TLS1_TXT_ECDHE_RSA_WITH_AES_256_SHA384,
+	TLS1_CK_ECDHE_RSA_WITH_AES_256_SHA384,
 	SSL_kEECDH,
 	SSL_aRSA,
-	SSL_RC4,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_MEDIUM,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	SSL_AES256,
+	SSL_SHA384,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher C029 */
+	{
+	1,
+	TLS1_TXT_ECDH_RSA_WITH_AES_128_SHA256,
+	TLS1_CK_ECDH_RSA_WITH_AES_128_SHA256,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES128,
+	SSL_SHA256,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
 	128,
 	128,
 	},
 
-	/* Cipher C012 */
+	/* Cipher C02A */
 	{
 	1,
-	TLS1_TXT_ECDHE_RSA_WITH_DES_192_CBC3_SHA,
-	TLS1_CK_ECDHE_RSA_WITH_DES_192_CBC3_SHA,
-	SSL_kEECDH,
-	SSL_aRSA,
-	SSL_3DES,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
-	168,
-	168,
+	TLS1_TXT_ECDH_RSA_WITH_AES_256_SHA384,
+	TLS1_CK_ECDH_RSA_WITH_AES_256_SHA384,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES256,
+	SSL_SHA384,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
 	},
 
-	/* Cipher C013 */
+	/* GCM based TLS v1.2 ciphersuites from RFC5289 */
+
+	/* Cipher C02B */
 	{
 	1,
-	TLS1_TXT_ECDHE_RSA_WITH_AES_128_CBC_SHA,
-	TLS1_CK_ECDHE_RSA_WITH_AES_128_CBC_SHA,
+	TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,
 	SSL_kEECDH,
-	SSL_aRSA,
-	SSL_AES128,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	SSL_aECDSA,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
 	128,
 	128,
 	},
 
-	/* Cipher C014 */
+	/* Cipher C02C */
 	{
 	1,
-	TLS1_TXT_ECDHE_RSA_WITH_AES_256_CBC_SHA,
-	TLS1_CK_ECDHE_RSA_WITH_AES_256_CBC_SHA,
+	TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,
 	SSL_kEECDH,
-	SSL_aRSA,
-	SSL_AES256,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	SSL_aECDSA,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
 	256,
 	256,
 	},
 
-	/* Cipher C015 */
+	/* Cipher C02D */
 	{
 	1,
-	TLS1_TXT_ECDH_anon_WITH_NULL_SHA,
-	TLS1_CK_ECDH_anon_WITH_NULL_SHA,
-	SSL_kEECDH,
-	SSL_aNULL,
-	SSL_eNULL,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_STRONG_NONE,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
-	0,
-	0,
+	TLS1_TXT_ECDH_ECDSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_ECDH_ECDSA_WITH_AES_128_GCM_SHA256,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
+	128,
+	128,
 	},
 
-	/* Cipher C016 */
+	/* Cipher C02E */
 	{
 	1,
-	TLS1_TXT_ECDH_anon_WITH_RC4_128_SHA,
-	TLS1_CK_ECDH_anon_WITH_RC4_128_SHA,
+	TLS1_TXT_ECDH_ECDSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_ECDH_ECDSA_WITH_AES_256_GCM_SHA384,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
+	},
+
+	/* Cipher C02F */
+	{
+	1,
+	TLS1_TXT_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_ECDHE_RSA_WITH_AES_128_GCM_SHA256,
 	SSL_kEECDH,
-	SSL_aNULL,
-	SSL_RC4,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_MEDIUM,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	SSL_aRSA,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
 	128,
 	128,
 	},
 
-	/* Cipher C017 */
+	/* Cipher C030 */
 	{
 	1,
-	TLS1_TXT_ECDH_anon_WITH_DES_192_CBC3_SHA,
-	TLS1_CK_ECDH_anon_WITH_DES_192_CBC3_SHA,
+	TLS1_TXT_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_ECDHE_RSA_WITH_AES_256_GCM_SHA384,
 	SSL_kEECDH,
-	SSL_aNULL,
-	SSL_3DES,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
-	168,
-	168,
+	SSL_aRSA,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
+	256,
+	256,
 	},
 
-	/* Cipher C018 */
+	/* Cipher C031 */
 	{
 	1,
-	TLS1_TXT_ECDH_anon_WITH_AES_128_CBC_SHA,
-	TLS1_CK_ECDH_anon_WITH_AES_128_CBC_SHA,
-	SSL_kEECDH,
-	SSL_aNULL,
-	SSL_AES128,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256,
+	TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES128GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256,
 	128,
 	128,
 	},
 
-	/* Cipher C019 */
+	/* Cipher C032 */
 	{
 	1,
-	TLS1_TXT_ECDH_anon_WITH_AES_256_CBC_SHA,
-	TLS1_CK_ECDH_anon_WITH_AES_256_CBC_SHA,
-	SSL_kEECDH,
-	SSL_aNULL,
-	SSL_AES256,
-	SSL_SHA1,
-	SSL_TLSV1,
-	SSL_NOT_EXP|SSL_HIGH,
-	SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF,
+	TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384,
+	TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384,
+	SSL_kECDHe,
+	SSL_aECDH,
+	SSL_AES256GCM,
+	SSL_AEAD,
+	SSL_TLSV1_2,
+	SSL_NOT_EXP|SSL_HIGH|SSL_FIPS,
+	SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384,
 	256,
 	256,
 	},
-#endif	/* OPENSSL_NO_ECDH */
+
+#endif /* OPENSSL_NO_ECDH */
+
 
 #ifdef TEMP_GOST_TLS
 /* Cipher FF00 */
@@ -2087,6 +2904,9 @@ SSL3_ENC_METHOD SSLv3_enc_data={
 	SSL3_MD_CLIENT_FINISHED_CONST,4,
 	SSL3_MD_SERVER_FINISHED_CONST,4,
 	ssl3_alert_code,
+	(int (*)(SSL *, unsigned char *, size_t, const char *,
+		 size_t, const unsigned char *, size_t,
+		 int use_context))ssl_undefined_function,
 	};
 
 long ssl3_default_timeout(void)
@@ -2128,6 +2948,9 @@ int ssl3_new(SSL *s)
 
 	s->s3=s3;
 
+#ifndef OPENSSL_NO_SRP
+	SSL_SRP_CTX_init(s);
+#endif
 	s->method->ssl_clear(s);
 	return(1);
 err:
@@ -2168,6 +2991,9 @@ void ssl3_free(SSL *s)
 		BIO_free(s->s3->handshake_buffer);
 	}
 	if (s->s3->handshake_dgst) ssl3_free_digest_list(s);
+#ifndef OPENSSL_NO_SRP
+	SSL_SRP_CTX_free(s);
+#endif
 	OPENSSL_cleanse(s->s3,sizeof *s->s3);
 	OPENSSL_free(s->s3);
 	s->s3=NULL;
@@ -2239,7 +3065,23 @@ void ssl3_clear(SSL *s)
 	s->s3->num_renegotiations=0;
 	s->s3->in_read_app_data=0;
 	s->version=SSL3_VERSION;
+
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+	if (s->next_proto_negotiated)
+		{
+		OPENSSL_free(s->next_proto_negotiated);
+		s->next_proto_negotiated = NULL;
+		s->next_proto_negotiated_len = 0;
+		}
+#endif
+	}
+
+#ifndef OPENSSL_NO_SRP
+static char * MS_CALLBACK srp_password_from_info_cb(SSL *s, void *arg)
+	{
+	return BUF_strdup(s->srp_ctx.info) ;
 	}
+#endif
 
 long ssl3_ctrl(SSL *s, int cmd, long larg, void *parg)
 	{
@@ -2486,6 +3328,27 @@ long ssl3_ctrl(SSL *s, int cmd, long larg, void *parg)
 		ret = 1;
 		break;
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	case SSL_CTRL_TLS_EXT_SEND_HEARTBEAT:
+		if (SSL_version(s) == DTLS1_VERSION || SSL_version(s) == DTLS1_BAD_VER)
+			ret = dtls1_heartbeat(s);
+		else
+			ret = tls1_heartbeat(s);
+		break;
+
+	case SSL_CTRL_GET_TLS_EXT_HEARTBEAT_PENDING:
+		ret = s->tlsext_hb_pending;
+		break;
+
+	case SSL_CTRL_SET_TLS_EXT_HEARTBEAT_NO_REQUESTS:
+		if (larg)
+			s->tlsext_heartbeat |= SSL_TLSEXT_HB_DONT_RECV_REQUESTS;
+		else
+			s->tlsext_heartbeat &= ~SSL_TLSEXT_HB_DONT_RECV_REQUESTS;
+		ret = 1;
+		break;
+#endif
+
 #endif /* !OPENSSL_NO_TLSEXT */
 	default:
 		break;
@@ -2718,6 +3581,38 @@ long ssl3_ctx_ctrl(SSL_CTX *ctx, int cmd, long larg, void *parg)
 		return 1;
 		break;
 
+#ifndef OPENSSL_NO_SRP
+	case SSL_CTRL_SET_TLS_EXT_SRP_USERNAME:
+		ctx->srp_ctx.srp_Mask|=SSL_kSRP;
+		if (ctx->srp_ctx.login != NULL)
+			OPENSSL_free(ctx->srp_ctx.login);
+		ctx->srp_ctx.login = NULL;
+		if (parg == NULL)
+			break;
+		if (strlen((const char *)parg) > 255 || strlen((const char *)parg) < 1)
+			{
+			SSLerr(SSL_F_SSL3_CTX_CTRL, SSL_R_INVALID_SRP_USERNAME);
+			return 0;
+			} 
+		if ((ctx->srp_ctx.login = BUF_strdup((char *)parg)) == NULL)
+			{
+			SSLerr(SSL_F_SSL3_CTX_CTRL, ERR_R_INTERNAL_ERROR);
+			return 0;
+			}
+		break;
+	case SSL_CTRL_SET_TLS_EXT_SRP_PASSWORD:
+		ctx->srp_ctx.SRP_give_srp_client_pwd_callback=srp_password_from_info_cb;
+		ctx->srp_ctx.info=parg;
+		break;
+	case SSL_CTRL_SET_SRP_ARG:
+		ctx->srp_ctx.srp_Mask|=SSL_kSRP;
+		ctx->srp_ctx.SRP_cb_arg=parg;
+		break;
+
+	case SSL_CTRL_SET_TLS_EXT_SRP_STRENGTH:
+		ctx->srp_ctx.strength=larg;
+		break;
+#endif
 #endif /* !OPENSSL_NO_TLSEXT */
 
 	/* A Thawte special :-) */
@@ -2730,6 +3625,18 @@ long ssl3_ctx_ctrl(SSL_CTX *ctx, int cmd, long larg, void *parg)
 		sk_X509_push(ctx->extra_certs,(X509 *)parg);
 		break;
 
+	case SSL_CTRL_GET_EXTRA_CHAIN_CERTS:
+		*(STACK_OF(X509) **)parg =  ctx->extra_certs;
+		break;
+
+	case SSL_CTRL_CLEAR_EXTRA_CHAIN_CERTS:
+		if (ctx->extra_certs)
+			{
+			sk_X509_pop_free(ctx->extra_certs, X509_free);
+			ctx->extra_certs = NULL;
+			}
+		break;
+
 	default:
 		return(0);
 		}
@@ -2787,6 +3694,20 @@ long ssl3_ctx_callback_ctrl(SSL_CTX *ctx, int cmd, void (*fp)(void))
 						HMAC_CTX *, int))fp;
 		break;
 
+#ifndef OPENSSL_NO_SRP
+	case SSL_CTRL_SET_SRP_VERIFY_PARAM_CB:
+		ctx->srp_ctx.srp_Mask|=SSL_kSRP;
+		ctx->srp_ctx.SRP_verify_param_callback=(int (*)(SSL *,void *))fp;
+		break;
+	case SSL_CTRL_SET_TLS_EXT_SRP_USERNAME_CB:
+		ctx->srp_ctx.srp_Mask|=SSL_kSRP;
+		ctx->srp_ctx.TLS_ext_srp_username_callback=(int (*)(SSL *,int *,void *))fp;
+		break;
+	case SSL_CTRL_SET_SRP_GIVE_CLIENT_PWD_CB:
+		ctx->srp_ctx.srp_Mask|=SSL_kSRP;
+		ctx->srp_ctx.SRP_give_srp_client_pwd_callback=(char *(*)(SSL *,void *))fp;
+		break;
+#endif
 #endif
 	default:
 		return(0);
@@ -2805,6 +3726,9 @@ const SSL_CIPHER *ssl3_get_cipher_by_char(const unsigned char *p)
 	id=0x03000000L|((unsigned long)p[0]<<8L)|(unsigned long)p[1];
 	c.id=id;
 	cp = OBJ_bsearch_ssl_cipher_id(&c, ssl3_ciphers, SSL3_NUM_CIPHERS);
+#ifdef DEBUG_PRINT_UNKNOWN_CIPHERSUITES
+if (cp == NULL) fprintf(stderr, "Unknown cipher ID %x\n", (p[0] << 8) | p[1]);
+#endif
 	if (cp == NULL || cp->valid == 0)
 		return NULL;
 	else
@@ -2882,11 +3806,20 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt,
 		{
 		c=sk_SSL_CIPHER_value(prio,i);
 
+		/* Skip TLS v1.2 only ciphersuites if lower than v1.2 */
+		if ((c->algorithm_ssl & SSL_TLSV1_2) && 
+			(TLS1_get_version(s) < TLS1_2_VERSION))
+			continue;
+
 		ssl_set_cert_masks(cert,c);
 		mask_k = cert->mask_k;
 		mask_a = cert->mask_a;
 		emask_k = cert->export_mask_k;
 		emask_a = cert->export_mask_a;
+#ifndef OPENSSL_NO_SRP
+		mask_k=cert->mask_k | s->srp_ctx.srp_Mask;
+		emask_k=cert->export_mask_k | s->srp_ctx.srp_Mask;
+#endif
 			
 #ifdef KSSL_DEBUG
 /*		printf("ssl3_choose_cipher %d alg= %lx\n", i,c->algorithms);*/
@@ -3335,4 +4268,15 @@ need to go to SSL_ST_ACCEPT.
 		}
 	return(ret);
 	}
-
+/* If we are using TLS v1.2 or later and default SHA1+MD5 algorithms switch
+ * to new SHA256 PRF and handshake macs
+ */
+long ssl_get_algorithm2(SSL *s)
+	{
+	long alg2 = s->s3->tmp.new_cipher->algorithm2;
+	if (TLS1_get_version(s) >= TLS1_2_VERSION &&
+	    alg2 == (SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF))
+		return SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256;
+	return alg2;
+	}
+		
diff --git a/src/lib/libssl/s3_pkt.c b/src/lib/libssl/s3_pkt.c
index f9b3629cf7..adf8c387cc 100644
--- a/src/lib/libssl/s3_pkt.c
+++ b/src/lib/libssl/s3_pkt.c
@@ -115,6 +115,7 @@
 #include "ssl_locl.h"
 #include <openssl/evp.h>
 #include <openssl/buffer.h>
+#include <openssl/rand.h>
 
 static int do_ssl3_write(SSL *s, int type, const unsigned char *buf,
 			 unsigned int len, int create_empty_fragment);
@@ -630,6 +631,7 @@ static int do_ssl3_write(SSL *s, int type, const unsigned char *buf,
 	unsigned char *p,*plen;
 	int i,mac_size,clear=0;
 	int prefix_len=0;
+	int eivlen;
 	long align=0;
 	SSL3_RECORD *wr;
 	SSL3_BUFFER *wb=&(s->s3->wbuf);
@@ -662,10 +664,14 @@ static int do_ssl3_write(SSL *s, int type, const unsigned char *buf,
 	if (	(sess == NULL) ||
 		(s->enc_write_ctx == NULL) ||
 		(EVP_MD_CTX_md(s->write_hash) == NULL))
+		{
+#if 1
+		clear=s->enc_write_ctx?0:1;	/* must be AEAD cipher */
+#else
 		clear=1;
-
-	if (clear)
+#endif
 		mac_size=0;
+		}
 	else
 		{
 		mac_size=EVP_MD_CTX_size(s->write_hash);
@@ -734,14 +740,39 @@ static int do_ssl3_write(SSL *s, int type, const unsigned char *buf,
 	wr->type=type;
 
 	*(p++)=(s->version>>8);
-	*(p++)=s->version&0xff;
+	/* Some servers hang if iniatial client hello is larger than 256
+	 * bytes and record version number > TLS 1.0
+	 */
+	if (s->state == SSL3_ST_CW_CLNT_HELLO_B
+				&& TLS1_get_version(s) > TLS1_VERSION)
+		*(p++) = 0x1;
+	else
+		*(p++)=s->version&0xff;
 
 	/* field where we are to write out packet length */
 	plen=p; 
 	p+=2;
+	/* Explicit IV length, block ciphers and TLS version 1.1 or later */
+	if (s->enc_write_ctx && s->version >= TLS1_1_VERSION)
+		{
+		int mode = EVP_CIPHER_CTX_mode(s->enc_write_ctx);
+		if (mode == EVP_CIPH_CBC_MODE)
+			{
+			eivlen = EVP_CIPHER_CTX_iv_length(s->enc_write_ctx);
+			if (eivlen <= 1)
+				eivlen = 0;
+			}
+		/* Need explicit part of IV for GCM mode */
+		else if (mode == EVP_CIPH_GCM_MODE)
+			eivlen = EVP_GCM_TLS_EXPLICIT_IV_LEN;
+		else
+			eivlen = 0;
+		}
+	else 
+		eivlen = 0;
 
 	/* lets setup the record stuff. */
-	wr->data=p;
+	wr->data=p + eivlen;
 	wr->length=(int)len;
 	wr->input=(unsigned char *)buf;
 
@@ -769,11 +800,19 @@ static int do_ssl3_write(SSL *s, int type, const unsigned char *buf,
 
 	if (mac_size != 0)
 		{
-		if (s->method->ssl3_enc->mac(s,&(p[wr->length]),1) < 0)
+		if (s->method->ssl3_enc->mac(s,&(p[wr->length + eivlen]),1) < 0)
 			goto err;
 		wr->length+=mac_size;
-		wr->input=p;
-		wr->data=p;
+		}
+
+	wr->input=p;
+	wr->data=p;
+
+	if (eivlen)
+		{
+	/*	if (RAND_pseudo_bytes(p, eivlen) <= 0)
+			goto err; */
+		wr->length += eivlen;
 		}
 
 	/* ssl3_enc can only have an error on read */
@@ -1042,6 +1081,19 @@ start:
 			dest = s->s3->alert_fragment;
 			dest_len = &s->s3->alert_fragment_len;
 			}
+#ifndef OPENSSL_NO_HEARTBEATS
+		else if (rr->type == TLS1_RT_HEARTBEAT)
+			{
+			tls1_process_heartbeat(s);
+
+			/* Exit and notify application to read again */
+			rr->length = 0;
+			s->rwstate=SSL_READING;
+			BIO_clear_retry_flags(SSL_get_rbio(s));
+			BIO_set_retry_read(SSL_get_rbio(s));
+			return(-1);
+			}
+#endif
 
 		if (dest_maxlen > 0)
 			{
@@ -1185,6 +1237,10 @@ start:
 				SSLerr(SSL_F_SSL3_READ_BYTES,SSL_R_NO_RENEGOTIATION);
 				goto f_err;
 				}
+#ifdef SSL_AD_MISSING_SRP_USERNAME
+			if (alert_descr == SSL_AD_MISSING_SRP_USERNAME)
+				return(0);
+#endif
 			}
 		else if (alert_level == 2) /* fatal */
 			{
@@ -1263,6 +1319,7 @@ start:
 #else
 			s->state = s->server ? SSL_ST_ACCEPT : SSL_ST_CONNECT;
 #endif
+			s->renegotiate=1;
 			s->new_session=1;
 			}
 		i=s->handshake_func(s);
@@ -1296,8 +1353,10 @@ start:
 		{
 	default:
 #ifndef OPENSSL_NO_TLS
-		/* TLS just ignores unknown message types */
-		if (s->version == TLS1_VERSION)
+		/* TLS up to v1.1 just ignores unknown message types:
+		 * TLS v1.2 give an unexpected message alert.
+		 */
+		if (s->version >= TLS1_VERSION && s->version <= TLS1_1_VERSION)
 			{
 			rr->length = 0;
 			goto start;
diff --git a/src/lib/libssl/s3_srvr.c b/src/lib/libssl/s3_srvr.c
index d734c359fb..118939fabb 100644
--- a/src/lib/libssl/s3_srvr.c
+++ b/src/lib/libssl/s3_srvr.c
@@ -179,6 +179,31 @@ static const SSL_METHOD *ssl3_get_server_method(int ver)
 		return(NULL);
 	}
 
+#ifndef OPENSSL_NO_SRP
+static int ssl_check_srp_ext_ClientHello(SSL *s, int *al)
+	{
+	int ret = SSL_ERROR_NONE;
+
+	*al = SSL_AD_UNRECOGNIZED_NAME;
+
+	if ((s->s3->tmp.new_cipher->algorithm_mkey & SSL_kSRP) &&
+	    (s->srp_ctx.TLS_ext_srp_username_callback != NULL))
+		{
+		if(s->srp_ctx.login == NULL)
+			{
+			/* There isn't any srp login extension !!! */
+			ret = SSL3_AL_FATAL;
+			*al = SSL_AD_UNKNOWN_PSK_IDENTITY;
+			}
+		else
+			{
+			ret = SSL_srp_server_param_with_username(s,al);
+			}
+		}
+	return ret;
+	}
+#endif
+
 IMPLEMENT_ssl3_meth_func(SSLv3_server_method,
 			ssl3_accept,
 			ssl_undefined_function,
@@ -211,6 +236,18 @@ int ssl3_accept(SSL *s)
 		return(-1);
 		}
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* If we're awaiting a HeartbeatResponse, pretend we
+	 * already got and don't await it anymore, because
+	 * Heartbeats don't make sense during handshakes anyway.
+	 */
+	if (s->tlsext_hb_pending)
+		{
+		s->tlsext_hb_pending = 0;
+		s->tlsext_hb_seq++;
+		}
+#endif
+
 	for (;;)
 		{
 		state=s->state;
@@ -218,7 +255,7 @@ int ssl3_accept(SSL *s)
 		switch (s->state)
 			{
 		case SSL_ST_RENEGOTIATE:
-			s->new_session=1;
+			s->renegotiate=1;
 			/* s->state=SSL_ST_ACCEPT; */
 
 		case SSL_ST_BEFORE:
@@ -314,10 +351,34 @@ int ssl3_accept(SSL *s)
 		case SSL3_ST_SR_CLNT_HELLO_C:
 
 			s->shutdown=0;
-			ret=ssl3_get_client_hello(s);
-			if (ret <= 0) goto end;
-			
-			s->new_session = 2;
+			if (s->rwstate != SSL_X509_LOOKUP)
+			{
+				ret=ssl3_get_client_hello(s);
+				if (ret <= 0) goto end;
+			}
+#ifndef OPENSSL_NO_SRP
+			{
+			int al;
+			if ((ret = ssl_check_srp_ext_ClientHello(s,&al))  < 0)
+					{
+					/* callback indicates firther work to be done */
+					s->rwstate=SSL_X509_LOOKUP;
+					goto end;
+					}
+			if (ret != SSL_ERROR_NONE)
+				{
+				ssl3_send_alert(s,SSL3_AL_FATAL,al);	
+				/* This is not really an error but the only means to
+                                   for a client to detect whether srp is supported. */
+ 				   if (al != TLS1_AD_UNKNOWN_PSK_IDENTITY) 	
+					SSLerr(SSL_F_SSL3_ACCEPT,SSL_R_CLIENTHELLO_TLSEXT);			
+				ret = SSL_TLSEXT_ERR_ALERT_FATAL;			
+				ret= -1;
+				goto end;	
+				}
+			}
+#endif		
+			s->renegotiate = 2;
 			s->state=SSL3_ST_SW_SRVR_HELLO_A;
 			s->init_num=0;
 			break;
@@ -346,7 +407,7 @@ int ssl3_accept(SSL *s)
 		case SSL3_ST_SW_CERT_A:
 		case SSL3_ST_SW_CERT_B:
 			/* Check if it is anon DH or anon ECDH, */
-			/* normal PSK or KRB5 */
+			/* normal PSK or KRB5 or SRP */
 			if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL)
 				&& !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK)
 				&& !(s->s3->tmp.new_cipher->algorithm_auth & SSL_aKRB5))
@@ -410,6 +471,10 @@ int ssl3_accept(SSL *s)
 			 * hint if provided */
 #ifndef OPENSSL_NO_PSK
 			    || ((alg_k & SSL_kPSK) && s->ctx->psk_identity_hint)
+#endif
+#ifndef OPENSSL_NO_SRP
+			    /* SRP: send ServerKeyExchange */
+			    || (alg_k & SSL_kSRP)
 #endif
 			    || (alg_k & (SSL_kDHr|SSL_kDHd|SSL_kEDH))
 			    || (alg_k & SSL_kEECDH)
@@ -457,6 +522,9 @@ int ssl3_accept(SSL *s)
 				skip=1;
 				s->s3->tmp.cert_request=0;
 				s->state=SSL3_ST_SW_SRVR_DONE_A;
+				if (s->s3->handshake_buffer)
+					if (!ssl3_digest_cached_records(s))
+						return -1;
 				}
 			else
 				{
@@ -539,9 +607,34 @@ int ssl3_accept(SSL *s)
 				 * the client uses its key from the certificate
 				 * for key exchange.
 				 */
+#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG)
 				s->state=SSL3_ST_SR_FINISHED_A;
+#else
+				if (s->s3->next_proto_neg_seen)
+					s->state=SSL3_ST_SR_NEXT_PROTO_A;
+				else
+					s->state=SSL3_ST_SR_FINISHED_A;
+#endif
 				s->init_num = 0;
 				}
+			else if (TLS1_get_version(s) >= TLS1_2_VERSION)
+				{
+				s->state=SSL3_ST_SR_CERT_VRFY_A;
+				s->init_num=0;
+				if (!s->session->peer)
+					break;
+				/* For TLS v1.2 freeze the handshake buffer
+				 * at this point and digest cached records.
+				 */
+				if (!s->s3->handshake_buffer)
+					{
+					SSLerr(SSL_F_SSL3_ACCEPT,ERR_R_INTERNAL_ERROR);
+					return -1;
+					}
+				s->s3->flags |= TLS1_FLAGS_KEEP_HANDSHAKE;
+				if (!ssl3_digest_cached_records(s))
+					return -1;
+				}
 			else
 				{
 				int offset=0;
@@ -582,23 +675,37 @@ int ssl3_accept(SSL *s)
 			ret=ssl3_get_cert_verify(s);
 			if (ret <= 0) goto end;
 
+#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG)
 			s->state=SSL3_ST_SR_FINISHED_A;
+#else
+			if (s->s3->next_proto_neg_seen)
+				s->state=SSL3_ST_SR_NEXT_PROTO_A;
+			else
+				s->state=SSL3_ST_SR_FINISHED_A;
+#endif
 			s->init_num=0;
 			break;
 
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+		case SSL3_ST_SR_NEXT_PROTO_A:
+		case SSL3_ST_SR_NEXT_PROTO_B:
+			ret=ssl3_get_next_proto(s);
+			if (ret <= 0) goto end;
+			s->init_num = 0;
+			s->state=SSL3_ST_SR_FINISHED_A;
+			break;
+#endif
+
 		case SSL3_ST_SR_FINISHED_A:
 		case SSL3_ST_SR_FINISHED_B:
 			ret=ssl3_get_finished(s,SSL3_ST_SR_FINISHED_A,
 				SSL3_ST_SR_FINISHED_B);
 			if (ret <= 0) goto end;
-#ifndef OPENSSL_NO_TLSEXT
-			if (s->tlsext_ticket_expected)
-				s->state=SSL3_ST_SW_SESSION_TICKET_A;
-			else if (s->hit)
-				s->state=SSL_ST_OK;
-#else
 			if (s->hit)
 				s->state=SSL_ST_OK;
+#ifndef OPENSSL_NO_TLSEXT
+			else if (s->tlsext_ticket_expected)
+				s->state=SSL3_ST_SW_SESSION_TICKET_A;
 #endif
 			else
 				s->state=SSL3_ST_SW_CHANGE_A;
@@ -656,7 +763,16 @@ int ssl3_accept(SSL *s)
 			if (ret <= 0) goto end;
 			s->state=SSL3_ST_SW_FLUSH;
 			if (s->hit)
+				{
+#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG)
 				s->s3->tmp.next_state=SSL3_ST_SR_FINISHED_A;
+#else
+				if (s->s3->next_proto_neg_seen)
+					s->s3->tmp.next_state=SSL3_ST_SR_NEXT_PROTO_A;
+				else
+					s->s3->tmp.next_state=SSL3_ST_SR_FINISHED_A;
+#endif
+				}
 			else
 				s->s3->tmp.next_state=SSL_ST_OK;
 			s->init_num=0;
@@ -674,11 +790,9 @@ int ssl3_accept(SSL *s)
 
 			s->init_num=0;
 
-			if (s->new_session == 2) /* skipped if we just sent a HelloRequest */
+			if (s->renegotiate == 2) /* skipped if we just sent a HelloRequest */
 				{
-				/* actually not necessarily a 'new' session unless
-				 * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is set */
-				
+				s->renegotiate=0;
 				s->new_session=0;
 				
 				ssl_update_cache(s,SSL_SESS_CACHE_SERVER);
@@ -756,14 +870,6 @@ int ssl3_check_client_hello(SSL *s)
 	int ok;
 	long n;
 
-	/* We only allow the client to restart the handshake once per
-	 * negotiation. */
-	if (s->s3->flags & SSL3_FLAGS_SGC_RESTART_DONE)
-		{
-		SSLerr(SSL_F_SSL3_CHECK_CLIENT_HELLO, SSL_R_MULTIPLE_SGC_RESTARTS);
-		return -1;
-		}
-
 	/* this function is called when we really expect a Certificate message,
 	 * so permit appropriate message length */
 	n=s->method->ssl_get_message(s,
@@ -776,6 +882,13 @@ int ssl3_check_client_hello(SSL *s)
 	s->s3->tmp.reuse_message = 1;
 	if (s->s3->tmp.message_type == SSL3_MT_CLIENT_HELLO)
 		{
+		/* We only allow the client to restart the handshake once per
+		 * negotiation. */
+		if (s->s3->flags & SSL3_FLAGS_SGC_RESTART_DONE)
+			{
+			SSLerr(SSL_F_SSL3_CHECK_CLIENT_HELLO, SSL_R_MULTIPLE_SGC_RESTARTS);
+			return -1;
+			}
 		/* Throw away what we have done so far in the current handshake,
 		 * which will now be aborted. (A full SSL_clear would be too much.) */
 #ifndef OPENSSL_NO_DH
@@ -817,7 +930,8 @@ int ssl3_get_client_hello(SSL *s)
 	 * If we are SSLv3, we will respond with SSLv3, even if prompted with
 	 * TLSv1.
 	 */
-	if (s->state == SSL3_ST_SR_CLNT_HELLO_A)
+	if (s->state == SSL3_ST_SR_CLNT_HELLO_A
+		)
 		{
 		s->state=SSL3_ST_SR_CLNT_HELLO_B;
 		}
@@ -874,13 +988,16 @@ int ssl3_get_client_hello(SSL *s)
 	j= *(p++);
 
 	s->hit=0;
-	/* Versions before 0.9.7 always allow session reuse during renegotiation
-	 * (i.e. when s->new_session is true), option
-	 * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is new with 0.9.7.
-	 * Maybe this optional behaviour should always have been the default,
-	 * but we cannot safely change the default behaviour (or new applications
-	 * might be written that become totally unsecure when compiled with
-	 * an earlier library version)
+	/* Versions before 0.9.7 always allow clients to resume sessions in renegotiation.
+	 * 0.9.7 and later allow this by default, but optionally ignore resumption requests
+	 * with flag SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION (it's a new flag rather
+	 * than a change to default behavior so that applications relying on this for security
+	 * won't even compile against older library versions).
+	 *
+	 * 1.0.1 and later also have a function SSL_renegotiate_abbreviated() to request
+	 * renegotiation but not a new session (s->new_session remains unset): for servers,
+	 * this essentially just means that the SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION
+	 * setting will be ignored.
 	 */
 	if ((s->new_session && (s->options & SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION)))
 		{
@@ -1269,8 +1386,11 @@ int ssl3_get_client_hello(SSL *s)
 		s->s3->tmp.new_cipher=s->session->cipher;
 		}
 
-	if (!ssl3_digest_cached_records(s))
-		goto f_err;
+	if (TLS1_get_version(s) < TLS1_2_VERSION || !(s->verify_mode & SSL_VERIFY_PEER))
+		{
+		if (!ssl3_digest_cached_records(s))
+			goto f_err;
+		}
 	
 	/* we now have the following setup. 
 	 * client_random
@@ -1325,20 +1445,20 @@ int ssl3_send_server_hello(SSL *s)
 		memcpy(p,s->s3->server_random,SSL3_RANDOM_SIZE);
 		p+=SSL3_RANDOM_SIZE;
 
-		/* now in theory we have 3 options to sending back the
-		 * session id.  If it is a re-use, we send back the
-		 * old session-id, if it is a new session, we send
-		 * back the new session-id or we send back a 0 length
-		 * session-id if we want it to be single use.
-		 * Currently I will not implement the '0' length session-id
-		 * 12-Jan-98 - I'll now support the '0' length stuff.
-		 *
-		 * We also have an additional case where stateless session
-		 * resumption is successful: we always send back the old
-		 * session id. In this case s->hit is non zero: this can
-		 * only happen if stateless session resumption is succesful
-		 * if session caching is disabled so existing functionality
-		 * is unaffected.
+		/* There are several cases for the session ID to send
+		 * back in the server hello:
+		 * - For session reuse from the session cache,
+		 *   we send back the old session ID.
+		 * - If stateless session reuse (using a session ticket)
+		 *   is successful, we send back the client's "session ID"
+		 *   (which doesn't actually identify the session).
+		 * - If it is a new session, we send back the new
+		 *   session ID.
+		 * - However, if we want the new session to be single-use,
+		 *   we send back a 0-length session ID.
+		 * s->hit is non-zero in either case of session reuse,
+		 * so the following won't overwrite an ID that we're supposed
+		 * to send back.
 		 */
 		if (!(s->ctx->session_cache_mode & SSL_SESS_CACHE_SERVER)
 			&& !s->hit)
@@ -1439,6 +1559,7 @@ int ssl3_send_server_key_exchange(SSL *s)
 	BN_CTX *bn_ctx = NULL; 
 #endif
 	EVP_PKEY *pkey;
+	const EVP_MD *md = NULL;
 	unsigned char *p,*d;
 	int al,i;
 	unsigned long type;
@@ -1679,21 +1800,44 @@ int ssl3_send_server_key_exchange(SSL *s)
 				}
 			else
 #endif /* !OPENSSL_NO_PSK */
+#ifndef OPENSSL_NO_SRP
+		if (type & SSL_kSRP)
+			{
+			if ((s->srp_ctx.N == NULL) ||
+				(s->srp_ctx.g == NULL) ||
+				(s->srp_ctx.s == NULL) ||
+				(s->srp_ctx.B == NULL))
+				{
+				SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,SSL_R_MISSING_SRP_PARAM);
+				goto err;
+				}
+			r[0]=s->srp_ctx.N;
+			r[1]=s->srp_ctx.g;
+			r[2]=s->srp_ctx.s;
+			r[3]=s->srp_ctx.B;
+			}
+		else 
+#endif
 			{
 			al=SSL_AD_HANDSHAKE_FAILURE;
 			SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,SSL_R_UNKNOWN_KEY_EXCHANGE_TYPE);
 			goto f_err;
 			}
-		for (i=0; r[i] != NULL; i++)
+		for (i=0; r[i] != NULL && i<4; i++)
 			{
 			nr[i]=BN_num_bytes(r[i]);
+#ifndef OPENSSL_NO_SRP
+			if ((i == 2) && (type & SSL_kSRP))
+				n+=1+nr[i];
+			else
+#endif
 			n+=2+nr[i];
 			}
 
 		if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL)
 			&& !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK))
 			{
-			if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher))
+			if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher,&md))
 				== NULL)
 				{
 				al=SSL_AD_DECODE_ERROR;
@@ -1715,8 +1859,16 @@ int ssl3_send_server_key_exchange(SSL *s)
 		d=(unsigned char *)s->init_buf->data;
 		p= &(d[4]);
 
-		for (i=0; r[i] != NULL; i++)
+		for (i=0; r[i] != NULL && i<4; i++)
 			{
+#ifndef OPENSSL_NO_SRP
+			if ((i == 2) && (type & SSL_kSRP))
+				{
+				*p = nr[i];
+				p++;
+				}
+			else
+#endif
 			s2n(nr[i],p);
 			BN_bn2bin(r[i],p);
 			p+=nr[i];
@@ -1764,12 +1916,15 @@ int ssl3_send_server_key_exchange(SSL *s)
 			/* n is the length of the params, they start at &(d[4])
 			 * and p points to the space at the end. */
 #ifndef OPENSSL_NO_RSA
-			if (pkey->type == EVP_PKEY_RSA)
+			if (pkey->type == EVP_PKEY_RSA
+					&& TLS1_get_version(s) < TLS1_2_VERSION)
 				{
 				q=md_buf;
 				j=0;
 				for (num=2; num > 0; num--)
 					{
+					EVP_MD_CTX_set_flags(&md_ctx,
+						EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
 					EVP_DigestInit_ex(&md_ctx,(num == 2)
 						?s->ctx->md5:s->ctx->sha1, NULL);
 					EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
@@ -1791,44 +1946,41 @@ int ssl3_send_server_key_exchange(SSL *s)
 				}
 			else
 #endif
-#if !defined(OPENSSL_NO_DSA)
-				if (pkey->type == EVP_PKEY_DSA)
+			if (md)
 				{
-				/* lets do DSS */
-				EVP_SignInit_ex(&md_ctx,EVP_dss1(), NULL);
-				EVP_SignUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
-				EVP_SignUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE);
-				EVP_SignUpdate(&md_ctx,&(d[4]),n);
-				if (!EVP_SignFinal(&md_ctx,&(p[2]),
-					(unsigned int *)&i,pkey))
+				/* For TLS1.2 and later send signature
+				 * algorithm */
+				if (TLS1_get_version(s) >= TLS1_2_VERSION)
 					{
-					SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_LIB_DSA);
-					goto err;
+					if (!tls12_get_sigandhash(p, pkey, md))
+						{
+						/* Should never happen */
+						al=SSL_AD_INTERNAL_ERROR;
+						SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
+						goto f_err;
+						}
+					p+=2;
 					}
-				s2n(i,p);
-				n+=i+2;
-				}
-			else
+#ifdef SSL_DEBUG
+				fprintf(stderr, "Using hash %s\n",
+							EVP_MD_name(md));
 #endif
-#if !defined(OPENSSL_NO_ECDSA)
-				if (pkey->type == EVP_PKEY_EC)
-				{
-				/* let's do ECDSA */
-				EVP_SignInit_ex(&md_ctx,EVP_ecdsa(), NULL);
+				EVP_SignInit_ex(&md_ctx, md, NULL);
 				EVP_SignUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE);
 				EVP_SignUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE);
 				EVP_SignUpdate(&md_ctx,&(d[4]),n);
 				if (!EVP_SignFinal(&md_ctx,&(p[2]),
 					(unsigned int *)&i,pkey))
 					{
-					SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_LIB_ECDSA);
+					SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_LIB_EVP);
 					goto err;
 					}
 				s2n(i,p);
 				n+=i+2;
+				if (TLS1_get_version(s) >= TLS1_2_VERSION)
+					n+= 2;
 				}
 			else
-#endif
 				{
 				/* Is this error check actually needed? */
 				al=SSL_AD_HANDSHAKE_FAILURE;
@@ -1881,6 +2033,14 @@ int ssl3_send_certificate_request(SSL *s)
 		p+=n;
 		n++;
 
+		if (TLS1_get_version(s) >= TLS1_2_VERSION)
+			{
+			nl = tls12_get_req_sig_algs(s, p + 2);
+			s2n(nl, p);
+			p += nl + 2;
+			n += nl + 2;
+			}
+
 		off=n;
 		p+=2;
 		n+=2;
@@ -2600,6 +2760,44 @@ int ssl3_get_client_key_exchange(SSL *s)
 			}
 		else
 #endif
+#ifndef OPENSSL_NO_SRP
+		if (alg_k & SSL_kSRP)
+			{
+			int param_len;
+
+			n2s(p,i);
+			param_len=i+2;
+			if (param_len > n)
+				{
+				al=SSL_AD_DECODE_ERROR;
+				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,SSL_R_BAD_SRP_A_LENGTH);
+				goto f_err;
+				}
+			if (!(s->srp_ctx.A=BN_bin2bn(p,i,NULL)))
+				{
+				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,ERR_R_BN_LIB);
+				goto err;
+				}
+			if (s->session->srp_username != NULL)
+				OPENSSL_free(s->session->srp_username);
+			s->session->srp_username = BUF_strdup(s->srp_ctx.login);
+			if (s->session->srp_username == NULL)
+				{
+				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,
+					ERR_R_MALLOC_FAILURE);
+				goto err;
+				}
+
+			if ((s->session->master_key_length = SRP_generate_server_master_secret(s,s->session->master_key))<0)
+				{
+				SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR);
+				goto err;
+				}
+
+			p+=i;
+			}
+		else
+#endif	/* OPENSSL_NO_SRP */
 		if (alg_k & SSL_kGOST) 
 			{
 			int ret = 0;
@@ -2683,7 +2881,7 @@ int ssl3_get_client_key_exchange(SSL *s)
 	return(1);
 f_err:
 	ssl3_send_alert(s,SSL3_AL_FATAL,al);
-#if !defined(OPENSSL_NO_DH) || !defined(OPENSSL_NO_RSA) || !defined(OPENSSL_NO_ECDH)
+#if !defined(OPENSSL_NO_DH) || !defined(OPENSSL_NO_RSA) || !defined(OPENSSL_NO_ECDH) || defined(OPENSSL_NO_SRP)
 err:
 #endif
 #ifndef OPENSSL_NO_ECDH
@@ -2704,12 +2902,15 @@ int ssl3_get_cert_verify(SSL *s)
 	long n;
 	int type=0,i,j;
 	X509 *peer;
+	const EVP_MD *md = NULL;
+	EVP_MD_CTX mctx;
+	EVP_MD_CTX_init(&mctx);
 
 	n=s->method->ssl_get_message(s,
 		SSL3_ST_SR_CERT_VRFY_A,
 		SSL3_ST_SR_CERT_VRFY_B,
 		-1,
-		514, /* 514? */
+		516, /* Enough for 4096 bit RSA key with TLS v1.2 */
 		&ok);
 
 	if (!ok) return((int)n);
@@ -2729,7 +2930,7 @@ int ssl3_get_cert_verify(SSL *s)
 	if (s->s3->tmp.message_type != SSL3_MT_CERTIFICATE_VERIFY)
 		{
 		s->s3->tmp.reuse_message=1;
-		if ((peer != NULL) && (type | EVP_PKT_SIGN))
+		if ((peer != NULL) && (type & EVP_PKT_SIGN))
 			{
 			al=SSL_AD_UNEXPECTED_MESSAGE;
 			SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,SSL_R_MISSING_VERIFY_MESSAGE);
@@ -2772,6 +2973,36 @@ int ssl3_get_cert_verify(SSL *s)
 		} 
 	else 
 		{	
+		if (TLS1_get_version(s) >= TLS1_2_VERSION)
+			{
+			int sigalg = tls12_get_sigid(pkey);
+			/* Should never happen */
+			if (sigalg == -1)
+				{
+				SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,ERR_R_INTERNAL_ERROR);
+				al=SSL_AD_INTERNAL_ERROR;
+				goto f_err;
+				}
+			/* Check key type is consistent with signature */
+			if (sigalg != (int)p[1])
+				{
+				SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,SSL_R_WRONG_SIGNATURE_TYPE);
+				al=SSL_AD_DECODE_ERROR;
+				goto f_err;
+				}
+			md = tls12_get_hash(p[0]);
+			if (md == NULL)
+				{
+				SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,SSL_R_UNKNOWN_DIGEST);
+				al=SSL_AD_DECODE_ERROR;
+				goto f_err;
+				}
+#ifdef SSL_DEBUG
+fprintf(stderr, "USING TLSv1.2 HASH %s\n", EVP_MD_name(md));
+#endif
+			p += 2;
+			n -= 2;
+			}
 		n2s(p,i);
 		n-=2;
 		if (i > n)
@@ -2789,6 +3020,37 @@ int ssl3_get_cert_verify(SSL *s)
 		goto f_err;
 		}
 
+	if (TLS1_get_version(s) >= TLS1_2_VERSION)
+		{
+		long hdatalen = 0;
+		void *hdata;
+		hdatalen = BIO_get_mem_data(s->s3->handshake_buffer, &hdata);
+		if (hdatalen <= 0)
+			{
+			SSLerr(SSL_F_SSL3_GET_CERT_VERIFY, ERR_R_INTERNAL_ERROR);
+			al=SSL_AD_INTERNAL_ERROR;
+			goto f_err;
+			}
+#ifdef SSL_DEBUG
+		fprintf(stderr, "Using TLS 1.2 with client verify alg %s\n",
+							EVP_MD_name(md));
+#endif
+		if (!EVP_VerifyInit_ex(&mctx, md, NULL)
+			|| !EVP_VerifyUpdate(&mctx, hdata, hdatalen))
+			{
+			SSLerr(SSL_F_SSL3_GET_CERT_VERIFY, ERR_R_EVP_LIB);
+			al=SSL_AD_INTERNAL_ERROR;
+			goto f_err;
+			}
+
+		if (EVP_VerifyFinal(&mctx, p , i, pkey) <= 0)
+			{
+			al=SSL_AD_DECRYPT_ERROR;
+			SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,SSL_R_BAD_SIGNATURE);
+			goto f_err;
+			}
+		}
+	else
 #ifndef OPENSSL_NO_RSA 
 	if (pkey->type == EVP_PKEY_RSA)
 		{
@@ -2879,6 +3141,13 @@ f_err:
 		ssl3_send_alert(s,SSL3_AL_FATAL,al);
 		}
 end:
+	if (s->s3->handshake_buffer)
+		{
+		BIO_free(s->s3->handshake_buffer);
+		s->s3->handshake_buffer = NULL;
+		s->s3->flags &= ~TLS1_FLAGS_KEEP_HANDSHAKE;
+		}
+	EVP_MD_CTX_cleanup(&mctx);
 	EVP_PKEY_free(pkey);
 	return(ret);
 	}
@@ -2991,6 +3260,12 @@ int ssl3_get_client_certificate(SSL *s)
 			al=SSL_AD_HANDSHAKE_FAILURE;
 			goto f_err;
 			}
+		/* No client certificate so digest cached records */
+		if (s->s3->handshake_buffer && !ssl3_digest_cached_records(s))
+			{
+			al=SSL_AD_INTERNAL_ERROR;
+			goto f_err;
+			}
 		}
 	else
 		{
@@ -3067,13 +3342,17 @@ int ssl3_send_server_certificate(SSL *s)
 	/* SSL3_ST_SW_CERT_B */
 	return(ssl3_do_write(s,SSL3_RT_HANDSHAKE));
 	}
+
 #ifndef OPENSSL_NO_TLSEXT
+/* send a new session ticket (not necessarily for a new session) */
 int ssl3_send_newsession_ticket(SSL *s)
 	{
 	if (s->state == SSL3_ST_SW_SESSION_TICKET_A)
 		{
 		unsigned char *p, *senc, *macstart;
-		int len, slen;
+		const unsigned char *const_p;
+		int len, slen_full, slen;
+		SSL_SESSION *sess;
 		unsigned int hlen;
 		EVP_CIPHER_CTX ctx;
 		HMAC_CTX hctx;
@@ -3082,12 +3361,38 @@ int ssl3_send_newsession_ticket(SSL *s)
 		unsigned char key_name[16];
 
 		/* get session encoding length */
-		slen = i2d_SSL_SESSION(s->session, NULL);
+		slen_full = i2d_SSL_SESSION(s->session, NULL);
 		/* Some length values are 16 bits, so forget it if session is
  		 * too long
  		 */
-		if (slen > 0xFF00)
+		if (slen_full > 0xFF00)
+			return -1;
+		senc = OPENSSL_malloc(slen_full);
+		if (!senc)
+			return -1;
+		p = senc;
+		i2d_SSL_SESSION(s->session, &p);
+
+		/* create a fresh copy (not shared with other threads) to clean up */
+		const_p = senc;
+		sess = d2i_SSL_SESSION(NULL, &const_p, slen_full);
+		if (sess == NULL)
+			{
+			OPENSSL_free(senc);
 			return -1;
+			}
+		sess->session_id_length = 0; /* ID is irrelevant for the ticket */
+
+		slen = i2d_SSL_SESSION(sess, NULL);
+		if (slen > slen_full) /* shouldn't ever happen */
+			{
+			OPENSSL_free(senc);
+			return -1;
+			}
+		p = senc;
+		i2d_SSL_SESSION(sess, &p);
+		SSL_SESSION_free(sess);
+
 		/* Grow buffer if need be: the length calculation is as
  		 * follows 1 (size of message name) + 3 (message length
  		 * bytes) + 4 (ticket lifetime hint) + 2 (ticket length) +
@@ -3099,11 +3404,6 @@ int ssl3_send_newsession_ticket(SSL *s)
 			26 + EVP_MAX_IV_LENGTH + EVP_MAX_BLOCK_LENGTH +
 			EVP_MAX_MD_SIZE + slen))
 			return -1;
-		senc = OPENSSL_malloc(slen);
-		if (!senc)
-			return -1;
-		p = senc;
-		i2d_SSL_SESSION(s->session, &p);
 
 		p=(unsigned char *)s->init_buf->data;
 		/* do the header */
@@ -3134,7 +3434,13 @@ int ssl3_send_newsession_ticket(SSL *s)
 					tlsext_tick_md(), NULL);
 			memcpy(key_name, tctx->tlsext_tick_key_name, 16);
 			}
-		l2n(s->session->tlsext_tick_lifetime_hint, p);
+
+		/* Ticket lifetime hint (advisory only):
+		 * We leave this unspecified for resumed session (for simplicity),
+		 * and guess that tickets for new sessions will live as long
+		 * as their sessions. */
+		l2n(s->hit ? 0 : s->session->timeout, p);
+
 		/* Skip ticket length for now */
 		p += 2;
 		/* Output key name */
@@ -3209,4 +3515,72 @@ int ssl3_send_cert_status(SSL *s)
 	/* SSL3_ST_SW_CERT_STATUS_B */
 	return(ssl3_do_write(s,SSL3_RT_HANDSHAKE));
 	}
+
+# ifndef OPENSSL_NO_NEXTPROTONEG
+/* ssl3_get_next_proto reads a Next Protocol Negotiation handshake message. It
+ * sets the next_proto member in s if found */
+int ssl3_get_next_proto(SSL *s)
+	{
+	int ok;
+	int proto_len, padding_len;
+	long n;
+	const unsigned char *p;
+
+	/* Clients cannot send a NextProtocol message if we didn't see the
+	 * extension in their ClientHello */
+	if (!s->s3->next_proto_neg_seen)
+		{
+		SSLerr(SSL_F_SSL3_GET_NEXT_PROTO,SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION);
+		return -1;
+		}
+
+	n=s->method->ssl_get_message(s,
+		SSL3_ST_SR_NEXT_PROTO_A,
+		SSL3_ST_SR_NEXT_PROTO_B,
+		SSL3_MT_NEXT_PROTO,
+		514,  /* See the payload format below */
+		&ok);
+
+	if (!ok)
+		return((int)n);
+
+	/* s->state doesn't reflect whether ChangeCipherSpec has been received
+	 * in this handshake, but s->s3->change_cipher_spec does (will be reset
+	 * by ssl3_get_finished). */
+	if (!s->s3->change_cipher_spec)
+		{
+		SSLerr(SSL_F_SSL3_GET_NEXT_PROTO,SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS);
+		return -1;
+		}
+
+	if (n < 2)
+		return 0;  /* The body must be > 1 bytes long */
+
+	p=(unsigned char *)s->init_msg;
+
+	/* The payload looks like:
+	 *   uint8 proto_len;
+	 *   uint8 proto[proto_len];
+	 *   uint8 padding_len;
+	 *   uint8 padding[padding_len];
+	 */
+	proto_len = p[0];
+	if (proto_len + 2 > s->init_num)
+		return 0;
+	padding_len = p[proto_len + 1];
+	if (proto_len + padding_len + 2 != s->init_num)
+		return 0;
+
+	s->next_proto_negotiated = OPENSSL_malloc(proto_len);
+	if (!s->next_proto_negotiated)
+		{
+		SSLerr(SSL_F_SSL3_GET_NEXT_PROTO,ERR_R_MALLOC_FAILURE);
+		return 0;
+		}
+	memcpy(s->next_proto_negotiated, p + 1, proto_len);
+	s->next_proto_negotiated_len = proto_len;
+
+	return 1;
+	}
+# endif
 #endif
diff --git a/src/lib/libssl/srtp.h b/src/lib/libssl/srtp.h
new file mode 100644
index 0000000000..c0cf33ef28
--- /dev/null
+++ b/src/lib/libssl/srtp.h
@@ -0,0 +1,145 @@
+/* ssl/tls1.h */
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ * 
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ * 
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from 
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ * 
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ * 
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/*
+  DTLS code by Eric Rescorla <ekr@rtfm.com>
+
+  Copyright (C) 2006, Network Resonance, Inc.
+  Copyright (C) 2011, RTFM, Inc.
+*/
+
+#ifndef HEADER_D1_SRTP_H
+#define HEADER_D1_SRTP_H
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+     
+#define SRTP_AES128_CM_SHA1_80 0x0001
+#define SRTP_AES128_CM_SHA1_32 0x0002
+#define SRTP_AES128_F8_SHA1_80 0x0003
+#define SRTP_AES128_F8_SHA1_32 0x0004
+#define SRTP_NULL_SHA1_80      0x0005
+#define SRTP_NULL_SHA1_32      0x0006
+
+int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx, const char *profiles);
+int SSL_set_tlsext_use_srtp(SSL *ctx, const char *profiles);
+SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s);
+
+STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *ssl);
+SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/src/lib/libssl/ssl.h b/src/lib/libssl/ssl.h
index 8f922eea72..8b0c2a2dac 100644
--- a/src/lib/libssl/ssl.h
+++ b/src/lib/libssl/ssl.h
@@ -252,6 +252,7 @@ extern "C" {
 #define SSL_TXT_kEECDH		"kEECDH"
 #define SSL_TXT_kPSK            "kPSK"
 #define SSL_TXT_kGOST		"kGOST"
+#define SSL_TXT_kSRP		"kSRP"
 
 #define	SSL_TXT_aRSA		"aRSA"
 #define	SSL_TXT_aDSS		"aDSS"
@@ -275,6 +276,7 @@ extern "C" {
 #define SSL_TXT_ECDSA		"ECDSA"
 #define SSL_TXT_KRB5      	"KRB5"
 #define SSL_TXT_PSK             "PSK"
+#define SSL_TXT_SRP		"SRP"
 
 #define SSL_TXT_DES		"DES"
 #define SSL_TXT_3DES		"3DES"
@@ -285,6 +287,7 @@ extern "C" {
 #define SSL_TXT_AES128		"AES128"
 #define SSL_TXT_AES256		"AES256"
 #define SSL_TXT_AES		"AES"
+#define SSL_TXT_AES_GCM		"AESGCM"
 #define SSL_TXT_CAMELLIA128	"CAMELLIA128"
 #define SSL_TXT_CAMELLIA256	"CAMELLIA256"
 #define SSL_TXT_CAMELLIA	"CAMELLIA"
@@ -294,10 +297,14 @@ extern "C" {
 #define SSL_TXT_SHA		"SHA" /* same as "SHA1" */
 #define SSL_TXT_GOST94		"GOST94" 
 #define SSL_TXT_GOST89MAC		"GOST89MAC" 
+#define SSL_TXT_SHA256		"SHA256"
+#define SSL_TXT_SHA384		"SHA384"
 
 #define SSL_TXT_SSLV2		"SSLv2"
 #define SSL_TXT_SSLV3		"SSLv3"
 #define SSL_TXT_TLSV1		"TLSv1"
+#define SSL_TXT_TLSV1_1		"TLSv1.1"
+#define SSL_TXT_TLSV1_2		"TLSv1.2"
 
 #define SSL_TXT_EXP		"EXP"
 #define SSL_TXT_EXPORT		"EXPORT"
@@ -356,9 +363,29 @@ extern "C" {
  * in SSL_CTX. */
 typedef struct ssl_st *ssl_crock_st;
 typedef struct tls_session_ticket_ext_st TLS_SESSION_TICKET_EXT;
+typedef struct ssl_method_st SSL_METHOD;
+typedef struct ssl_cipher_st SSL_CIPHER;
+typedef struct ssl_session_st SSL_SESSION;
+
+DECLARE_STACK_OF(SSL_CIPHER)
+
+/* SRTP protection profiles for use with the use_srtp extension (RFC 5764)*/
+typedef struct srtp_protection_profile_st
+       {
+       const char *name;
+       unsigned long id;
+       } SRTP_PROTECTION_PROFILE;
+
+DECLARE_STACK_OF(SRTP_PROTECTION_PROFILE)
+
+typedef int (*tls_session_ticket_ext_cb_fn)(SSL *s, const unsigned char *data, int len, void *arg);
+typedef int (*tls_session_secret_cb_fn)(SSL *s, void *secret, int *secret_len, STACK_OF(SSL_CIPHER) *peer_ciphers, SSL_CIPHER **cipher, void *arg);
+
+
+#ifndef OPENSSL_NO_SSL_INTERN
 
 /* used to hold info on the particular ciphers used */
-typedef struct ssl_cipher_st
+struct ssl_cipher_st
 	{
 	int valid;
 	const char *name;		/* text name */
@@ -375,15 +402,11 @@ typedef struct ssl_cipher_st
 	unsigned long algorithm2;	/* Extra flags */
 	int strength_bits;		/* Number of bits really used */
 	int alg_bits;			/* Number of bits for algorithm */
-	} SSL_CIPHER;
-
-DECLARE_STACK_OF(SSL_CIPHER)
+	};
 
-typedef int (*tls_session_ticket_ext_cb_fn)(SSL *s, const unsigned char *data, int len, void *arg);
-typedef int (*tls_session_secret_cb_fn)(SSL *s, void *secret, int *secret_len, STACK_OF(SSL_CIPHER) *peer_ciphers, SSL_CIPHER **cipher, void *arg);
 
 /* Used to hold functions for SSLv2 or SSLv3/TLSv1 functions */
-typedef struct ssl_method_st
+struct ssl_method_st
 	{
 	int version;
 	int (*ssl_new)(SSL *s);
@@ -416,7 +439,7 @@ typedef struct ssl_method_st
 	int (*ssl_version)(void);
 	long (*ssl_callback_ctrl)(SSL *s, int cb_id, void (*fp)(void));
 	long (*ssl_ctx_callback_ctrl)(SSL_CTX *s, int cb_id, void (*fp)(void));
-	} SSL_METHOD;
+	};
 
 /* Lets make this into an ASN.1 type structure as follows
  * SSL_SESSION_ID ::= SEQUENCE {
@@ -433,14 +456,17 @@ typedef struct ssl_method_st
  *	Session_ID_context [ 4 ] EXPLICIT OCTET STRING,   -- the Session ID context
  *	Verify_result [ 5 ] EXPLICIT INTEGER,   -- X509_V_... code for `Peer'
  *	HostName [ 6 ] EXPLICIT OCTET STRING,   -- optional HostName from servername TLS extension 
- *	ECPointFormatList [ 7 ] OCTET STRING,     -- optional EC point format list from TLS extension
- *	PSK_identity_hint [ 8 ] EXPLICIT OCTET STRING, -- optional PSK identity hint
- *	PSK_identity [ 9 ] EXPLICIT OCTET STRING -- optional PSK identity
+ *	PSK_identity_hint [ 7 ] EXPLICIT OCTET STRING, -- optional PSK identity hint
+ *	PSK_identity [ 8 ] EXPLICIT OCTET STRING,  -- optional PSK identity
+ *	Ticket_lifetime_hint [9] EXPLICIT INTEGER, -- server's lifetime hint for session ticket
+ *	Ticket [10]             EXPLICIT OCTET STRING, -- session ticket (clients only)
+ *	Compression_meth [11]   EXPLICIT OCTET STRING, -- optional compression method
+ *	SRP_username [ 12 ] EXPLICIT OCTET STRING -- optional SRP username
  *	}
  * Look in ssl/ssl_asn1.c for more details
  * I'm using EXPLICIT tags so I can read the damn things using asn1parse :-).
  */
-typedef struct ssl_session_st
+struct ssl_session_st
 	{
 	int ssl_version;	/* what ssl version session info is
 				 * being kept in here? */
@@ -512,8 +538,12 @@ typedef struct ssl_session_st
 	size_t	tlsext_ticklen;		/* Session ticket length */	
 	long tlsext_tick_lifetime_hint;	/* Session lifetime hint in seconds */
 #endif
-	} SSL_SESSION;
+#ifndef OPENSSL_NO_SRP
+	char *srp_username;
+#endif
+	};
 
+#endif
 
 #define SSL_OP_MICROSOFT_SESS_ID_BUG			0x00000001L
 #define SSL_OP_NETSCAPE_CHALLENGE_BUG			0x00000002L
@@ -536,7 +566,7 @@ typedef struct ssl_session_st
 
 /* SSL_OP_ALL: various bug workarounds that should be rather harmless.
  *             This used to be 0x000FFFFFL before 0.9.7. */
-#define SSL_OP_ALL					0x80000FFFL
+#define SSL_OP_ALL					0x80000BFFL
 
 /* DTLS options */
 #define SSL_OP_NO_QUERY_MTU                 0x00001000L
@@ -572,11 +602,17 @@ typedef struct ssl_session_st
 #define SSL_OP_NO_SSLv2					0x01000000L
 #define SSL_OP_NO_SSLv3					0x02000000L
 #define SSL_OP_NO_TLSv1					0x04000000L
+#define SSL_OP_NO_TLSv1_2				0x08000000L
+#define SSL_OP_NO_TLSv1_1				0x10000000L
 
+/* These next two were never actually used for anything since SSLeay
+ * zap so we have some more flags.
+ */
 /* The next flag deliberately changes the ciphertest, this is a check
  * for the PKCS#1 attack */
-#define SSL_OP_PKCS1_CHECK_1				0x08000000L
-#define SSL_OP_PKCS1_CHECK_2				0x10000000L
+#define SSL_OP_PKCS1_CHECK_1				0x0
+#define SSL_OP_PKCS1_CHECK_2				0x0
+
 #define SSL_OP_NETSCAPE_CA_DN_BUG			0x20000000L
 #define SSL_OP_NETSCAPE_DEMO_CIPHER_CHANGE_BUG		0x40000000L
 /* Make server add server-hello extension from early version of
@@ -637,12 +673,53 @@ typedef struct ssl_session_st
 #define SSL_get_secure_renegotiation_support(ssl) \
 	SSL_ctrl((ssl), SSL_CTRL_GET_RI_SUPPORT, 0, NULL)
 
+#ifndef OPENSSL_NO_HEARTBEATS
+#define SSL_heartbeat(ssl) \
+        SSL_ctrl((ssl),SSL_CTRL_TLS_EXT_SEND_HEARTBEAT,0,NULL)
+#endif
+
 void SSL_CTX_set_msg_callback(SSL_CTX *ctx, void (*cb)(int write_p, int version, int content_type, const void *buf, size_t len, SSL *ssl, void *arg));
 void SSL_set_msg_callback(SSL *ssl, void (*cb)(int write_p, int version, int content_type, const void *buf, size_t len, SSL *ssl, void *arg));
 #define SSL_CTX_set_msg_callback_arg(ctx, arg) SSL_CTX_ctrl((ctx), SSL_CTRL_SET_MSG_CALLBACK_ARG, 0, (arg))
 #define SSL_set_msg_callback_arg(ssl, arg) SSL_ctrl((ssl), SSL_CTRL_SET_MSG_CALLBACK_ARG, 0, (arg))
 
+#ifndef OPENSSL_NO_SRP
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
+typedef struct srp_ctx_st
+	{
+	/* param for all the callbacks */
+	void *SRP_cb_arg;
+	/* set client Hello login callback */
+	int (*TLS_ext_srp_username_callback)(SSL *, int *, void *);
+	/* set SRP N/g param callback for verification */
+	int (*SRP_verify_param_callback)(SSL *, void *);
+	/* set SRP client passwd callback */
+	char *(*SRP_give_srp_client_pwd_callback)(SSL *, void *);
+
+	char *login;
+	BIGNUM *N,*g,*s,*B,*A;
+	BIGNUM *a,*b,*v;
+	char *info;
+	int strength;
+
+	unsigned long srp_Mask;
+	} SRP_CTX;
+
+#endif
+
+/* see tls_srp.c */
+int SSL_SRP_CTX_init(SSL *s);
+int SSL_CTX_SRP_CTX_init(SSL_CTX *ctx);
+int SSL_SRP_CTX_free(SSL *ctx);
+int SSL_CTX_SRP_CTX_free(SSL_CTX *ctx);
+int SSL_srp_server_param_with_username(SSL *s, int *ad);
+int SRP_generate_server_master_secret(SSL *s,unsigned char *master_key);
+int SRP_Calc_A_param(SSL *s);
+int SRP_generate_client_master_secret(SSL *s,unsigned char *master_key);
+
+#endif
 
 #if defined(OPENSSL_SYS_MSDOS) && !defined(OPENSSL_SYS_WIN32)
 #define SSL_MAX_CERT_LIST_DEFAULT 1024*30 /* 30k max cert list :-) */
@@ -668,7 +745,11 @@ void SSL_set_msg_callback(SSL *ssl, void (*cb)(int write_p, int version, int con
 typedef int (*GEN_SESSION_CB)(const SSL *ssl, unsigned char *id,
 				unsigned int *id_len);
 
-typedef struct ssl_comp_st
+typedef struct ssl_comp_st SSL_COMP;
+
+#ifndef OPENSSL_NO_SSL_INTERN
+
+struct ssl_comp_st
 	{
 	int id;
 	const char *name;
@@ -677,7 +758,7 @@ typedef struct ssl_comp_st
 #else
 	char *method;
 #endif
-	} SSL_COMP;
+	};
 
 DECLARE_STACK_OF(SSL_COMP)
 DECLARE_LHASH_OF(SSL_SESSION);
@@ -846,7 +927,6 @@ struct ssl_ctx_st
 	/* Callback for status request */
 	int (*tlsext_status_cb)(SSL *ssl, void *arg);
 	void *tlsext_status_arg;
-
 	/* draft-rescorla-tls-opaque-prf-input-00.txt information */
 	int (*tlsext_opaque_prf_input_callback)(SSL *, void *peerinput, size_t len, void *arg);
 	void *tlsext_opaque_prf_input_callback_arg;
@@ -866,9 +946,37 @@ struct ssl_ctx_st
 	unsigned int freelist_max_len;
 	struct ssl3_buf_freelist_st *wbuf_freelist;
 	struct ssl3_buf_freelist_st *rbuf_freelist;
+#endif
+#ifndef OPENSSL_NO_SRP
+	SRP_CTX srp_ctx; /* ctx for SRP authentication */
+#endif
+
+#ifndef OPENSSL_NO_TLSEXT
+# ifndef OPENSSL_NO_NEXTPROTONEG
+	/* Next protocol negotiation information */
+	/* (for experimental NPN extension). */
+
+	/* For a server, this contains a callback function by which the set of
+	 * advertised protocols can be provided. */
+	int (*next_protos_advertised_cb)(SSL *s, const unsigned char **buf,
+			                 unsigned int *len, void *arg);
+	void *next_protos_advertised_cb_arg;
+	/* For a client, this contains a callback function that selects the
+	 * next protocol from the list provided by the server. */
+	int (*next_proto_select_cb)(SSL *s, unsigned char **out,
+				    unsigned char *outlen,
+				    const unsigned char *in,
+				    unsigned int inlen,
+				    void *arg);
+	void *next_proto_select_cb_arg;
+# endif
+        /* SRTP profiles we are willing to do from RFC 5764 */
+        STACK_OF(SRTP_PROTECTION_PROFILE) *srtp_profiles;  
 #endif
 	};
 
+#endif
+
 #define SSL_SESS_CACHE_OFF			0x0000
 #define SSL_SESS_CACHE_CLIENT			0x0001
 #define SSL_SESS_CACHE_SERVER			0x0002
@@ -921,6 +1029,32 @@ int SSL_CTX_set_client_cert_engine(SSL_CTX *ctx, ENGINE *e);
 #endif
 void SSL_CTX_set_cookie_generate_cb(SSL_CTX *ctx, int (*app_gen_cookie_cb)(SSL *ssl, unsigned char *cookie, unsigned int *cookie_len));
 void SSL_CTX_set_cookie_verify_cb(SSL_CTX *ctx, int (*app_verify_cookie_cb)(SSL *ssl, unsigned char *cookie, unsigned int cookie_len));
+#ifndef OPENSSL_NO_NEXTPROTONEG
+void SSL_CTX_set_next_protos_advertised_cb(SSL_CTX *s,
+					   int (*cb) (SSL *ssl,
+						      const unsigned char **out,
+						      unsigned int *outlen,
+						      void *arg),
+					   void *arg);
+void SSL_CTX_set_next_proto_select_cb(SSL_CTX *s,
+				      int (*cb) (SSL *ssl,
+						 unsigned char **out,
+						 unsigned char *outlen,
+						 const unsigned char *in,
+						 unsigned int inlen,
+						 void *arg),
+				      void *arg);
+
+int SSL_select_next_proto(unsigned char **out, unsigned char *outlen,
+			  const unsigned char *in, unsigned int inlen,
+			  const unsigned char *client, unsigned int client_len);
+void SSL_get0_next_proto_negotiated(const SSL *s,
+				    const unsigned char **data, unsigned *len);
+
+#define OPENSSL_NPN_UNSUPPORTED	0
+#define OPENSSL_NPN_NEGOTIATED	1
+#define OPENSSL_NPN_NO_OVERLAP	2
+#endif
 
 #ifndef OPENSSL_NO_PSK
 /* the maximum length of the buffer given to callbacks containing the
@@ -961,6 +1095,8 @@ const char *SSL_get_psk_identity(const SSL *s);
 #define SSL_MAC_FLAG_READ_MAC_STREAM 1
 #define SSL_MAC_FLAG_WRITE_MAC_STREAM 2
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
 struct ssl_st
 	{
 	/* protocol version
@@ -1005,9 +1141,7 @@ struct ssl_st
 
 	int server;	/* are we the server side? - mostly used by SSL_clear*/
 
-	int new_session;/* 1 if we are to use a new session.
-	                 * 2 if we are a server and are inside a handshake
-	                 *   (i.e. not just sending a HelloRequest)
+	int new_session;/* Generate a new session or reuse an old one.
 	                 * NB: For servers, the 'new' session may actually be a previously
 	                 * cached session or even the previous session unless
 	                 * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is set */
@@ -1177,12 +1311,46 @@ struct ssl_st
 	void *tls_session_secret_cb_arg;
 
 	SSL_CTX * initial_ctx; /* initial ctx, used to store sessions */
+
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	/* Next protocol negotiation. For the client, this is the protocol that
+	 * we sent in NextProtocol and is set when handling ServerHello
+	 * extensions.
+	 *
+	 * For a server, this is the client's selected_protocol from
+	 * NextProtocol and is set when handling the NextProtocol message,
+	 * before the Finished message. */
+	unsigned char *next_proto_negotiated;
+	unsigned char next_proto_negotiated_len;
+#endif
+
 #define session_ctx initial_ctx
+
+	STACK_OF(SRTP_PROTECTION_PROFILE) *srtp_profiles;  /* What we'll do */
+	SRTP_PROTECTION_PROFILE *srtp_profile;            /* What's been chosen */
+
+	unsigned int tlsext_heartbeat;  /* Is use of the Heartbeat extension negotiated?
+	                                   0: disabled
+	                                   1: enabled
+	                                   2: enabled, but not allowed to send Requests
+	                                 */
+	unsigned int tlsext_hb_pending; /* Indicates if a HeartbeatRequest is in flight */
+	unsigned int tlsext_hb_seq;     /* HeartbeatRequest sequence number */
 #else
 #define session_ctx ctx
 #endif /* OPENSSL_NO_TLSEXT */
+
+	int renegotiate;/* 1 if we are renegotiating.
+	                 * 2 if we are a server and are inside a handshake
+	                 * (i.e. not just sending a HelloRequest) */
+
+#ifndef OPENSSL_NO_SRP
+	SRP_CTX srp_ctx; /* ctx for SRP authentication */
+#endif
 	};
 
+#endif
+
 #ifdef __cplusplus
 }
 #endif
@@ -1192,6 +1360,7 @@ struct ssl_st
 #include <openssl/tls1.h> /* This is mostly sslv3 with a few tweaks */
 #include <openssl/dtls1.h> /* Datagram TLS */
 #include <openssl/ssl23.h>
+#include <openssl/srtp.h>  /* Support for the use_srtp extension */
 
 #ifdef  __cplusplus
 extern "C" {
@@ -1408,6 +1577,20 @@ DECLARE_PEM_rw(SSL_SESSION, SSL_SESSION)
 #define SSL_CTRL_SET_TLSEXT_STATUS_REQ_OCSP_RESP	71
 
 #define SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB	72
+
+#define SSL_CTRL_SET_TLS_EXT_SRP_USERNAME_CB	75
+#define SSL_CTRL_SET_SRP_VERIFY_PARAM_CB		76
+#define SSL_CTRL_SET_SRP_GIVE_CLIENT_PWD_CB		77
+
+#define SSL_CTRL_SET_SRP_ARG		78
+#define SSL_CTRL_SET_TLS_EXT_SRP_USERNAME		79
+#define SSL_CTRL_SET_TLS_EXT_SRP_STRENGTH		80
+#define SSL_CTRL_SET_TLS_EXT_SRP_PASSWORD		81
+#ifndef OPENSSL_NO_HEARTBEATS
+#define SSL_CTRL_TLS_EXT_SEND_HEARTBEAT				85
+#define SSL_CTRL_GET_TLS_EXT_HEARTBEAT_PENDING		86
+#define SSL_CTRL_SET_TLS_EXT_HEARTBEAT_NO_REQUESTS	87
+#endif
 #endif
 
 #define DTLS_CTRL_GET_TIMEOUT		73
@@ -1418,6 +1601,9 @@ DECLARE_PEM_rw(SSL_SESSION, SSL_SESSION)
 #define SSL_CTRL_CLEAR_OPTIONS			77
 #define SSL_CTRL_CLEAR_MODE			78
 
+#define SSL_CTRL_GET_EXTRA_CHAIN_CERTS		82
+#define SSL_CTRL_CLEAR_EXTRA_CHAIN_CERTS	83
+
 #define DTLSv1_get_timeout(ssl, arg) \
 	SSL_ctrl(ssl,DTLS_CTRL_GET_TIMEOUT,0, (void *)arg)
 #define DTLSv1_handle_timeout(ssl) \
@@ -1454,6 +1640,10 @@ DECLARE_PEM_rw(SSL_SESSION, SSL_SESSION)
 
 #define SSL_CTX_add_extra_chain_cert(ctx,x509) \
 	SSL_CTX_ctrl(ctx,SSL_CTRL_EXTRA_CHAIN_CERT,0,(char *)x509)
+#define SSL_CTX_get_extra_chain_certs(ctx,px509) \
+	SSL_CTX_ctrl(ctx,SSL_CTRL_GET_EXTRA_CHAIN_CERTS,0,px509)
+#define SSL_CTX_clear_extra_chain_certs(ctx) \
+	SSL_CTX_ctrl(ctx,SSL_CTRL_CLEAR_EXTRA_CHAIN_CERTS,0,NULL)
 
 #ifndef OPENSSL_NO_BIO
 BIO_METHOD *BIO_f_ssl(void);
@@ -1481,6 +1671,7 @@ const SSL_CIPHER *SSL_get_current_cipher(const SSL *s);
 int	SSL_CIPHER_get_bits(const SSL_CIPHER *c,int *alg_bits);
 char *	SSL_CIPHER_get_version(const SSL_CIPHER *c);
 const char *	SSL_CIPHER_get_name(const SSL_CIPHER *c);
+unsigned long 	SSL_CIPHER_get_id(const SSL_CIPHER *c);
 
 int	SSL_get_fd(const SSL *s);
 int	SSL_get_rfd(const SSL *s);
@@ -1546,10 +1737,14 @@ long	SSL_SESSION_set_time(SSL_SESSION *s, long t);
 long	SSL_SESSION_get_timeout(const SSL_SESSION *s);
 long	SSL_SESSION_set_timeout(SSL_SESSION *s, long t);
 void	SSL_copy_session_id(SSL *to,const SSL *from);
+X509 *SSL_SESSION_get0_peer(SSL_SESSION *s);
+int SSL_SESSION_set1_id_context(SSL_SESSION *s,const unsigned char *sid_ctx,
+			       unsigned int sid_ctx_len);
 
 SSL_SESSION *SSL_SESSION_new(void);
 const unsigned char *SSL_SESSION_get_id(const SSL_SESSION *s,
 					unsigned int *len);
+unsigned int SSL_SESSION_get_compress_id(const SSL_SESSION *s);
 #ifndef OPENSSL_NO_FP_API
 int	SSL_SESSION_print_fp(FILE *fp,const SSL_SESSION *ses);
 #endif
@@ -1612,6 +1807,30 @@ int SSL_set_trust(SSL *s, int trust);
 int SSL_CTX_set1_param(SSL_CTX *ctx, X509_VERIFY_PARAM *vpm);
 int SSL_set1_param(SSL *ssl, X509_VERIFY_PARAM *vpm);
 
+#ifndef OPENSSL_NO_SRP
+int SSL_CTX_set_srp_username(SSL_CTX *ctx,char *name);
+int SSL_CTX_set_srp_password(SSL_CTX *ctx,char *password);
+int SSL_CTX_set_srp_strength(SSL_CTX *ctx, int strength);
+int SSL_CTX_set_srp_client_pwd_callback(SSL_CTX *ctx,
+					char *(*cb)(SSL *,void *));
+int SSL_CTX_set_srp_verify_param_callback(SSL_CTX *ctx,
+					  int (*cb)(SSL *,void *));
+int SSL_CTX_set_srp_username_callback(SSL_CTX *ctx,
+				      int (*cb)(SSL *,int *,void *));
+int SSL_CTX_set_srp_cb_arg(SSL_CTX *ctx, void *arg);
+
+int SSL_set_srp_server_param(SSL *s, const BIGNUM *N, const BIGNUM *g,
+			     BIGNUM *sa, BIGNUM *v, char *info);
+int SSL_set_srp_server_param_pw(SSL *s, const char *user, const char *pass,
+				const char *grp);
+
+BIGNUM *SSL_get_srp_g(SSL *s);
+BIGNUM *SSL_get_srp_N(SSL *s);
+
+char *SSL_get_srp_username(SSL *s);
+char *SSL_get_srp_userinfo(SSL *s);
+#endif
+
 void	SSL_free(SSL *ssl);
 int 	SSL_accept(SSL *ssl);
 int 	SSL_connect(SSL *ssl);
@@ -1647,6 +1866,15 @@ const SSL_METHOD *TLSv1_method(void);		/* TLSv1.0 */
 const SSL_METHOD *TLSv1_server_method(void);	/* TLSv1.0 */
 const SSL_METHOD *TLSv1_client_method(void);	/* TLSv1.0 */
 
+const SSL_METHOD *TLSv1_1_method(void);		/* TLSv1.1 */
+const SSL_METHOD *TLSv1_1_server_method(void);	/* TLSv1.1 */
+const SSL_METHOD *TLSv1_1_client_method(void);	/* TLSv1.1 */
+
+const SSL_METHOD *TLSv1_2_method(void);		/* TLSv1.2 */
+const SSL_METHOD *TLSv1_2_server_method(void);	/* TLSv1.2 */
+const SSL_METHOD *TLSv1_2_client_method(void);	/* TLSv1.2 */
+
+
 const SSL_METHOD *DTLSv1_method(void);		/* DTLSv1.0 */
 const SSL_METHOD *DTLSv1_server_method(void);	/* DTLSv1.0 */
 const SSL_METHOD *DTLSv1_client_method(void);	/* DTLSv1.0 */
@@ -1655,6 +1883,7 @@ STACK_OF(SSL_CIPHER) *SSL_get_ciphers(const SSL *s);
 
 int SSL_do_handshake(SSL *s);
 int SSL_renegotiate(SSL *s);
+int SSL_renegotiate_abbreviated(SSL *s);
 int SSL_renegotiate_pending(SSL *s);
 int SSL_shutdown(SSL *s);
 
@@ -1706,6 +1935,7 @@ void SSL_set_info_callback(SSL *ssl,
 			   void (*cb)(const SSL *ssl,int type,int val));
 void (*SSL_get_info_callback(const SSL *ssl))(const SSL *ssl,int type,int val);
 int SSL_state(const SSL *ssl);
+void SSL_set_state(SSL *ssl, int state);
 
 void SSL_set_verify_result(SSL *ssl,long v);
 long SSL_get_verify_result(const SSL *ssl);
@@ -1806,6 +2036,9 @@ int SSL_set_session_ticket_ext_cb(SSL *s, tls_session_ticket_ext_cb_fn cb,
 /* Pre-shared secret session resumption functions */
 int SSL_set_session_secret_cb(SSL *s, tls_session_secret_cb_fn tls_session_secret_cb, void *arg);
 
+void SSL_set_debug(SSL *s, int debug);
+int SSL_cache_hit(SSL *s);
+
 /* BEGIN ERROR CODES */
 /* The following lines are auto generated by the script mkerr.pl. Any changes
  * made after this point may be overwritten when the script is next run.
@@ -1825,6 +2058,7 @@ void ERR_load_SSL_strings(void);
 #define SSL_F_DTLS1_ACCEPT				 246
 #define SSL_F_DTLS1_ADD_CERT_TO_BUF			 295
 #define SSL_F_DTLS1_BUFFER_RECORD			 247
+#define SSL_F_DTLS1_CHECK_TIMEOUT_NUM			 316
 #define SSL_F_DTLS1_CLIENT_HELLO			 248
 #define SSL_F_DTLS1_CONNECT				 249
 #define SSL_F_DTLS1_ENC					 250
@@ -1833,6 +2067,7 @@ void ERR_load_SSL_strings(void);
 #define SSL_F_DTLS1_GET_MESSAGE_FRAGMENT		 253
 #define SSL_F_DTLS1_GET_RECORD				 254
 #define SSL_F_DTLS1_HANDLE_TIMEOUT			 297
+#define SSL_F_DTLS1_HEARTBEAT				 305
 #define SSL_F_DTLS1_OUTPUT_CERT_CHAIN			 255
 #define SSL_F_DTLS1_PREPROCESS_FRAGMENT			 288
 #define SSL_F_DTLS1_PROCESS_OUT_OF_SEQ_MESSAGE		 256
@@ -1901,6 +2136,7 @@ void ERR_load_SSL_strings(void);
 #define SSL_F_SSL3_GET_KEY_EXCHANGE			 141
 #define SSL_F_SSL3_GET_MESSAGE				 142
 #define SSL_F_SSL3_GET_NEW_SESSION_TICKET		 283
+#define SSL_F_SSL3_GET_NEXT_PROTO			 306
 #define SSL_F_SSL3_GET_RECORD				 143
 #define SSL_F_SSL3_GET_SERVER_CERTIFICATE		 144
 #define SSL_F_SSL3_GET_SERVER_DONE			 145
@@ -1925,10 +2161,12 @@ void ERR_load_SSL_strings(void);
 #define SSL_F_SSL3_WRITE_PENDING			 159
 #define SSL_F_SSL_ADD_CLIENTHELLO_RENEGOTIATE_EXT	 298
 #define SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT		 277
+#define SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT		 307
 #define SSL_F_SSL_ADD_DIR_CERT_SUBJECTS_TO_STACK	 215
 #define SSL_F_SSL_ADD_FILE_CERT_SUBJECTS_TO_STACK	 216
 #define SSL_F_SSL_ADD_SERVERHELLO_RENEGOTIATE_EXT	 299
 #define SSL_F_SSL_ADD_SERVERHELLO_TLSEXT		 278
+#define SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT		 308
 #define SSL_F_SSL_BAD_METHOD				 160
 #define SSL_F_SSL_BYTES_TO_CIPHER_LIST			 161
 #define SSL_F_SSL_CERT_DUP				 221
@@ -1945,6 +2183,7 @@ void ERR_load_SSL_strings(void);
 #define SSL_F_SSL_CREATE_CIPHER_LIST			 166
 #define SSL_F_SSL_CTRL					 232
 #define SSL_F_SSL_CTX_CHECK_PRIVATE_KEY			 168
+#define SSL_F_SSL_CTX_MAKE_PROFILES			 309
 #define SSL_F_SSL_CTX_NEW				 169
 #define SSL_F_SSL_CTX_SET_CIPHER_LIST			 269
 #define SSL_F_SSL_CTX_SET_CLIENT_CERT_ENGINE		 290
@@ -1973,8 +2212,10 @@ void ERR_load_SSL_strings(void);
 #define SSL_F_SSL_NEW					 186
 #define SSL_F_SSL_PARSE_CLIENTHELLO_RENEGOTIATE_EXT	 300
 #define SSL_F_SSL_PARSE_CLIENTHELLO_TLSEXT		 302
+#define SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT	 310
 #define SSL_F_SSL_PARSE_SERVERHELLO_RENEGOTIATE_EXT	 301
 #define SSL_F_SSL_PARSE_SERVERHELLO_TLSEXT		 303
+#define SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT	 311
 #define SSL_F_SSL_PEEK					 270
 #define SSL_F_SSL_PREPARE_CLIENTHELLO_TLSEXT		 281
 #define SSL_F_SSL_PREPARE_SERVERHELLO_TLSEXT		 282
@@ -1983,6 +2224,7 @@ void ERR_load_SSL_strings(void);
 #define SSL_F_SSL_RSA_PUBLIC_ENCRYPT			 188
 #define SSL_F_SSL_SESSION_NEW				 189
 #define SSL_F_SSL_SESSION_PRINT_FP			 190
+#define SSL_F_SSL_SESSION_SET1_ID_CONTEXT		 312
 #define SSL_F_SSL_SESS_CERT_NEW				 225
 #define SSL_F_SSL_SET_CERT				 191
 #define SSL_F_SSL_SET_CIPHER_LIST			 271
@@ -1996,6 +2238,7 @@ void ERR_load_SSL_strings(void);
 #define SSL_F_SSL_SET_TRUST				 228
 #define SSL_F_SSL_SET_WFD				 196
 #define SSL_F_SSL_SHUTDOWN				 224
+#define SSL_F_SSL_SRP_CTX_INIT				 313
 #define SSL_F_SSL_UNDEFINED_CONST_FUNCTION		 243
 #define SSL_F_SSL_UNDEFINED_FUNCTION			 197
 #define SSL_F_SSL_UNDEFINED_VOID_FUNCTION		 244
@@ -2015,6 +2258,8 @@ void ERR_load_SSL_strings(void);
 #define SSL_F_TLS1_CHANGE_CIPHER_STATE			 209
 #define SSL_F_TLS1_CHECK_SERVERHELLO_TLSEXT		 274
 #define SSL_F_TLS1_ENC					 210
+#define SSL_F_TLS1_EXPORT_KEYING_MATERIAL		 314
+#define SSL_F_TLS1_HEARTBEAT				 315
 #define SSL_F_TLS1_PREPARE_CLIENTHELLO_TLSEXT		 275
 #define SSL_F_TLS1_PREPARE_SERVERHELLO_TLSEXT		 276
 #define SSL_F_TLS1_PRF					 284
@@ -2054,6 +2299,13 @@ void ERR_load_SSL_strings(void);
 #define SSL_R_BAD_RSA_MODULUS_LENGTH			 121
 #define SSL_R_BAD_RSA_SIGNATURE				 122
 #define SSL_R_BAD_SIGNATURE				 123
+#define SSL_R_BAD_SRP_A_LENGTH				 347
+#define SSL_R_BAD_SRP_B_LENGTH				 348
+#define SSL_R_BAD_SRP_G_LENGTH				 349
+#define SSL_R_BAD_SRP_N_LENGTH				 350
+#define SSL_R_BAD_SRP_S_LENGTH				 351
+#define SSL_R_BAD_SRTP_MKI_VALUE			 352
+#define SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST		 353
 #define SSL_R_BAD_SSL_FILETYPE				 124
 #define SSL_R_BAD_SSL_SESSION_ID_LENGTH			 125
 #define SSL_R_BAD_STATE					 126
@@ -2092,12 +2344,15 @@ void ERR_load_SSL_strings(void);
 #define SSL_R_ECC_CERT_SHOULD_HAVE_RSA_SIGNATURE	 322
 #define SSL_R_ECC_CERT_SHOULD_HAVE_SHA1_SIGNATURE	 323
 #define SSL_R_ECGROUP_TOO_LARGE_FOR_CIPHER		 310
+#define SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST	 354
 #define SSL_R_ENCRYPTED_LENGTH_TOO_LONG			 150
 #define SSL_R_ERROR_GENERATING_TMP_RSA_KEY		 282
 #define SSL_R_ERROR_IN_RECEIVED_CIPHER_LIST		 151
 #define SSL_R_EXCESSIVE_MESSAGE_SIZE			 152
 #define SSL_R_EXTRA_DATA_IN_MESSAGE			 153
 #define SSL_R_GOT_A_FIN_BEFORE_A_CCS			 154
+#define SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS		 355
+#define SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION		 356
 #define SSL_R_HTTPS_PROXY_REQUEST			 155
 #define SSL_R_HTTP_REQUEST				 156
 #define SSL_R_ILLEGAL_PADDING				 283
@@ -2106,6 +2361,7 @@ void ERR_load_SSL_strings(void);
 #define SSL_R_INVALID_COMMAND				 280
 #define SSL_R_INVALID_COMPRESSION_ALGORITHM		 341
 #define SSL_R_INVALID_PURPOSE				 278
+#define SSL_R_INVALID_SRP_USERNAME			 357
 #define SSL_R_INVALID_STATUS_RESPONSE			 328
 #define SSL_R_INVALID_TICKET_KEYS_LENGTH		 325
 #define SSL_R_INVALID_TRUST				 279
@@ -2135,6 +2391,7 @@ void ERR_load_SSL_strings(void);
 #define SSL_R_MISSING_RSA_CERTIFICATE			 168
 #define SSL_R_MISSING_RSA_ENCRYPTING_CERT		 169
 #define SSL_R_MISSING_RSA_SIGNING_CERT			 170
+#define SSL_R_MISSING_SRP_PARAM				 358
 #define SSL_R_MISSING_TMP_DH_KEY			 171
 #define SSL_R_MISSING_TMP_ECDH_KEY			 311
 #define SSL_R_MISSING_TMP_RSA_KEY			 172
@@ -2164,6 +2421,7 @@ void ERR_load_SSL_strings(void);
 #define SSL_R_NO_RENEGOTIATION				 339
 #define SSL_R_NO_REQUIRED_DIGEST			 324
 #define SSL_R_NO_SHARED_CIPHER				 193
+#define SSL_R_NO_SRTP_PROFILES				 359
 #define SSL_R_NO_VERIFY_CALLBACK			 194
 #define SSL_R_NULL_SSL_CTX				 195
 #define SSL_R_NULL_SSL_METHOD_PASSED			 196
@@ -2207,7 +2465,12 @@ void ERR_load_SSL_strings(void);
 #define SSL_R_SERVERHELLO_TLSEXT			 275
 #define SSL_R_SESSION_ID_CONTEXT_UNINITIALIZED		 277
 #define SSL_R_SHORT_READ				 219
+#define SSL_R_SIGNATURE_ALGORITHMS_ERROR		 360
 #define SSL_R_SIGNATURE_FOR_NON_SIGNING_CERTIFICATE	 220
+#define SSL_R_SRP_A_CALC				 361
+#define SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES		 362
+#define SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG	 363
+#define SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE		 364
 #define SSL_R_SSL23_DOING_SESSION_ID_REUSE		 221
 #define SSL_R_SSL2_CONNECTION_ID_TOO_LONG		 299
 #define SSL_R_SSL3_EXT_INVALID_ECPOINTFORMAT		 321
@@ -2252,6 +2515,9 @@ void ERR_load_SSL_strings(void);
 #define SSL_R_TLSV1_UNRECOGNIZED_NAME			 1112
 #define SSL_R_TLSV1_UNSUPPORTED_EXTENSION		 1110
 #define SSL_R_TLS_CLIENT_CERT_REQ_WITH_ANON_CIPHER	 232
+#define SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT		 365
+#define SSL_R_TLS_HEARTBEAT_PENDING			 366
+#define SSL_R_TLS_ILLEGAL_EXPORTER_LABEL		 367
 #define SSL_R_TLS_INVALID_ECPOINTFORMAT_LIST		 157
 #define SSL_R_TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST 233
 #define SSL_R_TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG	 234
@@ -2273,6 +2539,7 @@ void ERR_load_SSL_strings(void);
 #define SSL_R_UNKNOWN_CERTIFICATE_TYPE			 247
 #define SSL_R_UNKNOWN_CIPHER_RETURNED			 248
 #define SSL_R_UNKNOWN_CIPHER_TYPE			 249
+#define SSL_R_UNKNOWN_DIGEST				 368
 #define SSL_R_UNKNOWN_KEY_EXCHANGE_TYPE			 250
 #define SSL_R_UNKNOWN_PKEY_TYPE				 251
 #define SSL_R_UNKNOWN_PROTOCOL				 252
@@ -2287,12 +2554,14 @@ void ERR_load_SSL_strings(void);
 #define SSL_R_UNSUPPORTED_PROTOCOL			 258
 #define SSL_R_UNSUPPORTED_SSL_VERSION			 259
 #define SSL_R_UNSUPPORTED_STATUS_TYPE			 329
+#define SSL_R_USE_SRTP_NOT_NEGOTIATED			 369
 #define SSL_R_WRITE_BIO_NOT_SET				 260
 #define SSL_R_WRONG_CIPHER_RETURNED			 261
 #define SSL_R_WRONG_MESSAGE_TYPE			 262
 #define SSL_R_WRONG_NUMBER_OF_KEY_BITS			 263
 #define SSL_R_WRONG_SIGNATURE_LENGTH			 264
 #define SSL_R_WRONG_SIGNATURE_SIZE			 265
+#define SSL_R_WRONG_SIGNATURE_TYPE			 370
 #define SSL_R_WRONG_SSL_VERSION				 266
 #define SSL_R_WRONG_VERSION_NUMBER			 267
 #define SSL_R_X509_LIB					 268
diff --git a/src/lib/libssl/ssl2.h b/src/lib/libssl/ssl2.h
index 99a52ea0dd..eb25dcb0bf 100644
--- a/src/lib/libssl/ssl2.h
+++ b/src/lib/libssl/ssl2.h
@@ -155,6 +155,8 @@ extern "C" {
 #define  CERT		char
 #endif
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
 typedef struct ssl2_state_st
 	{
 	int three_byte_header;
@@ -219,6 +221,8 @@ typedef struct ssl2_state_st
 		} tmp;
 	} SSL2_STATE;
 
+#endif
+
 /* SSLv2 */
 /* client */
 #define SSL2_ST_SEND_CLIENT_HELLO_A		(0x10|SSL_ST_CONNECT)
diff --git a/src/lib/libssl/ssl3.h b/src/lib/libssl/ssl3.h
index 9c2c41287a..112e627de0 100644
--- a/src/lib/libssl/ssl3.h
+++ b/src/lib/libssl/ssl3.h
@@ -322,6 +322,7 @@ extern "C" {
 #define SSL3_RT_ALERT			21
 #define SSL3_RT_HANDSHAKE		22
 #define SSL3_RT_APPLICATION_DATA	23
+#define TLS1_RT_HEARTBEAT		24
 
 #define SSL3_AL_WARNING			1
 #define SSL3_AL_FATAL			2
@@ -339,6 +340,11 @@ extern "C" {
 #define SSL3_AD_CERTIFICATE_UNKNOWN	46
 #define SSL3_AD_ILLEGAL_PARAMETER	47	/* fatal */
 
+#define TLS1_HB_REQUEST		1
+#define TLS1_HB_RESPONSE	2
+	
+#ifndef OPENSSL_NO_SSL_INTERN
+
 typedef struct ssl3_record_st
 	{
 /*r */	int type;               /* type of record */
@@ -360,6 +366,8 @@ typedef struct ssl3_buffer_st
 	int left;               /* how many bytes left */
 	} SSL3_BUFFER;
 
+#endif
+
 #define SSL3_CT_RSA_SIGN			1
 #define SSL3_CT_DSS_SIGN			2
 #define SSL3_CT_RSA_FIXED_DH			3
@@ -379,6 +387,7 @@ typedef struct ssl3_buffer_st
 #define SSL3_FLAGS_POP_BUFFER			0x0004
 #define TLS1_FLAGS_TLS_PADDING_BUG		0x0008
 #define TLS1_FLAGS_SKIP_CERT_VERIFY		0x0010
+#define TLS1_FLAGS_KEEP_HANDSHAKE		0x0020
  
 /* SSL3_FLAGS_SGC_RESTART_DONE is set when we
  * restart a handshake because of MS SGC and so prevents us
@@ -391,6 +400,8 @@ typedef struct ssl3_buffer_st
  */
 #define SSL3_FLAGS_SGC_RESTART_DONE		0x0040
 
+#ifndef OPENSSL_NO_SSL_INTERN
+
 typedef struct ssl3_state_st
 	{
 	long flags;
@@ -475,7 +486,7 @@ typedef struct ssl3_state_st
 		int finish_md_len;
 		unsigned char peer_finish_md[EVP_MAX_MD_SIZE*2];
 		int peer_finish_md_len;
-		
+
 		unsigned long message_size;
 		int message_type;
 
@@ -523,13 +534,23 @@ typedef struct ssl3_state_st
         unsigned char previous_server_finished[EVP_MAX_MD_SIZE];
         unsigned char previous_server_finished_len;
         int send_connection_binding; /* TODOEKR */
+
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	/* Set if we saw the Next Protocol Negotiation extension from our peer. */
+	int next_proto_neg_seen;
+#endif
 	} SSL3_STATE;
 
+#endif
 
 /* SSLv3 */
 /*client */
 /* extra state */
 #define SSL3_ST_CW_FLUSH		(0x100|SSL_ST_CONNECT)
+#ifndef OPENSSL_NO_SCTP
+#define DTLS1_SCTP_ST_CW_WRITE_SOCK			(0x310|SSL_ST_CONNECT)
+#define DTLS1_SCTP_ST_CR_READ_SOCK			(0x320|SSL_ST_CONNECT)
+#endif	
 /* write to server */
 #define SSL3_ST_CW_CLNT_HELLO_A		(0x110|SSL_ST_CONNECT)
 #define SSL3_ST_CW_CLNT_HELLO_B		(0x111|SSL_ST_CONNECT)
@@ -557,6 +578,8 @@ typedef struct ssl3_state_st
 #define SSL3_ST_CW_CERT_VRFY_B		(0x191|SSL_ST_CONNECT)
 #define SSL3_ST_CW_CHANGE_A		(0x1A0|SSL_ST_CONNECT)
 #define SSL3_ST_CW_CHANGE_B		(0x1A1|SSL_ST_CONNECT)
+#define SSL3_ST_CW_NEXT_PROTO_A		(0x200|SSL_ST_CONNECT)
+#define SSL3_ST_CW_NEXT_PROTO_B		(0x201|SSL_ST_CONNECT)
 #define SSL3_ST_CW_FINISHED_A		(0x1B0|SSL_ST_CONNECT)
 #define SSL3_ST_CW_FINISHED_B		(0x1B1|SSL_ST_CONNECT)
 /* read from server */
@@ -572,6 +595,10 @@ typedef struct ssl3_state_st
 /* server */
 /* extra state */
 #define SSL3_ST_SW_FLUSH		(0x100|SSL_ST_ACCEPT)
+#ifndef OPENSSL_NO_SCTP
+#define DTLS1_SCTP_ST_SW_WRITE_SOCK			(0x310|SSL_ST_ACCEPT)
+#define DTLS1_SCTP_ST_SR_READ_SOCK			(0x320|SSL_ST_ACCEPT)
+#endif	
 /* read from client */
 /* Do not change the number values, they do matter */
 #define SSL3_ST_SR_CLNT_HELLO_A		(0x110|SSL_ST_ACCEPT)
@@ -602,6 +629,8 @@ typedef struct ssl3_state_st
 #define SSL3_ST_SR_CERT_VRFY_B		(0x1A1|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_CHANGE_A		(0x1B0|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_CHANGE_B		(0x1B1|SSL_ST_ACCEPT)
+#define SSL3_ST_SR_NEXT_PROTO_A		(0x210|SSL_ST_ACCEPT)
+#define SSL3_ST_SR_NEXT_PROTO_B		(0x211|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_FINISHED_A		(0x1C0|SSL_ST_ACCEPT)
 #define SSL3_ST_SR_FINISHED_B		(0x1C1|SSL_ST_ACCEPT)
 /* write to client */
@@ -626,6 +655,7 @@ typedef struct ssl3_state_st
 #define SSL3_MT_CLIENT_KEY_EXCHANGE		16
 #define SSL3_MT_FINISHED			20
 #define SSL3_MT_CERTIFICATE_STATUS		22
+#define SSL3_MT_NEXT_PROTO			67
 #define DTLS1_MT_HELLO_VERIFY_REQUEST    3
 
 
diff --git a/src/lib/libssl/ssl_algs.c b/src/lib/libssl/ssl_algs.c
index 0967b2dfe4..d443143c59 100644
--- a/src/lib/libssl/ssl_algs.c
+++ b/src/lib/libssl/ssl_algs.c
@@ -73,6 +73,9 @@ int SSL_library_init(void)
 #endif
 #ifndef OPENSSL_NO_RC4
 	EVP_add_cipher(EVP_rc4());
+#if !defined(OPENSSL_NO_MD5) && (defined(__x86_64) || defined(__x86_64__))
+	EVP_add_cipher(EVP_rc4_hmac_md5());
+#endif
 #endif  
 #ifndef OPENSSL_NO_RC2
 	EVP_add_cipher(EVP_rc2_cbc());
@@ -85,6 +88,12 @@ int SSL_library_init(void)
 	EVP_add_cipher(EVP_aes_128_cbc());
 	EVP_add_cipher(EVP_aes_192_cbc());
 	EVP_add_cipher(EVP_aes_256_cbc());
+	EVP_add_cipher(EVP_aes_128_gcm());
+	EVP_add_cipher(EVP_aes_256_gcm());
+#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1)
+	EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1());
+	EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1());
+#endif
 #endif
 #ifndef OPENSSL_NO_CAMELLIA
 	EVP_add_cipher(EVP_camellia_128_cbc());
diff --git a/src/lib/libssl/ssl_asn1.c b/src/lib/libssl/ssl_asn1.c
index d7f4c6087e..38540be1e5 100644
--- a/src/lib/libssl/ssl_asn1.c
+++ b/src/lib/libssl/ssl_asn1.c
@@ -114,6 +114,9 @@ typedef struct ssl_session_asn1_st
 	ASN1_OCTET_STRING psk_identity_hint;
 	ASN1_OCTET_STRING psk_identity;
 #endif /* OPENSSL_NO_PSK */
+#ifndef OPENSSL_NO_SRP
+	ASN1_OCTET_STRING srp_username;
+#endif /* OPENSSL_NO_SRP */
 	} SSL_SESSION_ASN1;
 
 int i2d_SSL_SESSION(SSL_SESSION *in, unsigned char **pp)
@@ -129,6 +132,9 @@ int i2d_SSL_SESSION(SSL_SESSION *in, unsigned char **pp)
 #ifndef OPENSSL_NO_COMP
 	unsigned char cbuf;
 	int v11=0;
+#endif
+#ifndef OPENSSL_NO_SRP
+	int v12=0;
 #endif
 	long l;
 	SSL_SESSION_ASN1 a;
@@ -267,6 +273,14 @@ int i2d_SSL_SESSION(SSL_SESSION *in, unsigned char **pp)
 		a.psk_identity.data=(unsigned char *)(in->psk_identity);
 		}
 #endif /* OPENSSL_NO_PSK */
+#ifndef OPENSSL_NO_SRP
+	if (in->srp_username)
+		{
+		a.srp_username.length=strlen(in->srp_username);
+		a.srp_username.type=V_ASN1_OCTET_STRING;
+		a.srp_username.data=(unsigned char *)(in->srp_username);
+		}
+#endif /* OPENSSL_NO_SRP */
 
 	M_ASN1_I2D_len(&(a.version),		i2d_ASN1_INTEGER);
 	M_ASN1_I2D_len(&(a.ssl_version),	i2d_ASN1_INTEGER);
@@ -307,6 +321,10 @@ int i2d_SSL_SESSION(SSL_SESSION *in, unsigned char **pp)
 	if (in->psk_identity)
         	M_ASN1_I2D_len_EXP_opt(&(a.psk_identity), i2d_ASN1_OCTET_STRING,8,v8);
 #endif /* OPENSSL_NO_PSK */
+#ifndef OPENSSL_NO_SRP
+	if (in->srp_username)
+        	M_ASN1_I2D_len_EXP_opt(&(a.srp_username), i2d_ASN1_OCTET_STRING,12,v12);
+#endif /* OPENSSL_NO_SRP */
 
 	M_ASN1_I2D_seq_total();
 
@@ -351,6 +369,10 @@ int i2d_SSL_SESSION(SSL_SESSION *in, unsigned char **pp)
 	if (in->compress_meth)
         	M_ASN1_I2D_put_EXP_opt(&(a.comp_id), i2d_ASN1_OCTET_STRING,11,v11);
 #endif
+#ifndef OPENSSL_NO_SRP
+	if (in->srp_username)
+		M_ASN1_I2D_put_EXP_opt(&(a.srp_username), i2d_ASN1_OCTET_STRING,12,v12);
+#endif /* OPENSSL_NO_SRP */
 	M_ASN1_I2D_finish();
 	}
 
@@ -549,6 +571,19 @@ SSL_SESSION *d2i_SSL_SESSION(SSL_SESSION **a, const unsigned char **pp,
 		}
 	else
 		ret->psk_identity_hint=NULL;
+
+	os.length=0;
+	os.data=NULL;
+	M_ASN1_D2I_get_EXP_opt(osp,d2i_ASN1_OCTET_STRING,8);
+	if (os.data)
+		{
+		ret->psk_identity = BUF_strndup((char *)os.data, os.length);
+		OPENSSL_free(os.data);
+		os.data = NULL;
+		os.length = 0;
+		}
+	else
+		ret->psk_identity=NULL;
 #endif /* OPENSSL_NO_PSK */
 
 #ifndef OPENSSL_NO_TLSEXT
@@ -588,5 +623,20 @@ SSL_SESSION *d2i_SSL_SESSION(SSL_SESSION **a, const unsigned char **pp,
 		}
 #endif
 
+#ifndef OPENSSL_NO_SRP
+	os.length=0;
+	os.data=NULL;
+	M_ASN1_D2I_get_EXP_opt(osp,d2i_ASN1_OCTET_STRING,12);
+	if (os.data)
+		{
+		ret->srp_username = BUF_strndup((char *)os.data, os.length);
+		OPENSSL_free(os.data);
+		os.data = NULL;
+		os.length = 0;
+		}
+	else
+		ret->srp_username=NULL;
+#endif /* OPENSSL_NO_SRP */
+
 	M_ASN1_D2I_Finish(a,SSL_SESSION_free,SSL_F_D2I_SSL_SESSION);
 	}
diff --git a/src/lib/libssl/ssl_cert.c b/src/lib/libssl/ssl_cert.c
index 27256eea81..917be31876 100644
--- a/src/lib/libssl/ssl_cert.c
+++ b/src/lib/libssl/ssl_cert.c
@@ -160,6 +160,21 @@ int SSL_get_ex_data_X509_STORE_CTX_idx(void)
 	return ssl_x509_store_ctx_idx;
 	}
 
+static void ssl_cert_set_default_md(CERT *cert)
+	{
+	/* Set digest values to defaults */
+#ifndef OPENSSL_NO_DSA
+	cert->pkeys[SSL_PKEY_DSA_SIGN].digest = EVP_dss1();
+#endif
+#ifndef OPENSSL_NO_RSA
+	cert->pkeys[SSL_PKEY_RSA_SIGN].digest = EVP_sha1();
+	cert->pkeys[SSL_PKEY_RSA_ENC].digest = EVP_sha1();
+#endif
+#ifndef OPENSSL_NO_ECDSA
+	cert->pkeys[SSL_PKEY_ECC].digest = EVP_ecdsa();
+#endif
+	}
+
 CERT *ssl_cert_new(void)
 	{
 	CERT *ret;
@@ -174,7 +189,7 @@ CERT *ssl_cert_new(void)
 
 	ret->key= &(ret->pkeys[SSL_PKEY_RSA_ENC]);
 	ret->references=1;
-
+	ssl_cert_set_default_md(ret);
 	return(ret);
 	}
 
@@ -307,6 +322,10 @@ CERT *ssl_cert_dup(CERT *cert)
 	 * chain is held inside SSL_CTX */
 
 	ret->references=1;
+	/* Set digests to defaults. NB: we don't copy existing values as they
+	 * will be set during handshake.
+	 */
+	ssl_cert_set_default_md(ret);
 
 	return(ret);
 	
diff --git a/src/lib/libssl/ssl_ciph.c b/src/lib/libssl/ssl_ciph.c
index 54ba7ef5b4..92d1e94d6a 100644
--- a/src/lib/libssl/ssl_ciph.c
+++ b/src/lib/libssl/ssl_ciph.c
@@ -162,11 +162,13 @@
 #define SSL_ENC_CAMELLIA256_IDX	9
 #define SSL_ENC_GOST89_IDX	10
 #define SSL_ENC_SEED_IDX    	11
-#define SSL_ENC_NUM_IDX		12
+#define SSL_ENC_AES128GCM_IDX	12
+#define SSL_ENC_AES256GCM_IDX	13
+#define SSL_ENC_NUM_IDX		14
 
 
 static const EVP_CIPHER *ssl_cipher_methods[SSL_ENC_NUM_IDX]={
-	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,
+	NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL
 	};
 
 #define SSL_COMP_NULL_IDX	0
@@ -179,28 +181,32 @@ static STACK_OF(SSL_COMP) *ssl_comp_methods=NULL;
 #define SSL_MD_SHA1_IDX	1
 #define SSL_MD_GOST94_IDX 2
 #define SSL_MD_GOST89MAC_IDX 3
+#define SSL_MD_SHA256_IDX 4
+#define SSL_MD_SHA384_IDX 5
 /*Constant SSL_MAX_DIGEST equal to size of digests array should be 
  * defined in the
  * ssl_locl.h */
 #define SSL_MD_NUM_IDX	SSL_MAX_DIGEST 
 static const EVP_MD *ssl_digest_methods[SSL_MD_NUM_IDX]={
-	NULL,NULL,NULL,NULL
+	NULL,NULL,NULL,NULL,NULL,NULL
 	};
 /* PKEY_TYPE for GOST89MAC is known in advance, but, because
  * implementation is engine-provided, we'll fill it only if
  * corresponding EVP_PKEY_METHOD is found 
  */
 static int  ssl_mac_pkey_id[SSL_MD_NUM_IDX]={
-	EVP_PKEY_HMAC,EVP_PKEY_HMAC,EVP_PKEY_HMAC,NID_undef
+	EVP_PKEY_HMAC,EVP_PKEY_HMAC,EVP_PKEY_HMAC,NID_undef,
+	EVP_PKEY_HMAC,EVP_PKEY_HMAC
 	};
 
 static int ssl_mac_secret_size[SSL_MD_NUM_IDX]={
-	0,0,0,0
+	0,0,0,0,0,0
 	};
 
 static int ssl_handshake_digest_flag[SSL_MD_NUM_IDX]={
 	SSL_HANDSHAKE_MAC_MD5,SSL_HANDSHAKE_MAC_SHA,
-	SSL_HANDSHAKE_MAC_GOST94,0
+	SSL_HANDSHAKE_MAC_GOST94, 0, SSL_HANDSHAKE_MAC_SHA256,
+	SSL_HANDSHAKE_MAC_SHA384
 	};
 
 #define CIPHER_ADD	1
@@ -247,6 +253,7 @@ static const SSL_CIPHER cipher_aliases[]={
 	{0,SSL_TXT_ECDH,0,    SSL_kECDHr|SSL_kECDHe|SSL_kEECDH,0,0,0,0,0,0,0,0},
 
         {0,SSL_TXT_kPSK,0,    SSL_kPSK,  0,0,0,0,0,0,0,0},
+	{0,SSL_TXT_kSRP,0,    SSL_kSRP,  0,0,0,0,0,0,0,0},
 	{0,SSL_TXT_kGOST,0, SSL_kGOST,0,0,0,0,0,0,0,0},
 
 	/* server authentication aliases */
@@ -273,6 +280,7 @@ static const SSL_CIPHER cipher_aliases[]={
 	{0,SSL_TXT_ADH,0,     SSL_kEDH,SSL_aNULL,0,0,0,0,0,0,0},
 	{0,SSL_TXT_AECDH,0,   SSL_kEECDH,SSL_aNULL,0,0,0,0,0,0,0},
         {0,SSL_TXT_PSK,0,     SSL_kPSK,SSL_aPSK,0,0,0,0,0,0,0},
+	{0,SSL_TXT_SRP,0,     SSL_kSRP,0,0,0,0,0,0,0,0},
 
 
 	/* symmetric encryption aliases */
@@ -283,9 +291,10 @@ static const SSL_CIPHER cipher_aliases[]={
 	{0,SSL_TXT_IDEA,0,    0,0,SSL_IDEA,  0,0,0,0,0,0},
 	{0,SSL_TXT_SEED,0,    0,0,SSL_SEED,  0,0,0,0,0,0},
 	{0,SSL_TXT_eNULL,0,   0,0,SSL_eNULL, 0,0,0,0,0,0},
-	{0,SSL_TXT_AES128,0,  0,0,SSL_AES128,0,0,0,0,0,0},
-	{0,SSL_TXT_AES256,0,  0,0,SSL_AES256,0,0,0,0,0,0},
-	{0,SSL_TXT_AES,0,     0,0,SSL_AES128|SSL_AES256,0,0,0,0,0,0},
+	{0,SSL_TXT_AES128,0,  0,0,SSL_AES128|SSL_AES128GCM,0,0,0,0,0,0},
+	{0,SSL_TXT_AES256,0,  0,0,SSL_AES256|SSL_AES256GCM,0,0,0,0,0,0},
+	{0,SSL_TXT_AES,0,     0,0,SSL_AES,0,0,0,0,0,0},
+	{0,SSL_TXT_AES_GCM,0, 0,0,SSL_AES128GCM|SSL_AES256GCM,0,0,0,0,0,0},
 	{0,SSL_TXT_CAMELLIA128,0,0,0,SSL_CAMELLIA128,0,0,0,0,0,0},
 	{0,SSL_TXT_CAMELLIA256,0,0,0,SSL_CAMELLIA256,0,0,0,0,0,0},
 	{0,SSL_TXT_CAMELLIA   ,0,0,0,SSL_CAMELLIA128|SSL_CAMELLIA256,0,0,0,0,0,0},
@@ -296,6 +305,8 @@ static const SSL_CIPHER cipher_aliases[]={
 	{0,SSL_TXT_SHA,0,     0,0,0,SSL_SHA1,  0,0,0,0,0},
 	{0,SSL_TXT_GOST94,0,     0,0,0,SSL_GOST94,  0,0,0,0,0},
 	{0,SSL_TXT_GOST89MAC,0,     0,0,0,SSL_GOST89MAC,  0,0,0,0,0},
+	{0,SSL_TXT_SHA256,0,    0,0,0,SSL_SHA256,  0,0,0,0,0},
+	{0,SSL_TXT_SHA384,0,    0,0,0,SSL_SHA384,  0,0,0,0,0},
 
 	/* protocol version aliases */
 	{0,SSL_TXT_SSLV2,0,   0,0,0,0,SSL_SSLV2, 0,0,0,0},
@@ -379,6 +390,11 @@ void ssl_load_ciphers(void)
 	ssl_cipher_methods[SSL_ENC_SEED_IDX]=
 	  EVP_get_cipherbyname(SN_seed_cbc);
 
+	ssl_cipher_methods[SSL_ENC_AES128GCM_IDX]=
+	  EVP_get_cipherbyname(SN_aes_128_gcm);
+	ssl_cipher_methods[SSL_ENC_AES256GCM_IDX]=
+	  EVP_get_cipherbyname(SN_aes_256_gcm);
+
 	ssl_digest_methods[SSL_MD_MD5_IDX]=
 		EVP_get_digestbyname(SN_md5);
 	ssl_mac_secret_size[SSL_MD_MD5_IDX]=
@@ -404,6 +420,14 @@ void ssl_load_ciphers(void)
 			ssl_mac_secret_size[SSL_MD_GOST89MAC_IDX]=32;
 		}		
 
+	ssl_digest_methods[SSL_MD_SHA256_IDX]=
+		EVP_get_digestbyname(SN_sha256);
+	ssl_mac_secret_size[SSL_MD_SHA256_IDX]=
+		EVP_MD_size(ssl_digest_methods[SSL_MD_SHA256_IDX]);
+	ssl_digest_methods[SSL_MD_SHA384_IDX]=
+		EVP_get_digestbyname(SN_sha384);
+	ssl_mac_secret_size[SSL_MD_SHA384_IDX]=
+		EVP_MD_size(ssl_digest_methods[SSL_MD_SHA384_IDX]);
 	}
 #ifndef OPENSSL_NO_COMP
 
@@ -526,6 +550,12 @@ int ssl_cipher_get_evp(const SSL_SESSION *s, const EVP_CIPHER **enc,
 	case SSL_SEED:
 		i=SSL_ENC_SEED_IDX;
 		break;
+	case SSL_AES128GCM:
+		i=SSL_ENC_AES128GCM_IDX;
+		break;
+	case SSL_AES256GCM:
+		i=SSL_ENC_AES256GCM_IDX;
+		break;
 	default:
 		i= -1;
 		break;
@@ -549,6 +579,12 @@ int ssl_cipher_get_evp(const SSL_SESSION *s, const EVP_CIPHER **enc,
 	case SSL_SHA1:
 		i=SSL_MD_SHA1_IDX;
 		break;
+	case SSL_SHA256:
+		i=SSL_MD_SHA256_IDX;
+		break;
+	case SSL_SHA384:
+		i=SSL_MD_SHA384_IDX;
+		break;
 	case SSL_GOST94:
 		i = SSL_MD_GOST94_IDX;
 		break;
@@ -564,17 +600,45 @@ int ssl_cipher_get_evp(const SSL_SESSION *s, const EVP_CIPHER **enc,
 		*md=NULL; 
 		if (mac_pkey_type!=NULL) *mac_pkey_type = NID_undef;
 		if (mac_secret_size!=NULL) *mac_secret_size = 0;
-
+		if (c->algorithm_mac == SSL_AEAD)
+			mac_pkey_type = NULL;
 	}
 	else
 	{
 		*md=ssl_digest_methods[i];
 		if (mac_pkey_type!=NULL) *mac_pkey_type = ssl_mac_pkey_id[i];
 		if (mac_secret_size!=NULL) *mac_secret_size = ssl_mac_secret_size[i];
-	}	
+	}
+
+	if ((*enc != NULL) &&
+	    (*md != NULL || (EVP_CIPHER_flags(*enc)&EVP_CIPH_FLAG_AEAD_CIPHER)) &&
+	    (!mac_pkey_type||*mac_pkey_type != NID_undef))
+		{
+		const EVP_CIPHER *evp;
+
+		if (s->ssl_version>>8 != TLS1_VERSION_MAJOR ||
+		    s->ssl_version < TLS1_VERSION)
+			return 1;
 
-	if ((*enc != NULL) && (*md != NULL) && (!mac_pkey_type||*mac_pkey_type != NID_undef))
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return 1;
+#endif
+
+		if	(c->algorithm_enc == SSL_RC4 &&
+			 c->algorithm_mac == SSL_MD5 &&
+			 (evp=EVP_get_cipherbyname("RC4-HMAC-MD5")))
+			*enc = evp, *md = NULL;
+		else if (c->algorithm_enc == SSL_AES128 &&
+			 c->algorithm_mac == SSL_SHA1 &&
+			 (evp=EVP_get_cipherbyname("AES-128-CBC-HMAC-SHA1")))
+			*enc = evp, *md = NULL;
+		else if (c->algorithm_enc == SSL_AES256 &&
+			 c->algorithm_mac == SSL_SHA1 &&
+			 (evp=EVP_get_cipherbyname("AES-256-CBC-HMAC-SHA1")))
+			*enc = evp, *md = NULL;
 		return(1);
+		}
 	else
 		return(0);
 	}
@@ -585,9 +649,11 @@ int ssl_get_handshake_digest(int idx, long *mask, const EVP_MD **md)
 		{
 		return 0;
 		}
-	if (ssl_handshake_digest_flag[idx]==0) return 0;
 	*mask = ssl_handshake_digest_flag[idx];
-	*md = ssl_digest_methods[idx];
+	if (*mask)
+		*md = ssl_digest_methods[idx];
+	else
+		*md = NULL;
 	return 1;
 }
 
@@ -661,6 +727,9 @@ static void ssl_cipher_get_disabled(unsigned long *mkey, unsigned long *auth, un
 #ifdef OPENSSL_NO_PSK
 	*mkey |= SSL_kPSK;
 	*auth |= SSL_aPSK;
+#endif
+#ifdef OPENSSL_NO_SRP
+	*mkey |= SSL_kSRP;
 #endif
 	/* Check for presence of GOST 34.10 algorithms, and if they
 	 * do not present, disable  appropriate auth and key exchange */
@@ -687,6 +756,8 @@ static void ssl_cipher_get_disabled(unsigned long *mkey, unsigned long *auth, un
 	*enc |= (ssl_cipher_methods[SSL_ENC_IDEA_IDX] == NULL) ? SSL_IDEA:0;
 	*enc |= (ssl_cipher_methods[SSL_ENC_AES128_IDX] == NULL) ? SSL_AES128:0;
 	*enc |= (ssl_cipher_methods[SSL_ENC_AES256_IDX] == NULL) ? SSL_AES256:0;
+	*enc |= (ssl_cipher_methods[SSL_ENC_AES128GCM_IDX] == NULL) ? SSL_AES128GCM:0;
+	*enc |= (ssl_cipher_methods[SSL_ENC_AES256GCM_IDX] == NULL) ? SSL_AES256GCM:0;
 	*enc |= (ssl_cipher_methods[SSL_ENC_CAMELLIA128_IDX] == NULL) ? SSL_CAMELLIA128:0;
 	*enc |= (ssl_cipher_methods[SSL_ENC_CAMELLIA256_IDX] == NULL) ? SSL_CAMELLIA256:0;
 	*enc |= (ssl_cipher_methods[SSL_ENC_GOST89_IDX] == NULL) ? SSL_eGOST2814789CNT:0;
@@ -694,6 +765,8 @@ static void ssl_cipher_get_disabled(unsigned long *mkey, unsigned long *auth, un
 
 	*mac |= (ssl_digest_methods[SSL_MD_MD5_IDX ] == NULL) ? SSL_MD5 :0;
 	*mac |= (ssl_digest_methods[SSL_MD_SHA1_IDX] == NULL) ? SSL_SHA1:0;
+	*mac |= (ssl_digest_methods[SSL_MD_SHA256_IDX] == NULL) ? SSL_SHA256:0;
+	*mac |= (ssl_digest_methods[SSL_MD_SHA384_IDX] == NULL) ? SSL_SHA384:0;
 	*mac |= (ssl_digest_methods[SSL_MD_GOST94_IDX] == NULL) ? SSL_GOST94:0;
 	*mac |= (ssl_digest_methods[SSL_MD_GOST89MAC_IDX] == NULL || ssl_mac_pkey_id[SSL_MD_GOST89MAC_IDX]==NID_undef)? SSL_GOST89MAC:0;
 
@@ -724,6 +797,9 @@ static void ssl_cipher_collect_ciphers(const SSL_METHOD *ssl_method,
 		c = ssl_method->get_cipher(i);
 		/* drop those that use any of that is not available */
 		if ((c != NULL) && c->valid &&
+#ifdef OPENSSL_FIPS
+		    (!FIPS_mode() || (c->algo_strength & SSL_FIPS)) &&
+#endif
 		    !(c->algorithm_mkey & disabled_mkey) &&
 		    !(c->algorithm_auth & disabled_auth) &&
 		    !(c->algorithm_enc & disabled_enc) &&
@@ -1423,7 +1499,11 @@ STACK_OF(SSL_CIPHER) *ssl_create_cipher_list(const SSL_METHOD *ssl_method,
 	 */
 	for (curr = head; curr != NULL; curr = curr->next)
 		{
+#ifdef OPENSSL_FIPS
+		if (curr->active && (!FIPS_mode() || curr->cipher->algo_strength & SSL_FIPS))
+#else
 		if (curr->active)
+#endif
 			{
 			sk_SSL_CIPHER_push(cipherstack, curr->cipher);
 #ifdef CIPHER_DEBUG
@@ -1480,6 +1560,8 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len)
 		ver="SSLv2";
 	else if (alg_ssl & SSL_SSLV3)
 		ver="SSLv3";
+	else if (alg_ssl & SSL_TLSV1_2)
+		ver="TLSv1.2";
 	else
 		ver="unknown";
 
@@ -1512,6 +1594,9 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len)
 	case SSL_kPSK:
 		kx="PSK";
 		break;
+	case SSL_kSRP:
+		kx="SRP";
+		break;
 	default:
 		kx="unknown";
 		}
@@ -1574,6 +1659,12 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len)
 	case SSL_AES256:
 		enc="AES(256)";
 		break;
+	case SSL_AES128GCM:
+		enc="AESGCM(128)";
+		break;
+	case SSL_AES256GCM:
+		enc="AESGCM(256)";
+		break;
 	case SSL_CAMELLIA128:
 		enc="Camellia(128)";
 		break;
@@ -1596,6 +1687,15 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len)
 	case SSL_SHA1:
 		mac="SHA1";
 		break;
+	case SSL_SHA256:
+		mac="SHA256";
+		break;
+	case SSL_SHA384:
+		mac="SHA384";
+		break;
+	case SSL_AEAD:
+		mac="AEAD";
+		break;
 	default:
 		mac="unknown";
 		break;
@@ -1653,6 +1753,11 @@ int SSL_CIPHER_get_bits(const SSL_CIPHER *c, int *alg_bits)
 	return(ret);
 	}
 
+unsigned long SSL_CIPHER_get_id(const SSL_CIPHER *c)
+	{
+	return c->id;
+	}
+
 SSL_COMP *ssl3_comp_find(STACK_OF(SSL_COMP) *sk, int n)
 	{
 	SSL_COMP *ctmp;
diff --git a/src/lib/libssl/ssl_err.c b/src/lib/libssl/ssl_err.c
index e9be77109f..2577c6895a 100644
--- a/src/lib/libssl/ssl_err.c
+++ b/src/lib/libssl/ssl_err.c
@@ -80,6 +80,7 @@ static ERR_STRING_DATA SSL_str_functs[]=
 {ERR_FUNC(SSL_F_DTLS1_ACCEPT),	"DTLS1_ACCEPT"},
 {ERR_FUNC(SSL_F_DTLS1_ADD_CERT_TO_BUF),	"DTLS1_ADD_CERT_TO_BUF"},
 {ERR_FUNC(SSL_F_DTLS1_BUFFER_RECORD),	"DTLS1_BUFFER_RECORD"},
+{ERR_FUNC(SSL_F_DTLS1_CHECK_TIMEOUT_NUM),	"DTLS1_CHECK_TIMEOUT_NUM"},
 {ERR_FUNC(SSL_F_DTLS1_CLIENT_HELLO),	"DTLS1_CLIENT_HELLO"},
 {ERR_FUNC(SSL_F_DTLS1_CONNECT),	"DTLS1_CONNECT"},
 {ERR_FUNC(SSL_F_DTLS1_ENC),	"DTLS1_ENC"},
@@ -88,6 +89,7 @@ static ERR_STRING_DATA SSL_str_functs[]=
 {ERR_FUNC(SSL_F_DTLS1_GET_MESSAGE_FRAGMENT),	"DTLS1_GET_MESSAGE_FRAGMENT"},
 {ERR_FUNC(SSL_F_DTLS1_GET_RECORD),	"DTLS1_GET_RECORD"},
 {ERR_FUNC(SSL_F_DTLS1_HANDLE_TIMEOUT),	"DTLS1_HANDLE_TIMEOUT"},
+{ERR_FUNC(SSL_F_DTLS1_HEARTBEAT),	"DTLS1_HEARTBEAT"},
 {ERR_FUNC(SSL_F_DTLS1_OUTPUT_CERT_CHAIN),	"DTLS1_OUTPUT_CERT_CHAIN"},
 {ERR_FUNC(SSL_F_DTLS1_PREPROCESS_FRAGMENT),	"DTLS1_PREPROCESS_FRAGMENT"},
 {ERR_FUNC(SSL_F_DTLS1_PROCESS_OUT_OF_SEQ_MESSAGE),	"DTLS1_PROCESS_OUT_OF_SEQ_MESSAGE"},
@@ -156,6 +158,7 @@ static ERR_STRING_DATA SSL_str_functs[]=
 {ERR_FUNC(SSL_F_SSL3_GET_KEY_EXCHANGE),	"SSL3_GET_KEY_EXCHANGE"},
 {ERR_FUNC(SSL_F_SSL3_GET_MESSAGE),	"SSL3_GET_MESSAGE"},
 {ERR_FUNC(SSL_F_SSL3_GET_NEW_SESSION_TICKET),	"SSL3_GET_NEW_SESSION_TICKET"},
+{ERR_FUNC(SSL_F_SSL3_GET_NEXT_PROTO),	"SSL3_GET_NEXT_PROTO"},
 {ERR_FUNC(SSL_F_SSL3_GET_RECORD),	"SSL3_GET_RECORD"},
 {ERR_FUNC(SSL_F_SSL3_GET_SERVER_CERTIFICATE),	"SSL3_GET_SERVER_CERTIFICATE"},
 {ERR_FUNC(SSL_F_SSL3_GET_SERVER_DONE),	"SSL3_GET_SERVER_DONE"},
@@ -180,10 +183,12 @@ static ERR_STRING_DATA SSL_str_functs[]=
 {ERR_FUNC(SSL_F_SSL3_WRITE_PENDING),	"SSL3_WRITE_PENDING"},
 {ERR_FUNC(SSL_F_SSL_ADD_CLIENTHELLO_RENEGOTIATE_EXT),	"SSL_ADD_CLIENTHELLO_RENEGOTIATE_EXT"},
 {ERR_FUNC(SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT),	"SSL_ADD_CLIENTHELLO_TLSEXT"},
+{ERR_FUNC(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT),	"SSL_ADD_CLIENTHELLO_USE_SRTP_EXT"},
 {ERR_FUNC(SSL_F_SSL_ADD_DIR_CERT_SUBJECTS_TO_STACK),	"SSL_add_dir_cert_subjects_to_stack"},
 {ERR_FUNC(SSL_F_SSL_ADD_FILE_CERT_SUBJECTS_TO_STACK),	"SSL_add_file_cert_subjects_to_stack"},
 {ERR_FUNC(SSL_F_SSL_ADD_SERVERHELLO_RENEGOTIATE_EXT),	"SSL_ADD_SERVERHELLO_RENEGOTIATE_EXT"},
 {ERR_FUNC(SSL_F_SSL_ADD_SERVERHELLO_TLSEXT),	"SSL_ADD_SERVERHELLO_TLSEXT"},
+{ERR_FUNC(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT),	"SSL_ADD_SERVERHELLO_USE_SRTP_EXT"},
 {ERR_FUNC(SSL_F_SSL_BAD_METHOD),	"SSL_BAD_METHOD"},
 {ERR_FUNC(SSL_F_SSL_BYTES_TO_CIPHER_LIST),	"SSL_BYTES_TO_CIPHER_LIST"},
 {ERR_FUNC(SSL_F_SSL_CERT_DUP),	"SSL_CERT_DUP"},
@@ -200,6 +205,7 @@ static ERR_STRING_DATA SSL_str_functs[]=
 {ERR_FUNC(SSL_F_SSL_CREATE_CIPHER_LIST),	"SSL_CREATE_CIPHER_LIST"},
 {ERR_FUNC(SSL_F_SSL_CTRL),	"SSL_ctrl"},
 {ERR_FUNC(SSL_F_SSL_CTX_CHECK_PRIVATE_KEY),	"SSL_CTX_check_private_key"},
+{ERR_FUNC(SSL_F_SSL_CTX_MAKE_PROFILES),	"SSL_CTX_MAKE_PROFILES"},
 {ERR_FUNC(SSL_F_SSL_CTX_NEW),	"SSL_CTX_new"},
 {ERR_FUNC(SSL_F_SSL_CTX_SET_CIPHER_LIST),	"SSL_CTX_set_cipher_list"},
 {ERR_FUNC(SSL_F_SSL_CTX_SET_CLIENT_CERT_ENGINE),	"SSL_CTX_set_client_cert_engine"},
@@ -228,8 +234,10 @@ static ERR_STRING_DATA SSL_str_functs[]=
 {ERR_FUNC(SSL_F_SSL_NEW),	"SSL_new"},
 {ERR_FUNC(SSL_F_SSL_PARSE_CLIENTHELLO_RENEGOTIATE_EXT),	"SSL_PARSE_CLIENTHELLO_RENEGOTIATE_EXT"},
 {ERR_FUNC(SSL_F_SSL_PARSE_CLIENTHELLO_TLSEXT),	"SSL_PARSE_CLIENTHELLO_TLSEXT"},
+{ERR_FUNC(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT),	"SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT"},
 {ERR_FUNC(SSL_F_SSL_PARSE_SERVERHELLO_RENEGOTIATE_EXT),	"SSL_PARSE_SERVERHELLO_RENEGOTIATE_EXT"},
 {ERR_FUNC(SSL_F_SSL_PARSE_SERVERHELLO_TLSEXT),	"SSL_PARSE_SERVERHELLO_TLSEXT"},
+{ERR_FUNC(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT),	"SSL_PARSE_SERVERHELLO_USE_SRTP_EXT"},
 {ERR_FUNC(SSL_F_SSL_PEEK),	"SSL_peek"},
 {ERR_FUNC(SSL_F_SSL_PREPARE_CLIENTHELLO_TLSEXT),	"SSL_PREPARE_CLIENTHELLO_TLSEXT"},
 {ERR_FUNC(SSL_F_SSL_PREPARE_SERVERHELLO_TLSEXT),	"SSL_PREPARE_SERVERHELLO_TLSEXT"},
@@ -238,6 +246,7 @@ static ERR_STRING_DATA SSL_str_functs[]=
 {ERR_FUNC(SSL_F_SSL_RSA_PUBLIC_ENCRYPT),	"SSL_RSA_PUBLIC_ENCRYPT"},
 {ERR_FUNC(SSL_F_SSL_SESSION_NEW),	"SSL_SESSION_new"},
 {ERR_FUNC(SSL_F_SSL_SESSION_PRINT_FP),	"SSL_SESSION_print_fp"},
+{ERR_FUNC(SSL_F_SSL_SESSION_SET1_ID_CONTEXT),	"SSL_SESSION_set1_id_context"},
 {ERR_FUNC(SSL_F_SSL_SESS_CERT_NEW),	"SSL_SESS_CERT_NEW"},
 {ERR_FUNC(SSL_F_SSL_SET_CERT),	"SSL_SET_CERT"},
 {ERR_FUNC(SSL_F_SSL_SET_CIPHER_LIST),	"SSL_set_cipher_list"},
@@ -251,6 +260,7 @@ static ERR_STRING_DATA SSL_str_functs[]=
 {ERR_FUNC(SSL_F_SSL_SET_TRUST),	"SSL_set_trust"},
 {ERR_FUNC(SSL_F_SSL_SET_WFD),	"SSL_set_wfd"},
 {ERR_FUNC(SSL_F_SSL_SHUTDOWN),	"SSL_shutdown"},
+{ERR_FUNC(SSL_F_SSL_SRP_CTX_INIT),	"SSL_SRP_CTX_init"},
 {ERR_FUNC(SSL_F_SSL_UNDEFINED_CONST_FUNCTION),	"SSL_UNDEFINED_CONST_FUNCTION"},
 {ERR_FUNC(SSL_F_SSL_UNDEFINED_FUNCTION),	"SSL_UNDEFINED_FUNCTION"},
 {ERR_FUNC(SSL_F_SSL_UNDEFINED_VOID_FUNCTION),	"SSL_UNDEFINED_VOID_FUNCTION"},
@@ -270,6 +280,8 @@ static ERR_STRING_DATA SSL_str_functs[]=
 {ERR_FUNC(SSL_F_TLS1_CHANGE_CIPHER_STATE),	"TLS1_CHANGE_CIPHER_STATE"},
 {ERR_FUNC(SSL_F_TLS1_CHECK_SERVERHELLO_TLSEXT),	"TLS1_CHECK_SERVERHELLO_TLSEXT"},
 {ERR_FUNC(SSL_F_TLS1_ENC),	"TLS1_ENC"},
+{ERR_FUNC(SSL_F_TLS1_EXPORT_KEYING_MATERIAL),	"TLS1_EXPORT_KEYING_MATERIAL"},
+{ERR_FUNC(SSL_F_TLS1_HEARTBEAT),	"SSL_F_TLS1_HEARTBEAT"},
 {ERR_FUNC(SSL_F_TLS1_PREPARE_CLIENTHELLO_TLSEXT),	"TLS1_PREPARE_CLIENTHELLO_TLSEXT"},
 {ERR_FUNC(SSL_F_TLS1_PREPARE_SERVERHELLO_TLSEXT),	"TLS1_PREPARE_SERVERHELLO_TLSEXT"},
 {ERR_FUNC(SSL_F_TLS1_PRF),	"tls1_prf"},
@@ -312,6 +324,13 @@ static ERR_STRING_DATA SSL_str_reasons[]=
 {ERR_REASON(SSL_R_BAD_RSA_MODULUS_LENGTH),"bad rsa modulus length"},
 {ERR_REASON(SSL_R_BAD_RSA_SIGNATURE)     ,"bad rsa signature"},
 {ERR_REASON(SSL_R_BAD_SIGNATURE)         ,"bad signature"},
+{ERR_REASON(SSL_R_BAD_SRP_A_LENGTH)      ,"bad srp a length"},
+{ERR_REASON(SSL_R_BAD_SRP_B_LENGTH)      ,"bad srp b length"},
+{ERR_REASON(SSL_R_BAD_SRP_G_LENGTH)      ,"bad srp g length"},
+{ERR_REASON(SSL_R_BAD_SRP_N_LENGTH)      ,"bad srp n length"},
+{ERR_REASON(SSL_R_BAD_SRP_S_LENGTH)      ,"bad srp s length"},
+{ERR_REASON(SSL_R_BAD_SRTP_MKI_VALUE)    ,"bad srtp mki value"},
+{ERR_REASON(SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST),"bad srtp protection profile list"},
 {ERR_REASON(SSL_R_BAD_SSL_FILETYPE)      ,"bad ssl filetype"},
 {ERR_REASON(SSL_R_BAD_SSL_SESSION_ID_LENGTH),"bad ssl session id length"},
 {ERR_REASON(SSL_R_BAD_STATE)             ,"bad state"},
@@ -350,12 +369,15 @@ static ERR_STRING_DATA SSL_str_reasons[]=
 {ERR_REASON(SSL_R_ECC_CERT_SHOULD_HAVE_RSA_SIGNATURE),"ecc cert should have rsa signature"},
 {ERR_REASON(SSL_R_ECC_CERT_SHOULD_HAVE_SHA1_SIGNATURE),"ecc cert should have sha1 signature"},
 {ERR_REASON(SSL_R_ECGROUP_TOO_LARGE_FOR_CIPHER),"ecgroup too large for cipher"},
+{ERR_REASON(SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST),"empty srtp protection profile list"},
 {ERR_REASON(SSL_R_ENCRYPTED_LENGTH_TOO_LONG),"encrypted length too long"},
 {ERR_REASON(SSL_R_ERROR_GENERATING_TMP_RSA_KEY),"error generating tmp rsa key"},
 {ERR_REASON(SSL_R_ERROR_IN_RECEIVED_CIPHER_LIST),"error in received cipher list"},
 {ERR_REASON(SSL_R_EXCESSIVE_MESSAGE_SIZE),"excessive message size"},
 {ERR_REASON(SSL_R_EXTRA_DATA_IN_MESSAGE) ,"extra data in message"},
 {ERR_REASON(SSL_R_GOT_A_FIN_BEFORE_A_CCS),"got a fin before a ccs"},
+{ERR_REASON(SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS),"got next proto before a ccs"},
+{ERR_REASON(SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION),"got next proto without seeing extension"},
 {ERR_REASON(SSL_R_HTTPS_PROXY_REQUEST)   ,"https proxy request"},
 {ERR_REASON(SSL_R_HTTP_REQUEST)          ,"http request"},
 {ERR_REASON(SSL_R_ILLEGAL_PADDING)       ,"illegal padding"},
@@ -364,6 +386,7 @@ static ERR_STRING_DATA SSL_str_reasons[]=
 {ERR_REASON(SSL_R_INVALID_COMMAND)       ,"invalid command"},
 {ERR_REASON(SSL_R_INVALID_COMPRESSION_ALGORITHM),"invalid compression algorithm"},
 {ERR_REASON(SSL_R_INVALID_PURPOSE)       ,"invalid purpose"},
+{ERR_REASON(SSL_R_INVALID_SRP_USERNAME)  ,"invalid srp username"},
 {ERR_REASON(SSL_R_INVALID_STATUS_RESPONSE),"invalid status response"},
 {ERR_REASON(SSL_R_INVALID_TICKET_KEYS_LENGTH),"invalid ticket keys length"},
 {ERR_REASON(SSL_R_INVALID_TRUST)         ,"invalid trust"},
@@ -393,6 +416,7 @@ static ERR_STRING_DATA SSL_str_reasons[]=
 {ERR_REASON(SSL_R_MISSING_RSA_CERTIFICATE),"missing rsa certificate"},
 {ERR_REASON(SSL_R_MISSING_RSA_ENCRYPTING_CERT),"missing rsa encrypting cert"},
 {ERR_REASON(SSL_R_MISSING_RSA_SIGNING_CERT),"missing rsa signing cert"},
+{ERR_REASON(SSL_R_MISSING_SRP_PARAM)     ,"can't find SRP server param"},
 {ERR_REASON(SSL_R_MISSING_TMP_DH_KEY)    ,"missing tmp dh key"},
 {ERR_REASON(SSL_R_MISSING_TMP_ECDH_KEY)  ,"missing tmp ecdh key"},
 {ERR_REASON(SSL_R_MISSING_TMP_RSA_KEY)   ,"missing tmp rsa key"},
@@ -422,6 +446,7 @@ static ERR_STRING_DATA SSL_str_reasons[]=
 {ERR_REASON(SSL_R_NO_RENEGOTIATION)      ,"no renegotiation"},
 {ERR_REASON(SSL_R_NO_REQUIRED_DIGEST)    ,"digest requred for handshake isn't computed"},
 {ERR_REASON(SSL_R_NO_SHARED_CIPHER)      ,"no shared cipher"},
+{ERR_REASON(SSL_R_NO_SRTP_PROFILES)      ,"no srtp profiles"},
 {ERR_REASON(SSL_R_NO_VERIFY_CALLBACK)    ,"no verify callback"},
 {ERR_REASON(SSL_R_NULL_SSL_CTX)          ,"null ssl ctx"},
 {ERR_REASON(SSL_R_NULL_SSL_METHOD_PASSED),"null ssl method passed"},
@@ -465,7 +490,12 @@ static ERR_STRING_DATA SSL_str_reasons[]=
 {ERR_REASON(SSL_R_SERVERHELLO_TLSEXT)    ,"serverhello tlsext"},
 {ERR_REASON(SSL_R_SESSION_ID_CONTEXT_UNINITIALIZED),"session id context uninitialized"},
 {ERR_REASON(SSL_R_SHORT_READ)            ,"short read"},
+{ERR_REASON(SSL_R_SIGNATURE_ALGORITHMS_ERROR),"signature algorithms error"},
 {ERR_REASON(SSL_R_SIGNATURE_FOR_NON_SIGNING_CERTIFICATE),"signature for non signing certificate"},
+{ERR_REASON(SSL_R_SRP_A_CALC)            ,"error with the srp params"},
+{ERR_REASON(SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES),"srtp could not allocate profiles"},
+{ERR_REASON(SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG),"srtp protection profile list too long"},
+{ERR_REASON(SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE),"srtp unknown protection profile"},
 {ERR_REASON(SSL_R_SSL23_DOING_SESSION_ID_REUSE),"ssl23 doing session id reuse"},
 {ERR_REASON(SSL_R_SSL2_CONNECTION_ID_TOO_LONG),"ssl2 connection id too long"},
 {ERR_REASON(SSL_R_SSL3_EXT_INVALID_ECPOINTFORMAT),"ssl3 ext invalid ecpointformat"},
@@ -510,6 +540,9 @@ static ERR_STRING_DATA SSL_str_reasons[]=
 {ERR_REASON(SSL_R_TLSV1_UNRECOGNIZED_NAME),"tlsv1 unrecognized name"},
 {ERR_REASON(SSL_R_TLSV1_UNSUPPORTED_EXTENSION),"tlsv1 unsupported extension"},
 {ERR_REASON(SSL_R_TLS_CLIENT_CERT_REQ_WITH_ANON_CIPHER),"tls client cert req with anon cipher"},
+{ERR_REASON(SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT),"peer does not accept heartbearts"},
+{ERR_REASON(SSL_R_TLS_HEARTBEAT_PENDING) ,"heartbeat request already pending"},
+{ERR_REASON(SSL_R_TLS_ILLEGAL_EXPORTER_LABEL),"tls illegal exporter label"},
 {ERR_REASON(SSL_R_TLS_INVALID_ECPOINTFORMAT_LIST),"tls invalid ecpointformat list"},
 {ERR_REASON(SSL_R_TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST),"tls peer did not respond with certificate list"},
 {ERR_REASON(SSL_R_TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG),"tls rsa encrypted value length is wrong"},
@@ -531,6 +564,7 @@ static ERR_STRING_DATA SSL_str_reasons[]=
 {ERR_REASON(SSL_R_UNKNOWN_CERTIFICATE_TYPE),"unknown certificate type"},
 {ERR_REASON(SSL_R_UNKNOWN_CIPHER_RETURNED),"unknown cipher returned"},
 {ERR_REASON(SSL_R_UNKNOWN_CIPHER_TYPE)   ,"unknown cipher type"},
+{ERR_REASON(SSL_R_UNKNOWN_DIGEST)        ,"unknown digest"},
 {ERR_REASON(SSL_R_UNKNOWN_KEY_EXCHANGE_TYPE),"unknown key exchange type"},
 {ERR_REASON(SSL_R_UNKNOWN_PKEY_TYPE)     ,"unknown pkey type"},
 {ERR_REASON(SSL_R_UNKNOWN_PROTOCOL)      ,"unknown protocol"},
@@ -545,12 +579,14 @@ static ERR_STRING_DATA SSL_str_reasons[]=
 {ERR_REASON(SSL_R_UNSUPPORTED_PROTOCOL)  ,"unsupported protocol"},
 {ERR_REASON(SSL_R_UNSUPPORTED_SSL_VERSION),"unsupported ssl version"},
 {ERR_REASON(SSL_R_UNSUPPORTED_STATUS_TYPE),"unsupported status type"},
+{ERR_REASON(SSL_R_USE_SRTP_NOT_NEGOTIATED),"use srtp not negotiated"},
 {ERR_REASON(SSL_R_WRITE_BIO_NOT_SET)     ,"write bio not set"},
 {ERR_REASON(SSL_R_WRONG_CIPHER_RETURNED) ,"wrong cipher returned"},
 {ERR_REASON(SSL_R_WRONG_MESSAGE_TYPE)    ,"wrong message type"},
 {ERR_REASON(SSL_R_WRONG_NUMBER_OF_KEY_BITS),"wrong number of key bits"},
 {ERR_REASON(SSL_R_WRONG_SIGNATURE_LENGTH),"wrong signature length"},
 {ERR_REASON(SSL_R_WRONG_SIGNATURE_SIZE)  ,"wrong signature size"},
+{ERR_REASON(SSL_R_WRONG_SIGNATURE_TYPE)  ,"wrong signature type"},
 {ERR_REASON(SSL_R_WRONG_SSL_VERSION)     ,"wrong ssl version"},
 {ERR_REASON(SSL_R_WRONG_VERSION_NUMBER)  ,"wrong version number"},
 {ERR_REASON(SSL_R_X509_LIB)              ,"x509 lib"},
diff --git a/src/lib/libssl/ssl_lib.c b/src/lib/libssl/ssl_lib.c
index 8e89911f48..f82d071d6e 100644
--- a/src/lib/libssl/ssl_lib.c
+++ b/src/lib/libssl/ssl_lib.c
@@ -176,7 +176,10 @@ SSL3_ENC_METHOD ssl3_undef_enc_method={
 	0,	/* client_finished_label_len */
 	NULL,	/* server_finished_label */
 	0,	/* server_finished_label_len */
-	(int (*)(int))ssl_undefined_function
+	(int (*)(int))ssl_undefined_function,
+	(int (*)(SSL *, unsigned char *, size_t, const char *,
+		 size_t, const unsigned char *, size_t,
+		 int use_context)) ssl_undefined_function,
 	};
 
 int SSL_clear(SSL *s)
@@ -202,9 +205,9 @@ int SSL_clear(SSL *s)
        * needed because SSL_clear is not called when doing renegotiation) */
 	/* This is set if we are doing dynamic renegotiation so keep
 	 * the old cipher.  It is sort of a SSL_clear_lite :-) */
-	if (s->new_session) return(1);
+	if (s->renegotiate) return(1);
 #else
-	if (s->new_session)
+	if (s->renegotiate)
 		{
 		SSLerr(SSL_F_SSL_CLEAR,ERR_R_INTERNAL_ERROR);
 		return 0;
@@ -353,6 +356,9 @@ SSL *SSL_new(SSL_CTX *ctx)
 	s->tlsext_ocsp_resplen = -1;
 	CRYPTO_add(&ctx->references,1,CRYPTO_LOCK_SSL_CTX);
 	s->initial_ctx=ctx;
+# ifndef OPENSSL_NO_NEXTPROTONEG
+	s->next_proto_negotiated = NULL;
+# endif
 #endif
 
 	s->verify_result=X509_V_OK;
@@ -586,6 +592,14 @@ void SSL_free(SSL *s)
 		kssl_ctx_free(s->kssl_ctx);
 #endif	/* OPENSSL_NO_KRB5 */
 
+#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG)
+	if (s->next_proto_negotiated)
+		OPENSSL_free(s->next_proto_negotiated);
+#endif
+
+        if (s->srtp_profiles)
+            sk_SRTP_PROTECTION_PROFILE_free(s->srtp_profiles);
+
 	OPENSSL_free(s);
 	}
 
@@ -1008,10 +1022,21 @@ int SSL_shutdown(SSL *s)
 
 int SSL_renegotiate(SSL *s)
 	{
-	if (s->new_session == 0)
-		{
-		s->new_session=1;
-		}
+	if (s->renegotiate == 0)
+		s->renegotiate=1;
+
+	s->new_session=1;
+
+	return(s->method->ssl_renegotiate(s));
+	}
+
+int SSL_renegotiate_abbreviated(SSL *s)
+	{
+	if (s->renegotiate == 0)
+		s->renegotiate=1;
+
+	s->new_session=0;
+
 	return(s->method->ssl_renegotiate(s));
 	}
 
@@ -1019,7 +1044,7 @@ int SSL_renegotiate_pending(SSL *s)
 	{
 	/* becomes true when negotiation is requested;
 	 * false again once a handshake has finished */
-	return (s->new_session != 0);
+	return (s->renegotiate != 0);
 	}
 
 long SSL_ctrl(SSL *s,int cmd,long larg,void *parg)
@@ -1054,8 +1079,10 @@ long SSL_ctrl(SSL *s,int cmd,long larg,void *parg)
 		s->max_cert_list=larg;
 		return(l);
 	case SSL_CTRL_SET_MTU:
+#ifndef OPENSSL_NO_DTLS1
 		if (larg < (long)dtls1_min_mtu())
 			return 0;
+#endif
 
 		if (SSL_version(s) == DTLS1_VERSION ||
 		    SSL_version(s) == DTLS1_BAD_VER)
@@ -1358,6 +1385,10 @@ int ssl_cipher_list_to_bytes(SSL *s,STACK_OF(SSL_CIPHER) *sk,unsigned char *p,
 	for (i=0; i<sk_SSL_CIPHER_num(sk); i++)
 		{
 		c=sk_SSL_CIPHER_value(sk,i);
+		/* Skip TLS v1.2 only ciphersuites if lower than v1.2 */
+		if ((c->algorithm_ssl & SSL_TLSV1_2) && 
+			(TLS1_get_client_version(s) < TLS1_2_VERSION))
+			continue;
 #ifndef OPENSSL_NO_KRB5
 		if (((c->algorithm_mkey & SSL_kKRB5) || (c->algorithm_auth & SSL_aKRB5)) &&
 		    nokrb5)
@@ -1375,7 +1406,7 @@ int ssl_cipher_list_to_bytes(SSL *s,STACK_OF(SSL_CIPHER) *sk,unsigned char *p,
 	/* If p == q, no ciphers and caller indicates an error. Otherwise
 	 * add SCSV if not renegotiating.
 	 */
-	if (p != q && !s->new_session)
+	if (p != q && !s->renegotiate)
 		{
 		static SSL_CIPHER scsv =
 			{
@@ -1422,7 +1453,7 @@ STACK_OF(SSL_CIPHER) *ssl_bytes_to_cipher_list(SSL *s,unsigned char *p,int num,
 			(p[n-1] == (SSL3_CK_SCSV & 0xff)))
 			{
 			/* SCSV fatal if renegotiating */
-			if (s->new_session)
+			if (s->renegotiate)
 				{
 				SSLerr(SSL_F_SSL_BYTES_TO_CIPHER_LIST,SSL_R_SCSV_RECEIVED_WHEN_RENEGOTIATING);
 				ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_HANDSHAKE_FAILURE); 
@@ -1479,8 +1510,137 @@ int SSL_get_servername_type(const SSL *s)
 		return TLSEXT_NAMETYPE_host_name;
 	return -1;
 	}
+
+# ifndef OPENSSL_NO_NEXTPROTONEG
+/* SSL_select_next_proto implements the standard protocol selection. It is
+ * expected that this function is called from the callback set by
+ * SSL_CTX_set_next_proto_select_cb.
+ *
+ * The protocol data is assumed to be a vector of 8-bit, length prefixed byte
+ * strings. The length byte itself is not included in the length. A byte
+ * string of length 0 is invalid. No byte string may be truncated.
+ *
+ * The current, but experimental algorithm for selecting the protocol is:
+ *
+ * 1) If the server doesn't support NPN then this is indicated to the
+ * callback. In this case, the client application has to abort the connection
+ * or have a default application level protocol.
+ *
+ * 2) If the server supports NPN, but advertises an empty list then the
+ * client selects the first protcol in its list, but indicates via the
+ * API that this fallback case was enacted.
+ *
+ * 3) Otherwise, the client finds the first protocol in the server's list
+ * that it supports and selects this protocol. This is because it's
+ * assumed that the server has better information about which protocol
+ * a client should use.
+ *
+ * 4) If the client doesn't support any of the server's advertised
+ * protocols, then this is treated the same as case 2.
+ *
+ * It returns either
+ * OPENSSL_NPN_NEGOTIATED if a common protocol was found, or
+ * OPENSSL_NPN_NO_OVERLAP if the fallback case was reached.
+ */
+int SSL_select_next_proto(unsigned char **out, unsigned char *outlen, const unsigned char *server, unsigned int server_len, const unsigned char *client, unsigned int client_len)
+	{
+	unsigned int i, j;
+	const unsigned char *result;
+	int status = OPENSSL_NPN_UNSUPPORTED;
+
+	/* For each protocol in server preference order, see if we support it. */
+	for (i = 0; i < server_len; )
+		{
+		for (j = 0; j < client_len; )
+			{
+			if (server[i] == client[j] &&
+			    memcmp(&server[i+1], &client[j+1], server[i]) == 0)
+				{
+				/* We found a match */
+				result = &server[i];
+				status = OPENSSL_NPN_NEGOTIATED;
+				goto found;
+				}
+			j += client[j];
+			j++;
+			}
+		i += server[i];
+		i++;
+		}
+
+	/* There's no overlap between our protocols and the server's list. */
+	result = client;
+	status = OPENSSL_NPN_NO_OVERLAP;
+
+	found:
+	*out = (unsigned char *) result + 1;
+	*outlen = result[0];
+	return status;
+	}
+
+/* SSL_get0_next_proto_negotiated sets *data and *len to point to the client's
+ * requested protocol for this connection and returns 0. If the client didn't
+ * request any protocol, then *data is set to NULL.
+ *
+ * Note that the client can request any protocol it chooses. The value returned
+ * from this function need not be a member of the list of supported protocols
+ * provided by the callback.
+ */
+void SSL_get0_next_proto_negotiated(const SSL *s, const unsigned char **data, unsigned *len)
+	{
+	*data = s->next_proto_negotiated;
+	if (!*data) {
+		*len = 0;
+	} else {
+		*len = s->next_proto_negotiated_len;
+	}
+}
+
+/* SSL_CTX_set_next_protos_advertised_cb sets a callback that is called when a
+ * TLS server needs a list of supported protocols for Next Protocol
+ * Negotiation. The returned list must be in wire format.  The list is returned
+ * by setting |out| to point to it and |outlen| to its length. This memory will
+ * not be modified, but one should assume that the SSL* keeps a reference to
+ * it.
+ *
+ * The callback should return SSL_TLSEXT_ERR_OK if it wishes to advertise. Otherwise, no
+ * such extension will be included in the ServerHello. */
+void SSL_CTX_set_next_protos_advertised_cb(SSL_CTX *ctx, int (*cb) (SSL *ssl, const unsigned char **out, unsigned int *outlen, void *arg), void *arg)
+	{
+	ctx->next_protos_advertised_cb = cb;
+	ctx->next_protos_advertised_cb_arg = arg;
+	}
+
+/* SSL_CTX_set_next_proto_select_cb sets a callback that is called when a
+ * client needs to select a protocol from the server's provided list. |out|
+ * must be set to point to the selected protocol (which may be within |in|).
+ * The length of the protocol name must be written into |outlen|. The server's
+ * advertised protocols are provided in |in| and |inlen|. The callback can
+ * assume that |in| is syntactically valid.
+ *
+ * The client must select a protocol. It is fatal to the connection if this
+ * callback returns a value other than SSL_TLSEXT_ERR_OK.
+ */
+void SSL_CTX_set_next_proto_select_cb(SSL_CTX *ctx, int (*cb) (SSL *s, unsigned char **out, unsigned char *outlen, const unsigned char *in, unsigned int inlen, void *arg), void *arg)
+	{
+	ctx->next_proto_select_cb = cb;
+	ctx->next_proto_select_cb_arg = arg;
+	}
+# endif
 #endif
 
+int SSL_export_keying_material(SSL *s, unsigned char *out, size_t olen,
+	const char *label, size_t llen, const unsigned char *p, size_t plen,
+	int use_context)
+	{
+	if (s->version < TLS1_VERSION)
+		return -1;
+
+	return s->method->ssl3_enc->export_keying_material(s, out, olen, label,
+							   llen, p, plen,
+							   use_context);
+	}
+
 static unsigned long ssl_session_hash(const SSL_SESSION *a)
 	{
 	unsigned long l;
@@ -1524,6 +1684,14 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *meth)
 		return(NULL);
 		}
 
+#ifdef OPENSSL_FIPS
+	if (FIPS_mode() && (meth->version < TLS1_VERSION))	
+		{
+		SSLerr(SSL_F_SSL_CTX_NEW, SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE);
+		return NULL;
+		}
+#endif
+
 	if (SSL_get_ex_data_X509_STORE_CTX_idx() < 0)
 		{
 		SSLerr(SSL_F_SSL_CTX_NEW,SSL_R_X509_VERIFICATION_SETUP_PROBLEMS);
@@ -1643,12 +1811,19 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *meth)
 	ret->tlsext_status_cb = 0;
 	ret->tlsext_status_arg = NULL;
 
+# ifndef OPENSSL_NO_NEXTPROTONEG
+	ret->next_protos_advertised_cb = 0;
+	ret->next_proto_select_cb = 0;
+# endif
 #endif
 #ifndef OPENSSL_NO_PSK
 	ret->psk_identity_hint=NULL;
 	ret->psk_client_callback=NULL;
 	ret->psk_server_callback=NULL;
 #endif
+#ifndef OPENSSL_NO_SRP
+	SSL_CTX_SRP_CTX_init(ret);
+#endif
 #ifndef OPENSSL_NO_BUF_FREELISTS
 	ret->freelist_max_len = SSL_MAX_BUF_FREELIST_LEN_DEFAULT;
 	ret->rbuf_freelist = OPENSSL_malloc(sizeof(SSL3_BUF_FREELIST));
@@ -1777,10 +1952,16 @@ void SSL_CTX_free(SSL_CTX *a)
 	a->comp_methods = NULL;
 #endif
 
+        if (a->srtp_profiles)
+                sk_SRTP_PROTECTION_PROFILE_free(a->srtp_profiles);
+
 #ifndef OPENSSL_NO_PSK
 	if (a->psk_identity_hint)
 		OPENSSL_free(a->psk_identity_hint);
 #endif
+#ifndef OPENSSL_NO_SRP
+	SSL_CTX_SRP_CTX_free(a);
+#endif
 #ifndef OPENSSL_NO_ENGINE
 	if (a->client_cert_engine)
 		ENGINE_finish(a->client_cert_engine);
@@ -2034,12 +2215,13 @@ void ssl_set_cert_masks(CERT *c, const SSL_CIPHER *cipher)
 
 #ifndef OPENSSL_NO_EC
 
-int ssl_check_srvr_ecc_cert_and_alg(X509 *x, const SSL_CIPHER *cs)
+int ssl_check_srvr_ecc_cert_and_alg(X509 *x, SSL *s)
 	{
 	unsigned long alg_k, alg_a;
 	EVP_PKEY *pkey = NULL;
 	int keysize = 0;
 	int signature_nid = 0, md_nid = 0, pk_nid = 0;
+	const SSL_CIPHER *cs = s->s3->tmp.new_cipher;
 
 	alg_k = cs->algorithm_mkey;
 	alg_a = cs->algorithm_auth;
@@ -2069,7 +2251,7 @@ int ssl_check_srvr_ecc_cert_and_alg(X509 *x, const SSL_CIPHER *cs)
 			SSLerr(SSL_F_SSL_CHECK_SRVR_ECC_CERT_AND_ALG, SSL_R_ECC_CERT_NOT_FOR_KEY_AGREEMENT);
 			return 0;
 			}
-		if (alg_k & SSL_kECDHe)
+		if ((alg_k & SSL_kECDHe) && TLS1_get_version(s) < TLS1_2_VERSION)
 			{
 			/* signature alg must be ECDSA */
 			if (pk_nid != NID_X9_62_id_ecPublicKey)
@@ -2078,7 +2260,7 @@ int ssl_check_srvr_ecc_cert_and_alg(X509 *x, const SSL_CIPHER *cs)
 				return 0;
 				}
 			}
-		if (alg_k & SSL_kECDHr)
+		if ((alg_k & SSL_kECDHr) && TLS1_get_version(s) < TLS1_2_VERSION)
 			{
 			/* signature alg must be RSA */
 
@@ -2168,34 +2350,36 @@ X509 *ssl_get_server_send_cert(SSL *s)
 	return(c->pkeys[i].x509);
 	}
 
-EVP_PKEY *ssl_get_sign_pkey(SSL *s,const SSL_CIPHER *cipher)
+EVP_PKEY *ssl_get_sign_pkey(SSL *s,const SSL_CIPHER *cipher, const EVP_MD **pmd)
 	{
 	unsigned long alg_a;
 	CERT *c;
+	int idx = -1;
 
 	alg_a = cipher->algorithm_auth;
 	c=s->cert;
 
 	if ((alg_a & SSL_aDSS) &&
 		(c->pkeys[SSL_PKEY_DSA_SIGN].privatekey != NULL))
-		return(c->pkeys[SSL_PKEY_DSA_SIGN].privatekey);
+		idx = SSL_PKEY_DSA_SIGN;
 	else if (alg_a & SSL_aRSA)
 		{
 		if (c->pkeys[SSL_PKEY_RSA_SIGN].privatekey != NULL)
-			return(c->pkeys[SSL_PKEY_RSA_SIGN].privatekey);
+			idx = SSL_PKEY_RSA_SIGN;
 		else if (c->pkeys[SSL_PKEY_RSA_ENC].privatekey != NULL)
-			return(c->pkeys[SSL_PKEY_RSA_ENC].privatekey);
-		else
-			return(NULL);
+			idx = SSL_PKEY_RSA_ENC;
 		}
 	else if ((alg_a & SSL_aECDSA) &&
 	         (c->pkeys[SSL_PKEY_ECC].privatekey != NULL))
-		return(c->pkeys[SSL_PKEY_ECC].privatekey);
-	else /* if (alg_a & SSL_aNULL) */
+		idx = SSL_PKEY_ECC;
+	if (idx == -1)
 		{
 		SSLerr(SSL_F_SSL_GET_SIGN_PKEY,ERR_R_INTERNAL_ERROR);
 		return(NULL);
 		}
+	if (pmd)
+		*pmd = c->pkeys[idx].digest;
+	return c->pkeys[idx].privatekey;
 	}
 
 void ssl_update_cache(SSL *s,int mode)
@@ -2420,6 +2604,10 @@ SSL_METHOD *ssl_bad_method(int ver)
 
 const char *SSL_get_version(const SSL *s)
 	{
+	if (s->version == TLS1_2_VERSION)
+		return("TLSv1.2");
+	else if (s->version == TLS1_1_VERSION)
+		return("TLSv1.1");
 	if (s->version == TLS1_VERSION)
 		return("TLSv1");
 	else if (s->version == SSL3_VERSION)
@@ -2514,6 +2702,7 @@ SSL *SSL_dup(SSL *s)
 	ret->in_handshake = s->in_handshake;
 	ret->handshake_func = s->handshake_func;
 	ret->server = s->server;
+	ret->renegotiate = s->renegotiate;
 	ret->new_session = s->new_session;
 	ret->quiet_shutdown = s->quiet_shutdown;
 	ret->shutdown=s->shutdown;
@@ -2779,6 +2968,11 @@ int SSL_state(const SSL *ssl)
 	return(ssl->state);
 	}
 
+void SSL_set_state(SSL *ssl, int state)
+	{
+	ssl->state = state;
+	}
+
 void SSL_set_verify_result(SSL *ssl,long arg)
 	{
 	ssl->verify_result=arg;
@@ -3037,6 +3231,16 @@ void ssl_clear_hash_ctx(EVP_MD_CTX **hash)
 	*hash=NULL;
 }
 
+void SSL_set_debug(SSL *s, int debug)
+	{
+	s->debug = debug;
+	}
+
+int SSL_cache_hit(SSL *s)
+	{
+	return s->hit;
+	}
+
 #if defined(_WINDLL) && defined(OPENSSL_SYS_WIN16)
 #include "../crypto/bio/bss_file.c"
 #endif
@@ -3045,4 +3249,3 @@ IMPLEMENT_STACK_OF(SSL_CIPHER)
 IMPLEMENT_STACK_OF(SSL_COMP)
 IMPLEMENT_OBJ_BSEARCH_GLOBAL_CMP_FN(SSL_CIPHER, SSL_CIPHER,
 				    ssl_cipher_id);
-
diff --git a/src/lib/libssl/ssl_locl.h b/src/lib/libssl/ssl_locl.h
index cea622a2a6..d87fd51cfa 100644
--- a/src/lib/libssl/ssl_locl.h
+++ b/src/lib/libssl/ssl_locl.h
@@ -170,7 +170,7 @@
 # define OPENSSL_EXTERN OPENSSL_EXPORT
 #endif
 
-#define PKCS1_CHECK
+#undef PKCS1_CHECK
 
 #define c2l(c,l)	(l = ((unsigned long)(*((c)++)))     , \
 			 l|=(((unsigned long)(*((c)++)))<< 8), \
@@ -289,6 +289,7 @@
 #define SSL_kEECDH		0x00000080L /* ephemeral ECDH */
 #define SSL_kPSK		0x00000100L /* PSK */
 #define SSL_kGOST       0x00000200L /* GOST key exchange */
+#define SSL_kSRP        0x00000400L /* SRP */
 
 /* Bits for algorithm_auth (server authentication) */
 #define SSL_aRSA		0x00000001L /* RSA auth */
@@ -316,21 +317,29 @@
 #define SSL_CAMELLIA256		0x00000200L
 #define SSL_eGOST2814789CNT	0x00000400L
 #define SSL_SEED		0x00000800L
+#define SSL_AES128GCM		0x00001000L
+#define SSL_AES256GCM		0x00002000L
 
-#define SSL_AES        		(SSL_AES128|SSL_AES256)
+#define SSL_AES        		(SSL_AES128|SSL_AES256|SSL_AES128GCM|SSL_AES256GCM)
 #define SSL_CAMELLIA		(SSL_CAMELLIA128|SSL_CAMELLIA256)
 
 
 /* Bits for algorithm_mac (symmetric authentication) */
+
 #define SSL_MD5			0x00000001L
 #define SSL_SHA1		0x00000002L
 #define SSL_GOST94      0x00000004L
 #define SSL_GOST89MAC   0x00000008L
+#define SSL_SHA256		0x00000010L
+#define SSL_SHA384		0x00000020L
+/* Not a real MAC, just an indication it is part of cipher */
+#define SSL_AEAD		0x00000040L
 
 /* Bits for algorithm_ssl (protocol version) */
 #define SSL_SSLV2		0x00000001L
 #define SSL_SSLV3		0x00000002L
 #define SSL_TLSV1		SSL_SSLV3	/* for now */
+#define SSL_TLSV1_2		0x00000004L
 
 
 /* Bits for algorithm2 (handshake digests and other extra flags) */
@@ -338,15 +347,21 @@
 #define SSL_HANDSHAKE_MAC_MD5 0x10
 #define SSL_HANDSHAKE_MAC_SHA 0x20
 #define SSL_HANDSHAKE_MAC_GOST94 0x40
+#define SSL_HANDSHAKE_MAC_SHA256 0x80
+#define SSL_HANDSHAKE_MAC_SHA384 0x100
 #define SSL_HANDSHAKE_MAC_DEFAULT (SSL_HANDSHAKE_MAC_MD5 | SSL_HANDSHAKE_MAC_SHA)
 
 /* When adding new digest in the ssl_ciph.c and increment SSM_MD_NUM_IDX
  * make sure to update this constant too */
-#define SSL_MAX_DIGEST 4
+#define SSL_MAX_DIGEST 6
+
+#define TLS1_PRF_DGST_MASK	(0xff << TLS1_PRF_DGST_SHIFT)
 
-#define TLS1_PRF_DGST_SHIFT 8
+#define TLS1_PRF_DGST_SHIFT 10
 #define TLS1_PRF_MD5 (SSL_HANDSHAKE_MAC_MD5 << TLS1_PRF_DGST_SHIFT)
 #define TLS1_PRF_SHA1 (SSL_HANDSHAKE_MAC_SHA << TLS1_PRF_DGST_SHIFT)
+#define TLS1_PRF_SHA256 (SSL_HANDSHAKE_MAC_SHA256 << TLS1_PRF_DGST_SHIFT)
+#define TLS1_PRF_SHA384 (SSL_HANDSHAKE_MAC_SHA384 << TLS1_PRF_DGST_SHIFT)
 #define TLS1_PRF_GOST94 (SSL_HANDSHAKE_MAC_GOST94 << TLS1_PRF_DGST_SHIFT)
 #define TLS1_PRF (TLS1_PRF_MD5 | TLS1_PRF_SHA1)
 
@@ -457,6 +472,8 @@ typedef struct cert_pkey_st
 	{
 	X509 *x509;
 	EVP_PKEY *privatekey;
+	/* Digest to use when signing */
+	const EVP_MD *digest;
 	} CERT_PKEY;
 
 typedef struct cert_st
@@ -554,6 +571,10 @@ typedef struct ssl3_enc_method
 	const char *server_finished_label;
 	int server_finished_label_len;
 	int (*alert_value)(int);
+	int (*export_keying_material)(SSL *, unsigned char *, size_t,
+				      const char *, size_t,
+				      const unsigned char *, size_t,
+				      int use_context);
 	} SSL3_ENC_METHOD;
 
 #ifndef OPENSSL_NO_COMP
@@ -591,11 +612,12 @@ extern SSL3_ENC_METHOD TLSv1_enc_data;
 extern SSL3_ENC_METHOD SSLv3_enc_data;
 extern SSL3_ENC_METHOD DTLSv1_enc_data;
 
-#define IMPLEMENT_tls1_meth_func(func_name, s_accept, s_connect, s_get_meth) \
+#define IMPLEMENT_tls_meth_func(version, func_name, s_accept, s_connect, \
+				s_get_meth) \
 const SSL_METHOD *func_name(void)  \
 	{ \
 	static const SSL_METHOD func_name##_data= { \
-		TLS1_VERSION, \
+		version, \
 		tls1_new, \
 		tls1_clear, \
 		tls1_free, \
@@ -669,7 +691,7 @@ const SSL_METHOD *func_name(void)  \
 const SSL_METHOD *func_name(void)  \
 	{ \
 	static const SSL_METHOD func_name##_data= { \
-	TLS1_VERSION, \
+	TLS1_2_VERSION, \
 	tls1_new, \
 	tls1_clear, \
 	tls1_free, \
@@ -752,7 +774,7 @@ const SSL_METHOD *func_name(void)  \
 		ssl3_read, \
 		ssl3_peek, \
 		ssl3_write, \
-		ssl3_shutdown, \
+		dtls1_shutdown, \
 		ssl3_renegotiate, \
 		ssl3_renegotiate_check, \
 		dtls1_get_message, \
@@ -809,7 +831,7 @@ int ssl_undefined_function(SSL *s);
 int ssl_undefined_void_function(void);
 int ssl_undefined_const_function(const SSL *s);
 X509 *ssl_get_server_send_cert(SSL *);
-EVP_PKEY *ssl_get_sign_pkey(SSL *,const SSL_CIPHER *);
+EVP_PKEY *ssl_get_sign_pkey(SSL *s,const SSL_CIPHER *c, const EVP_MD **pmd);
 int ssl_cert_type(X509 *x,EVP_PKEY *pkey);
 void ssl_set_cert_masks(CERT *c, const SSL_CIPHER *cipher);
 STACK_OF(SSL_CIPHER) *ssl_get_ciphers_by_id(SSL *s);
@@ -943,6 +965,7 @@ void dtls1_get_ccs_header(unsigned char *data, struct ccs_header_st *ccs_hdr);
 void dtls1_reset_seq_numbers(SSL *s, int rw);
 long dtls1_default_timeout(void);
 struct timeval* dtls1_get_timeout(SSL *s, struct timeval* timeleft);
+int dtls1_check_timeout_num(SSL *s);
 int dtls1_handle_timeout(SSL *s);
 const SSL_CIPHER *dtls1_get_cipher(unsigned int u);
 void dtls1_start_timer(SSL *s);
@@ -968,6 +991,9 @@ int ssl3_get_server_certificate(SSL *s);
 int ssl3_check_cert_and_algorithm(SSL *s);
 #ifndef OPENSSL_NO_TLSEXT
 int ssl3_check_finished(SSL *s);
+# ifndef OPENSSL_NO_NEXTPROTONEG
+int ssl3_send_next_proto(SSL *s);
+# endif
 #endif
 
 int dtls1_client_hello(SSL *s);
@@ -986,6 +1012,9 @@ int ssl3_check_client_hello(SSL *s);
 int ssl3_get_client_certificate(SSL *s);
 int ssl3_get_client_key_exchange(SSL *s);
 int ssl3_get_cert_verify(SSL *s);
+#ifndef OPENSSL_NO_NEXTPROTONEG
+int ssl3_get_next_proto(SSL *s);
+#endif
 
 int dtls1_send_hello_request(SSL *s);
 int dtls1_send_server_hello(SSL *s);
@@ -1013,6 +1042,7 @@ int	dtls1_connect(SSL *s);
 void dtls1_free(SSL *s);
 void dtls1_clear(SSL *s);
 long dtls1_ctrl(SSL *s,int cmd, long larg, void *parg);
+int dtls1_shutdown(SSL *s);
 
 long dtls1_get_message(SSL *s, int st1, int stn, int mt, long max, int *ok);
 int dtls1_get_record(SSL *s);
@@ -1033,12 +1063,15 @@ int tls1_cert_verify_mac(SSL *s, int md_nid, unsigned char *p);
 int tls1_mac(SSL *ssl, unsigned char *md, int snd);
 int tls1_generate_master_secret(SSL *s, unsigned char *out,
 	unsigned char *p, int len);
+int tls1_export_keying_material(SSL *s, unsigned char *out, size_t olen,
+	const char *label, size_t llen,
+	const unsigned char *p, size_t plen, int use_context);
 int tls1_alert_code(int code);
 int ssl3_alert_code(int code);
 int ssl_ok(SSL *s);
 
 #ifndef OPENSSL_NO_ECDH
-int ssl_check_srvr_ecc_cert_and_alg(X509 *x, const SSL_CIPHER *cs);
+int ssl_check_srvr_ecc_cert_and_alg(X509 *x, SSL *s);
 #endif
 
 SSL_COMP *ssl3_comp_find(STACK_OF(SSL_COMP) *sk, int n);
@@ -1058,6 +1091,13 @@ int ssl_prepare_serverhello_tlsext(SSL *s);
 int ssl_check_clienthello_tlsext(SSL *s);
 int ssl_check_serverhello_tlsext(SSL *s);
 
+#ifndef OPENSSL_NO_HEARTBEATS
+int tls1_heartbeat(SSL *s);
+int dtls1_heartbeat(SSL *s);
+int tls1_process_heartbeat(SSL *s);
+int dtls1_process_heartbeat(SSL *s);
+#endif
+
 #ifdef OPENSSL_NO_SHA256
 #define tlsext_tick_md	EVP_sha1
 #else
@@ -1065,6 +1105,12 @@ int ssl_check_serverhello_tlsext(SSL *s);
 #endif
 int tls1_process_ticket(SSL *s, unsigned char *session_id, int len,
 				const unsigned char *limit, SSL_SESSION **ret);
+
+int tls12_get_sigandhash(unsigned char *p, const EVP_PKEY *pk,
+				const EVP_MD *md);
+int tls12_get_sigid(const EVP_PKEY *pk);
+const EVP_MD *tls12_get_hash(unsigned char hash_alg);
+
 #endif
 EVP_MD_CTX* ssl_replace_hash(EVP_MD_CTX **hash,const EVP_MD *md) ;
 void ssl_clear_hash_ctx(EVP_MD_CTX **hash);
@@ -1076,4 +1122,13 @@ int ssl_add_clienthello_renegotiate_ext(SSL *s, unsigned char *p, int *len,
 					int maxlen);
 int ssl_parse_clienthello_renegotiate_ext(SSL *s, unsigned char *d, int len,
 					  int *al);
+long ssl_get_algorithm2(SSL *s);
+int tls1_process_sigalgs(SSL *s, const unsigned char *data, int dsize);
+int tls12_get_req_sig_algs(SSL *s, unsigned char *p);
+
+int ssl_add_clienthello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen);
+int ssl_parse_clienthello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al);
+int ssl_add_serverhello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen);
+int ssl_parse_serverhello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al);
+
 #endif
diff --git a/src/lib/libssl/ssl_sess.c b/src/lib/libssl/ssl_sess.c
index 8e5d8a0972..ad40fadd02 100644
--- a/src/lib/libssl/ssl_sess.c
+++ b/src/lib/libssl/ssl_sess.c
@@ -217,6 +217,9 @@ SSL_SESSION *SSL_SESSION_new(void)
 #ifndef OPENSSL_NO_PSK
 	ss->psk_identity_hint=NULL;
 	ss->psk_identity=NULL;
+#endif
+#ifndef OPENSSL_NO_SRP
+	ss->srp_username=NULL;
 #endif
 	return(ss);
 	}
@@ -228,6 +231,11 @@ const unsigned char *SSL_SESSION_get_id(const SSL_SESSION *s, unsigned int *len)
 	return s->session_id;
 	}
 
+unsigned int SSL_SESSION_get_compress_id(const SSL_SESSION *s)
+	{
+	return s->compress_meth;
+	}
+
 /* Even with SSLv2, we have 16 bytes (128 bits) of session ID space. SSLv3/TLSv1
  * has 32 bytes (256 bits). As such, filling the ID with random gunk repeatedly
  * until we have no conflict is going to complete in one iteration pretty much
@@ -300,6 +308,16 @@ int ssl_get_new_session(SSL *s, int session)
 			ss->ssl_version=TLS1_VERSION;
 			ss->session_id_length=SSL3_SSL_SESSION_ID_LENGTH;
 			}
+		else if (s->version == TLS1_1_VERSION)
+			{
+			ss->ssl_version=TLS1_1_VERSION;
+			ss->session_id_length=SSL3_SSL_SESSION_ID_LENGTH;
+			}
+		else if (s->version == TLS1_2_VERSION)
+			{
+			ss->ssl_version=TLS1_2_VERSION;
+			ss->session_id_length=SSL3_SSL_SESSION_ID_LENGTH;
+			}
 		else if (s->version == DTLS1_BAD_VER)
 			{
 			ss->ssl_version=DTLS1_BAD_VER;
@@ -423,6 +441,25 @@ int ssl_get_new_session(SSL *s, int session)
 	return(1);
 	}
 
+/* ssl_get_prev attempts to find an SSL_SESSION to be used to resume this
+ * connection. It is only called by servers.
+ *
+ *   session_id: points at the session ID in the ClientHello. This code will
+ *       read past the end of this in order to parse out the session ticket
+ *       extension, if any.
+ *   len: the length of the session ID.
+ *   limit: a pointer to the first byte after the ClientHello.
+ *
+ * Returns:
+ *   -1: error
+ *    0: a session may have been found.
+ *
+ * Side effects:
+ *   - If a session is found then s->session is pointed at it (after freeing an
+ *     existing session if need be) and s->verify_result is set from the session.
+ *   - Both for new and resumed sessions, s->tlsext_ticket_expected is set to 1
+ *     if the server should issue a new session ticket (to 0 otherwise).
+ */
 int ssl_get_prev_session(SSL *s, unsigned char *session_id, int len,
 			const unsigned char *limit)
 	{
@@ -430,27 +467,39 @@ int ssl_get_prev_session(SSL *s, unsigned char *session_id, int len,
 
 	SSL_SESSION *ret=NULL;
 	int fatal = 0;
+	int try_session_cache = 1;
 #ifndef OPENSSL_NO_TLSEXT
 	int r;
 #endif
 
 	if (len > SSL_MAX_SSL_SESSION_ID_LENGTH)
 		goto err;
+
+	if (len == 0)
+		try_session_cache = 0;
+
 #ifndef OPENSSL_NO_TLSEXT
-	r = tls1_process_ticket(s, session_id, len, limit, &ret);
-	if (r == -1)
+	r = tls1_process_ticket(s, session_id, len, limit, &ret); /* sets s->tlsext_ticket_expected */
+	switch (r)
 		{
+	case -1: /* Error during processing */
 		fatal = 1;
 		goto err;
+	case 0: /* No ticket found */
+	case 1: /* Zero length ticket found */
+		break; /* Ok to carry on processing session id. */
+	case 2: /* Ticket found but not decrypted. */
+	case 3: /* Ticket decrypted, *ret has been set. */
+		try_session_cache = 0;
+		break;
+	default:
+		abort();
 		}
-	else if (r == 0 || (!ret && !len))
-		goto err;
-	else if (!ret && !(s->session_ctx->session_cache_mode & SSL_SESS_CACHE_NO_INTERNAL_LOOKUP))
-#else
-	if (len == 0)
-		goto err;
-	if (!(s->session_ctx->session_cache_mode & SSL_SESS_CACHE_NO_INTERNAL_LOOKUP))
 #endif
+
+	if (try_session_cache &&
+	    ret == NULL &&
+	    !(s->session_ctx->session_cache_mode & SSL_SESS_CACHE_NO_INTERNAL_LOOKUP))
 		{
 		SSL_SESSION data;
 		data.ssl_version=s->version;
@@ -461,20 +510,22 @@ int ssl_get_prev_session(SSL *s, unsigned char *session_id, int len,
 		CRYPTO_r_lock(CRYPTO_LOCK_SSL_CTX);
 		ret=lh_SSL_SESSION_retrieve(s->session_ctx->sessions,&data);
 		if (ret != NULL)
-		    /* don't allow other threads to steal it: */
-		    CRYPTO_add(&ret->references,1,CRYPTO_LOCK_SSL_SESSION);
+			{
+			/* don't allow other threads to steal it: */
+			CRYPTO_add(&ret->references,1,CRYPTO_LOCK_SSL_SESSION);
+			}
 		CRYPTO_r_unlock(CRYPTO_LOCK_SSL_CTX);
+		if (ret == NULL)
+			s->session_ctx->stats.sess_miss++;
 		}
 
-	if (ret == NULL)
+	if (try_session_cache &&
+	    ret == NULL &&
+	    s->session_ctx->get_session_cb != NULL)
 		{
 		int copy=1;
 	
-		s->session_ctx->stats.sess_miss++;
-		ret=NULL;
-		if (s->session_ctx->get_session_cb != NULL
-		    && (ret=s->session_ctx->get_session_cb(s,session_id,len,&copy))
-		       != NULL)
+		if ((ret=s->session_ctx->get_session_cb(s,session_id,len,&copy)))
 			{
 			s->session_ctx->stats.sess_cb_hit++;
 
@@ -493,23 +544,18 @@ int ssl_get_prev_session(SSL *s, unsigned char *session_id, int len,
 				 * things are very strange */
 				SSL_CTX_add_session(s->session_ctx,ret);
 			}
-		if (ret == NULL)
-			goto err;
 		}
 
-	/* Now ret is non-NULL, and we own one of its reference counts. */
+	if (ret == NULL)
+		goto err;
+
+	/* Now ret is non-NULL and we own one of its reference counts. */
 
 	if (ret->sid_ctx_length != s->sid_ctx_length
 	    || memcmp(ret->sid_ctx,s->sid_ctx,ret->sid_ctx_length))
 		{
-		/* We've found the session named by the client, but we don't
+		/* We have the session requested by the client, but we don't
 		 * want to use it in this context. */
-
-#if 0 /* The client cannot always know when a session is not appropriate,
-       * so we shouldn't generate an error message. */
-
-		SSLerr(SSL_F_SSL_GET_PREV_SESSION,SSL_R_ATTEMPT_TO_REUSE_SESSION_IN_DIFFERENT_CONTEXT);
-#endif
 		goto err; /* treat like cache miss */
 		}
 	
@@ -546,39 +592,38 @@ int ssl_get_prev_session(SSL *s, unsigned char *session_id, int len,
 			goto err;
 		}
 
-
-#if 0 /* This is way too late. */
-
-	/* If a thread got the session, then 'swaped', and another got
-	 * it and then due to a time-out decided to 'OPENSSL_free' it we could
-	 * be in trouble.  So I'll increment it now, then double decrement
-	 * later - am I speaking rubbish?. */
-	CRYPTO_add(&ret->references,1,CRYPTO_LOCK_SSL_SESSION);
-#endif
-
 	if (ret->timeout < (long)(time(NULL) - ret->time)) /* timeout */
 		{
 		s->session_ctx->stats.sess_timeout++;
-		/* remove it from the cache */
-		SSL_CTX_remove_session(s->session_ctx,ret);
+		if (try_session_cache)
+			{
+			/* session was from the cache, so remove it */
+			SSL_CTX_remove_session(s->session_ctx,ret);
+			}
 		goto err;
 		}
 
 	s->session_ctx->stats.sess_hit++;
 
-	/* ret->time=time(NULL); */ /* rezero timeout? */
-	/* again, just leave the session 
-	 * if it is the same session, we have just incremented and
-	 * then decremented the reference count :-) */
 	if (s->session != NULL)
 		SSL_SESSION_free(s->session);
 	s->session=ret;
 	s->verify_result = s->session->verify_result;
-	return(1);
+	return 1;
 
  err:
 	if (ret != NULL)
+		{
 		SSL_SESSION_free(ret);
+#ifndef OPENSSL_NO_TLSEXT
+		if (!try_session_cache)
+			{
+			/* The session was from a ticket, so we should
+			 * issue a ticket for the new session */
+			s->tlsext_ticket_expected = 1;
+			}
+#endif
+		}
 	if (fatal)
 		return -1;
 	else
@@ -728,6 +773,10 @@ void SSL_SESSION_free(SSL_SESSION *ss)
 		OPENSSL_free(ss->psk_identity_hint);
 	if (ss->psk_identity != NULL)
 		OPENSSL_free(ss->psk_identity);
+#endif
+#ifndef OPENSSL_NO_SRP
+	if (ss->srp_username != NULL)
+		OPENSSL_free(ss->srp_username);
 #endif
 	OPENSSL_cleanse(ss,sizeof(*ss));
 	OPENSSL_free(ss);
@@ -753,10 +802,6 @@ int SSL_set_session(SSL *s, SSL_SESSION *session)
 			{
 			if (!SSL_set_ssl_method(s,meth))
 				return(0);
-			if (s->ctx->session_timeout == 0)
-				session->timeout=SSL_get_default_timeout(s);
-			else
-				session->timeout=s->ctx->session_timeout;
 			}
 
 #ifndef OPENSSL_NO_KRB5
@@ -824,6 +869,25 @@ long SSL_SESSION_set_time(SSL_SESSION *s, long t)
 	return(t);
 	}
 
+X509 *SSL_SESSION_get0_peer(SSL_SESSION *s)
+	{
+	return s->peer;
+	}
+
+int SSL_SESSION_set1_id_context(SSL_SESSION *s,const unsigned char *sid_ctx,
+			       unsigned int sid_ctx_len)
+	{
+	if(sid_ctx_len > SSL_MAX_SID_CTX_LENGTH)
+		{
+		SSLerr(SSL_F_SSL_SESSION_SET1_ID_CONTEXT,SSL_R_SSL_SESSION_ID_CONTEXT_TOO_LONG);
+		return 0;
+		}
+	s->sid_ctx_length=sid_ctx_len;
+	memcpy(s->sid_ctx,sid_ctx,sid_ctx_len);
+
+	return 1;
+	}
+
 long SSL_CTX_set_timeout(SSL_CTX *s, long t)
 	{
 	long l;
diff --git a/src/lib/libssl/ssl_txt.c b/src/lib/libssl/ssl_txt.c
index 3122440e26..6479d52c0c 100644
--- a/src/lib/libssl/ssl_txt.c
+++ b/src/lib/libssl/ssl_txt.c
@@ -115,6 +115,10 @@ int SSL_SESSION_print(BIO *bp, const SSL_SESSION *x)
 		s="SSLv2";
 	else if (x->ssl_version == SSL3_VERSION)
 		s="SSLv3";
+	else if (x->ssl_version == TLS1_2_VERSION)
+		s="TLSv1.2";
+	else if (x->ssl_version == TLS1_1_VERSION)
+		s="TLSv1.1";
 	else if (x->ssl_version == TLS1_VERSION)
 		s="TLSv1";
 	else if (x->ssl_version == DTLS1_VERSION)
@@ -187,6 +191,10 @@ int SSL_SESSION_print(BIO *bp, const SSL_SESSION *x)
 	if (BIO_puts(bp,"\n    PSK identity hint: ") <= 0) goto err;
 	if (BIO_printf(bp, "%s", x->psk_identity_hint ? x->psk_identity_hint : "None") <= 0) goto err;
 #endif
+#ifndef OPENSSL_NO_SRP
+	if (BIO_puts(bp,"\n    SRP username: ") <= 0) goto err;
+	if (BIO_printf(bp, "%s", x->srp_username ? x->srp_username : "None") <= 0) goto err;
+#endif
 #ifndef OPENSSL_NO_TLSEXT
 	if (x->tlsext_tick_lifetime_hint)
 		{
diff --git a/src/lib/libssl/t1_clnt.c b/src/lib/libssl/t1_clnt.c
index c87af17712..578617ed84 100644
--- a/src/lib/libssl/t1_clnt.c
+++ b/src/lib/libssl/t1_clnt.c
@@ -66,13 +66,26 @@
 static const SSL_METHOD *tls1_get_client_method(int ver);
 static const SSL_METHOD *tls1_get_client_method(int ver)
 	{
+	if (ver == TLS1_2_VERSION)
+		return TLSv1_2_client_method();
+	if (ver == TLS1_1_VERSION)
+		return TLSv1_1_client_method();
 	if (ver == TLS1_VERSION)
-		return(TLSv1_client_method());
-	else
-		return(NULL);
+		return TLSv1_client_method();
+	return NULL;
 	}
 
-IMPLEMENT_tls1_meth_func(TLSv1_client_method,
+IMPLEMENT_tls_meth_func(TLS1_2_VERSION, TLSv1_2_client_method,
+			ssl_undefined_function,
+			ssl3_connect,
+			tls1_get_client_method)
+
+IMPLEMENT_tls_meth_func(TLS1_1_VERSION, TLSv1_1_client_method,
+			ssl_undefined_function,
+			ssl3_connect,
+			tls1_get_client_method)
+
+IMPLEMENT_tls_meth_func(TLS1_VERSION, TLSv1_client_method,
 			ssl_undefined_function,
 			ssl3_connect,
 			tls1_get_client_method)
diff --git a/src/lib/libssl/t1_enc.c b/src/lib/libssl/t1_enc.c
index 793ea43e90..f7bdeb3b9d 100644
--- a/src/lib/libssl/t1_enc.c
+++ b/src/lib/libssl/t1_enc.c
@@ -143,6 +143,7 @@
 #include <openssl/evp.h>
 #include <openssl/hmac.h>
 #include <openssl/md5.h>
+#include <openssl/rand.h>
 #ifdef KSSL_DEBUG
 #include <openssl/des.h>
 #endif
@@ -158,68 +159,75 @@ static int tls1_P_hash(const EVP_MD *md, const unsigned char *sec,
 			unsigned char *out, int olen)
 	{
 	int chunk;
-	unsigned int j;
-	HMAC_CTX ctx;
-	HMAC_CTX ctx_tmp;
+	size_t j;
+	EVP_MD_CTX ctx, ctx_tmp;
+	EVP_PKEY *mac_key;
 	unsigned char A1[EVP_MAX_MD_SIZE];
-	unsigned int A1_len;
+	size_t A1_len;
 	int ret = 0;
 	
 	chunk=EVP_MD_size(md);
 	OPENSSL_assert(chunk >= 0);
 
-	HMAC_CTX_init(&ctx);
-	HMAC_CTX_init(&ctx_tmp);
-	if (!HMAC_Init_ex(&ctx,sec,sec_len,md, NULL))
+	EVP_MD_CTX_init(&ctx);
+	EVP_MD_CTX_init(&ctx_tmp);
+	EVP_MD_CTX_set_flags(&ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+	EVP_MD_CTX_set_flags(&ctx_tmp, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW);
+	mac_key = EVP_PKEY_new_mac_key(EVP_PKEY_HMAC, NULL, sec, sec_len);
+	if (!mac_key)
+		goto err;
+	if (!EVP_DigestSignInit(&ctx,NULL,md, NULL, mac_key))
 		goto err;
-	if (!HMAC_Init_ex(&ctx_tmp,sec,sec_len,md, NULL))
+	if (!EVP_DigestSignInit(&ctx_tmp,NULL,md, NULL, mac_key))
 		goto err;
-	if (seed1 != NULL && !HMAC_Update(&ctx,seed1,seed1_len))
+	if (seed1 && !EVP_DigestSignUpdate(&ctx,seed1,seed1_len))
 		goto err;
-	if (seed2 != NULL && !HMAC_Update(&ctx,seed2,seed2_len))
+	if (seed2 && !EVP_DigestSignUpdate(&ctx,seed2,seed2_len))
 		goto err;
-	if (seed3 != NULL && !HMAC_Update(&ctx,seed3,seed3_len))
+	if (seed3 && !EVP_DigestSignUpdate(&ctx,seed3,seed3_len))
 		goto err;
-	if (seed4 != NULL && !HMAC_Update(&ctx,seed4,seed4_len))
+	if (seed4 && !EVP_DigestSignUpdate(&ctx,seed4,seed4_len))
 		goto err;
-	if (seed5 != NULL && !HMAC_Update(&ctx,seed5,seed5_len))
+	if (seed5 && !EVP_DigestSignUpdate(&ctx,seed5,seed5_len))
 		goto err;
-	if (!HMAC_Final(&ctx,A1,&A1_len))
+	if (!EVP_DigestSignFinal(&ctx,A1,&A1_len))
 		goto err;
 
 	for (;;)
 		{
-		if (!HMAC_Init_ex(&ctx,NULL,0,NULL,NULL)) /* re-init */
+		/* Reinit mac contexts */
+		if (!EVP_DigestSignInit(&ctx,NULL,md, NULL, mac_key))
 			goto err;
-		if (!HMAC_Init_ex(&ctx_tmp,NULL,0,NULL,NULL)) /* re-init */
+		if (!EVP_DigestSignInit(&ctx_tmp,NULL,md, NULL, mac_key))
 			goto err;
-		if (!HMAC_Update(&ctx,A1,A1_len))
+		if (!EVP_DigestSignUpdate(&ctx,A1,A1_len))
 			goto err;
-		if (!HMAC_Update(&ctx_tmp,A1,A1_len))
+		if (!EVP_DigestSignUpdate(&ctx_tmp,A1,A1_len))
 			goto err;
-		if (seed1 != NULL && !HMAC_Update(&ctx,seed1,seed1_len))
+		if (seed1 && !EVP_DigestSignUpdate(&ctx,seed1,seed1_len))
 			goto err;
-		if (seed2 != NULL && !HMAC_Update(&ctx,seed2,seed2_len))
+		if (seed2 && !EVP_DigestSignUpdate(&ctx,seed2,seed2_len))
 			goto err;
-		if (seed3 != NULL && !HMAC_Update(&ctx,seed3,seed3_len))
+		if (seed3 && !EVP_DigestSignUpdate(&ctx,seed3,seed3_len))
 			goto err;
-		if (seed4 != NULL && !HMAC_Update(&ctx,seed4,seed4_len))
+		if (seed4 && !EVP_DigestSignUpdate(&ctx,seed4,seed4_len))
 			goto err;
-		if (seed5 != NULL && !HMAC_Update(&ctx,seed5,seed5_len))
+		if (seed5 && !EVP_DigestSignUpdate(&ctx,seed5,seed5_len))
 			goto err;
 
 		if (olen > chunk)
 			{
-			if (!HMAC_Final(&ctx,out,&j))
+			if (!EVP_DigestSignFinal(&ctx,out,&j))
 				goto err;
 			out+=j;
 			olen-=j;
-			if (!HMAC_Final(&ctx_tmp,A1,&A1_len)) /* calc the next A1 value */
+			/* calc the next A1 value */
+			if (!EVP_DigestSignFinal(&ctx_tmp,A1,&A1_len))
 				goto err;
 			}
 		else	/* last one */
 			{
-			if (!HMAC_Final(&ctx,A1,&A1_len))
+			if (!EVP_DigestSignFinal(&ctx,A1,&A1_len))
 				goto err;
 			memcpy(out,A1,olen);
 			break;
@@ -227,8 +235,9 @@ static int tls1_P_hash(const EVP_MD *md, const unsigned char *sec,
 		}
 	ret = 1;
 err:
-	HMAC_CTX_cleanup(&ctx);
-	HMAC_CTX_cleanup(&ctx_tmp);
+	EVP_PKEY_free(mac_key);
+	EVP_MD_CTX_cleanup(&ctx);
+	EVP_MD_CTX_cleanup(&ctx_tmp);
 	OPENSSL_cleanse(A1,sizeof(A1));
 	return ret;
 	}
@@ -256,6 +265,8 @@ static int tls1_PRF(long digest_mask,
 		if ((m<<TLS1_PRF_DGST_SHIFT) & digest_mask) count++;
 	}	
 	len=slen/count;
+	if (count == 1)
+		slen = 0;
 	S1=sec;
 	memset(out1,0,olen);
 	for (idx=0;ssl_get_handshake_digest(idx,&m,&md);idx++) {
@@ -284,7 +295,7 @@ static int tls1_generate_key_block(SSL *s, unsigned char *km,
 	     unsigned char *tmp, int num)
 	{
 	int ret;
-	ret = tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+	ret = tls1_PRF(ssl_get_algorithm2(s),
 		 TLS_MD_KEY_EXPANSION_CONST,TLS_MD_KEY_EXPANSION_CONST_SIZE,
 		 s->s3->server_random,SSL3_RANDOM_SIZE,
 		 s->s3->client_random,SSL3_RANDOM_SIZE,
@@ -358,7 +369,7 @@ int tls1_change_cipher_state(SSL *s, int which)
 		{
 		if (s->s3->tmp.new_cipher->algorithm2 & TLS1_STREAM_MAC)
 			s->mac_flags |= SSL_MAC_FLAG_READ_MAC_STREAM;
-			else
+		else
 			s->mac_flags &= ~SSL_MAC_FLAG_READ_MAC_STREAM;
 
 		if (s->enc_read_ctx != NULL)
@@ -445,7 +456,11 @@ int tls1_change_cipher_state(SSL *s, int which)
 	j=is_export ? (cl < SSL_C_EXPORT_KEYLENGTH(s->s3->tmp.new_cipher) ?
 	               cl : SSL_C_EXPORT_KEYLENGTH(s->s3->tmp.new_cipher)) : cl;
 	/* Was j=(exp)?5:EVP_CIPHER_key_length(c); */
-	k=EVP_CIPHER_iv_length(c);
+	/* If GCM mode only part of IV comes from PRF */
+	if (EVP_CIPHER_mode(c) == EVP_CIPH_GCM_MODE)
+		k = EVP_GCM_TLS_FIXED_IV_LEN;
+	else
+		k=EVP_CIPHER_iv_length(c);
 	if (	(which == SSL3_CHANGE_CIPHER_CLIENT_WRITE) ||
 		(which == SSL3_CHANGE_CIPHER_SERVER_READ))
 		{
@@ -474,10 +489,14 @@ int tls1_change_cipher_state(SSL *s, int which)
 		}
 
 	memcpy(mac_secret,ms,i);
-	mac_key = EVP_PKEY_new_mac_key(mac_type, NULL,
-			mac_secret,*mac_secret_size);
-	EVP_DigestSignInit(mac_ctx,NULL,m,NULL,mac_key);
-	EVP_PKEY_free(mac_key);
+
+	if (!(EVP_CIPHER_flags(c)&EVP_CIPH_FLAG_AEAD_CIPHER))
+		{
+		mac_key = EVP_PKEY_new_mac_key(mac_type, NULL,
+				mac_secret,*mac_secret_size);
+		EVP_DigestSignInit(mac_ctx,NULL,m,NULL,mac_key);
+		EVP_PKEY_free(mac_key);
+		}
 #ifdef TLS_DEBUG
 printf("which = %04X\nmac key=",which);
 { int z; for (z=0; z<i; z++) printf("%02X%c",ms[z],((z+1)%16)?' ':'\n'); }
@@ -487,7 +506,7 @@ printf("which = %04X\nmac key=",which);
 		/* In here I set both the read and write key/iv to the
 		 * same value since only the correct one will be used :-).
 		 */
-		if (!tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+		if (!tls1_PRF(ssl_get_algorithm2(s),
 				exp_label,exp_label_len,
 				s->s3->client_random,SSL3_RANDOM_SIZE,
 				s->s3->server_random,SSL3_RANDOM_SIZE,
@@ -498,7 +517,7 @@ printf("which = %04X\nmac key=",which);
 
 		if (k > 0)
 			{
-			if (!tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+			if (!tls1_PRF(ssl_get_algorithm2(s),
 					TLS_MD_IV_BLOCK_CONST,TLS_MD_IV_BLOCK_CONST_SIZE,
 					s->s3->client_random,SSL3_RANDOM_SIZE,
 					s->s3->server_random,SSL3_RANDOM_SIZE,
@@ -524,7 +543,19 @@ printf("which = %04X\nmac key=",which);
 	}
 #endif	/* KSSL_DEBUG */
 
-	EVP_CipherInit_ex(dd,c,NULL,key,iv,(which & SSL3_CC_WRITE));
+	if (EVP_CIPHER_mode(c) == EVP_CIPH_GCM_MODE)
+		{
+		EVP_CipherInit_ex(dd,c,NULL,key,NULL,(which & SSL3_CC_WRITE));
+		EVP_CIPHER_CTX_ctrl(dd, EVP_CTRL_GCM_SET_IV_FIXED, k, iv);
+		}
+	else	
+		EVP_CipherInit_ex(dd,c,NULL,key,iv,(which & SSL3_CC_WRITE));
+
+	/* Needed for "composite" AEADs, such as RC4-HMAC-MD5 */
+	if ((EVP_CIPHER_flags(c)&EVP_CIPH_FLAG_AEAD_CIPHER) && *mac_secret_size)
+		EVP_CIPHER_CTX_ctrl(dd,EVP_CTRL_AEAD_SET_MAC_KEY,
+				*mac_secret_size,mac_secret);
+
 #ifdef TLS_DEBUG
 printf("which = %04X\nkey=",which);
 { int z; for (z=0; z<EVP_CIPHER_key_length(c); z++) printf("%02X%c",key[z],((z+1)%16)?' ':'\n'); }
@@ -606,7 +637,8 @@ printf("\nkey block\n");
 { int z; for (z=0; z<num; z++) printf("%02X%c",p1[z],((z+1)%16)?' ':'\n'); }
 #endif
 
-	if (!(s->options & SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS))
+	if (!(s->options & SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS)
+		&& s->method->version <= TLS1_VERSION)
 		{
 		/* enable vulnerability countermeasure for CBC ciphers with
 		 * known-IV problem (http://www.openssl.org/~bodo/tls-cbc.txt)
@@ -640,14 +672,14 @@ int tls1_enc(SSL *s, int send)
 	SSL3_RECORD *rec;
 	EVP_CIPHER_CTX *ds;
 	unsigned long l;
-	int bs,i,ii,j,k,n=0;
+	int bs,i,ii,j,k,pad=0;
 	const EVP_CIPHER *enc;
 
 	if (send)
 		{
 		if (EVP_MD_CTX_md(s->write_hash))
 			{
-			n=EVP_MD_CTX_size(s->write_hash);
+			int n=EVP_MD_CTX_size(s->write_hash);
 			OPENSSL_assert(n >= 0);
 			}
 		ds=s->enc_write_ctx;
@@ -655,13 +687,34 @@ int tls1_enc(SSL *s, int send)
 		if (s->enc_write_ctx == NULL)
 			enc=NULL;
 		else
+			{
+			int ivlen;
 			enc=EVP_CIPHER_CTX_cipher(s->enc_write_ctx);
+			/* For TLSv1.1 and later explicit IV */
+			if (s->version >= TLS1_1_VERSION
+				&& EVP_CIPHER_mode(enc) == EVP_CIPH_CBC_MODE)
+				ivlen = EVP_CIPHER_iv_length(enc);
+			else
+				ivlen = 0;
+			if (ivlen > 1)
+				{
+				if ( rec->data != rec->input)
+					/* we can't write into the input stream:
+					 * Can this ever happen?? (steve)
+					 */
+					fprintf(stderr,
+						"%s:%d: rec->data != rec->input\n",
+						__FILE__, __LINE__);
+				else if (RAND_bytes(rec->input, ivlen) <= 0)
+					return -1;
+				}
+			}
 		}
 	else
 		{
 		if (EVP_MD_CTX_md(s->read_hash))
 			{
-			n=EVP_MD_CTX_size(s->read_hash);
+			int n=EVP_MD_CTX_size(s->read_hash);
 			OPENSSL_assert(n >= 0);
 			}
 		ds=s->enc_read_ctx;
@@ -687,7 +740,43 @@ int tls1_enc(SSL *s, int send)
 		l=rec->length;
 		bs=EVP_CIPHER_block_size(ds->cipher);
 
-		if ((bs != 1) && send)
+		if (EVP_CIPHER_flags(ds->cipher)&EVP_CIPH_FLAG_AEAD_CIPHER)
+			{
+			unsigned char buf[13],*seq;
+
+			seq = send?s->s3->write_sequence:s->s3->read_sequence;
+
+			if (s->version == DTLS1_VERSION || s->version == DTLS1_BAD_VER)
+				{
+				unsigned char dtlsseq[9],*p=dtlsseq;
+
+				s2n(send?s->d1->w_epoch:s->d1->r_epoch,p);
+				memcpy(p,&seq[2],6);
+				memcpy(buf,dtlsseq,8);
+				}
+			else
+				{
+				memcpy(buf,seq,8);
+				for (i=7; i>=0; i--)	/* increment */
+					{
+					++seq[i];
+					if (seq[i] != 0) break; 
+					}
+				}
+
+			buf[8]=rec->type;
+			buf[9]=(unsigned char)(s->version>>8);
+			buf[10]=(unsigned char)(s->version);
+			buf[11]=rec->length>>8;
+			buf[12]=rec->length&0xff;
+			pad=EVP_CIPHER_CTX_ctrl(ds,EVP_CTRL_AEAD_TLS1_AAD,13,buf);
+			if (send)
+				{
+				l+=pad;
+				rec->length+=pad;
+				}
+			}
+		else if ((bs != 1) && send)
 			{
 			i=bs-((int)l%bs);
 
@@ -728,13 +817,25 @@ int tls1_enc(SSL *s, int send)
 			{
 			if (l == 0 || l%bs != 0)
 				{
+				if (s->version >= TLS1_1_VERSION)
+					return -1;
 				SSLerr(SSL_F_TLS1_ENC,SSL_R_BLOCK_CIPHER_PAD_IS_WRONG);
 				ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECRYPTION_FAILED);
 				return 0;
 				}
 			}
 		
-		EVP_Cipher(ds,rec->data,rec->input,l);
+		i = EVP_Cipher(ds,rec->data,rec->input,l);
+		if ((EVP_CIPHER_flags(ds->cipher)&EVP_CIPH_FLAG_CUSTOM_CIPHER)
+						?(i<0)
+						:(i==0))
+			return -1;	/* AEAD can fail to verify MAC */
+		if (EVP_CIPHER_mode(enc) == EVP_CIPH_GCM_MODE && !send)
+			{
+			rec->data += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+			rec->input += EVP_GCM_TLS_EXPLICIT_IV_LEN;
+			rec->length -= EVP_GCM_TLS_EXPLICIT_IV_LEN;
+			}
 
 #ifdef KSSL_DEBUG
 		{
@@ -784,8 +885,19 @@ int tls1_enc(SSL *s, int send)
 					return -1;
 					}
 				}
-			rec->length-=i;
+			rec->length -=i;
+			if (s->version >= TLS1_1_VERSION
+				&& EVP_CIPHER_CTX_mode(ds) == EVP_CIPH_CBC_MODE)
+				{
+				if (bs > (int)rec->length)
+					return -1;
+				rec->data += bs;    /* skip the explicit IV */
+				rec->input += bs;
+				rec->length -= bs;
+				}
 			}
+		if (pad && !send)
+			rec->length -= pad;
 		}
 	return(1);
 	}
@@ -841,7 +953,7 @@ int tls1_final_finish_mac(SSL *s,
 
 	for (idx=0;ssl_get_handshake_digest(idx,&mask,&md);idx++)
 		{
-		if (mask & s->s3->tmp.new_cipher->algorithm2)
+		if (mask & ssl_get_algorithm2(s))
 			{
 			int hashsize = EVP_MD_size(md);
 			if (hashsize < 0 || hashsize > (int)(sizeof buf - (size_t)(q-buf)))
@@ -860,7 +972,7 @@ int tls1_final_finish_mac(SSL *s,
 			}
 		}
 		
-	if (!tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+	if (!tls1_PRF(ssl_get_algorithm2(s),
 			str,slen, buf,(int)(q-buf), NULL,0, NULL,0, NULL,0,
 			s->session->master_key,s->session->master_key_length,
 			out,buf2,sizeof buf2))
@@ -970,6 +1082,7 @@ int tls1_generate_master_secret(SSL *s, unsigned char *out, unsigned char *p,
 	const void *co = NULL, *so = NULL;
 	int col = 0, sol = 0;
 
+
 #ifdef KSSL_DEBUG
 	printf ("tls1_generate_master_secret(%p,%p, %p, %d)\n", s,out, p,len);
 #endif	/* KSSL_DEBUG */
@@ -986,7 +1099,7 @@ int tls1_generate_master_secret(SSL *s, unsigned char *out, unsigned char *p,
 		}
 #endif
 
-	tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+	tls1_PRF(ssl_get_algorithm2(s),
 		TLS_MD_MASTER_SECRET_CONST,TLS_MD_MASTER_SECRET_CONST_SIZE,
 		s->s3->client_random,SSL3_RANDOM_SIZE,
 		co, col,
@@ -994,6 +1107,16 @@ int tls1_generate_master_secret(SSL *s, unsigned char *out, unsigned char *p,
 		so, sol,
 		p,len,
 		s->session->master_key,buff,sizeof buff);
+#ifdef SSL_DEBUG
+	fprintf(stderr, "Premaster Secret:\n");
+	BIO_dump_fp(stderr, (char *)p, len);
+	fprintf(stderr, "Client Random:\n");
+	BIO_dump_fp(stderr, (char *)s->s3->client_random, SSL3_RANDOM_SIZE);
+	fprintf(stderr, "Server Random:\n");
+	BIO_dump_fp(stderr, (char *)s->s3->server_random, SSL3_RANDOM_SIZE);
+	fprintf(stderr, "Master Secret:\n");
+	BIO_dump_fp(stderr, (char *)s->session->master_key, SSL3_MASTER_SECRET_SIZE);
+#endif
 
 #ifdef KSSL_DEBUG
 	printf ("tls1_generate_master_secret() complete\n");
@@ -1001,6 +1124,95 @@ int tls1_generate_master_secret(SSL *s, unsigned char *out, unsigned char *p,
 	return(SSL3_MASTER_SECRET_SIZE);
 	}
 
+int tls1_export_keying_material(SSL *s, unsigned char *out, size_t olen,
+	 const char *label, size_t llen, const unsigned char *context,
+	 size_t contextlen, int use_context)
+	{
+	unsigned char *buff;
+	unsigned char *val = NULL;
+	size_t vallen, currentvalpos;
+	int rv;
+
+#ifdef KSSL_DEBUG
+	printf ("tls1_export_keying_material(%p,%p,%d,%s,%d,%p,%d)\n", s, out, olen, label, llen, p, plen);
+#endif	/* KSSL_DEBUG */
+
+	buff = OPENSSL_malloc(olen);
+	if (buff == NULL) goto err2;
+
+	/* construct PRF arguments
+	 * we construct the PRF argument ourself rather than passing separate
+	 * values into the TLS PRF to ensure that the concatenation of values
+	 * does not create a prohibited label.
+	 */
+	vallen = llen + SSL3_RANDOM_SIZE * 2;
+	if (use_context)
+		{
+		vallen += 2 + contextlen;
+		}
+
+	val = OPENSSL_malloc(vallen);
+	if (val == NULL) goto err2;
+	currentvalpos = 0;
+	memcpy(val + currentvalpos, (unsigned char *) label, llen);
+	currentvalpos += llen;
+	memcpy(val + currentvalpos, s->s3->client_random, SSL3_RANDOM_SIZE);
+	currentvalpos += SSL3_RANDOM_SIZE;
+	memcpy(val + currentvalpos, s->s3->server_random, SSL3_RANDOM_SIZE);
+	currentvalpos += SSL3_RANDOM_SIZE;
+
+	if (use_context)
+		{
+		val[currentvalpos] = (contextlen >> 8) & 0xff;
+		currentvalpos++;
+		val[currentvalpos] = contextlen & 0xff;
+		currentvalpos++;
+		if ((contextlen > 0) || (context != NULL))
+			{
+			memcpy(val + currentvalpos, context, contextlen);
+			}
+		}
+
+	/* disallow prohibited labels
+	 * note that SSL3_RANDOM_SIZE > max(prohibited label len) =
+	 * 15, so size of val > max(prohibited label len) = 15 and the
+	 * comparisons won't have buffer overflow
+	 */
+	if (memcmp(val, TLS_MD_CLIENT_FINISH_CONST,
+		 TLS_MD_CLIENT_FINISH_CONST_SIZE) == 0) goto err1;
+	if (memcmp(val, TLS_MD_SERVER_FINISH_CONST,
+		 TLS_MD_SERVER_FINISH_CONST_SIZE) == 0) goto err1;
+	if (memcmp(val, TLS_MD_MASTER_SECRET_CONST,
+		 TLS_MD_MASTER_SECRET_CONST_SIZE) == 0) goto err1;
+	if (memcmp(val, TLS_MD_KEY_EXPANSION_CONST,
+		 TLS_MD_KEY_EXPANSION_CONST_SIZE) == 0) goto err1;
+
+	rv = tls1_PRF(s->s3->tmp.new_cipher->algorithm2,
+		      val, vallen,
+		      NULL, 0,
+		      NULL, 0,
+		      NULL, 0,
+		      NULL, 0,
+		      s->session->master_key,s->session->master_key_length,
+		      out,buff,olen);
+
+#ifdef KSSL_DEBUG
+	printf ("tls1_export_keying_material() complete\n");
+#endif	/* KSSL_DEBUG */
+	goto ret;
+err1:
+	SSLerr(SSL_F_TLS1_EXPORT_KEYING_MATERIAL, SSL_R_TLS_ILLEGAL_EXPORTER_LABEL);
+	rv = 0;
+	goto ret;
+err2:
+	SSLerr(SSL_F_TLS1_EXPORT_KEYING_MATERIAL, ERR_R_MALLOC_FAILURE);
+	rv = 0;
+ret:
+	if (buff != NULL) OPENSSL_free(buff);
+	if (val != NULL) OPENSSL_free(val);
+	return(rv);
+	}
+
 int tls1_alert_code(int code)
 	{
 	switch (code)
@@ -1042,4 +1254,3 @@ int tls1_alert_code(int code)
 	default:			return(-1);
 		}
 	}
-
diff --git a/src/lib/libssl/t1_lib.c b/src/lib/libssl/t1_lib.c
index 26cbae449e..27c8e3460d 100644
--- a/src/lib/libssl/t1_lib.c
+++ b/src/lib/libssl/t1_lib.c
@@ -114,6 +114,7 @@
 #include <openssl/evp.h>
 #include <openssl/hmac.h>
 #include <openssl/ocsp.h>
+#include <openssl/rand.h>
 #include "ssl_locl.h"
 
 const char tls1_version_str[]="TLSv1" OPENSSL_VERSION_PTEXT;
@@ -136,6 +137,7 @@ SSL3_ENC_METHOD TLSv1_enc_data={
 	TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE,
 	TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE,
 	tls1_alert_code,
+	tls1_export_keying_material,
 	};
 
 long tls1_default_timeout(void)
@@ -166,10 +168,11 @@ void tls1_free(SSL *s)
 void tls1_clear(SSL *s)
 	{
 	ssl3_clear(s);
-	s->version=TLS1_VERSION;
+	s->version = s->method->version;
 	}
 
 #ifndef OPENSSL_NO_EC
+
 static int nid_list[] =
 	{
 		NID_sect163k1, /* sect163k1 (1) */
@@ -198,7 +201,36 @@ static int nid_list[] =
 		NID_secp384r1, /* secp384r1 (24) */
 		NID_secp521r1  /* secp521r1 (25) */	
 	};
-	
+
+static int pref_list[] =
+	{
+		NID_sect571r1, /* sect571r1 (14) */ 
+		NID_sect571k1, /* sect571k1 (13) */ 
+		NID_secp521r1, /* secp521r1 (25) */	
+		NID_sect409k1, /* sect409k1 (11) */ 
+		NID_sect409r1, /* sect409r1 (12) */
+		NID_secp384r1, /* secp384r1 (24) */
+		NID_sect283k1, /* sect283k1 (9) */
+		NID_sect283r1, /* sect283r1 (10) */ 
+		NID_secp256k1, /* secp256k1 (22) */ 
+		NID_X9_62_prime256v1, /* secp256r1 (23) */ 
+		NID_sect239k1, /* sect239k1 (8) */ 
+		NID_sect233k1, /* sect233k1 (6) */
+		NID_sect233r1, /* sect233r1 (7) */ 
+		NID_secp224k1, /* secp224k1 (20) */ 
+		NID_secp224r1, /* secp224r1 (21) */
+		NID_sect193r1, /* sect193r1 (4) */ 
+		NID_sect193r2, /* sect193r2 (5) */ 
+		NID_secp192k1, /* secp192k1 (18) */
+		NID_X9_62_prime192v1, /* secp192r1 (19) */ 
+		NID_sect163k1, /* sect163k1 (1) */
+		NID_sect163r1, /* sect163r1 (2) */
+		NID_sect163r2, /* sect163r2 (3) */
+		NID_secp160k1, /* secp160k1 (15) */
+		NID_secp160r1, /* secp160r1 (16) */ 
+		NID_secp160r2, /* secp160r2 (17) */ 
+	};
+
 int tls1_ec_curve_id2nid(int curve_id)
 	{
 	/* ECC curves from draft-ietf-tls-ecc-12.txt (Oct. 17, 2005) */
@@ -270,6 +302,64 @@ int tls1_ec_nid2curve_id(int nid)
 #endif /* OPENSSL_NO_EC */
 
 #ifndef OPENSSL_NO_TLSEXT
+
+/* List of supported signature algorithms and hashes. Should make this
+ * customisable at some point, for now include everything we support.
+ */
+
+#ifdef OPENSSL_NO_RSA
+#define tlsext_sigalg_rsa(md) /* */
+#else
+#define tlsext_sigalg_rsa(md) md, TLSEXT_signature_rsa,
+#endif
+
+#ifdef OPENSSL_NO_DSA
+#define tlsext_sigalg_dsa(md) /* */
+#else
+#define tlsext_sigalg_dsa(md) md, TLSEXT_signature_dsa,
+#endif
+
+#ifdef OPENSSL_NO_ECDSA
+#define tlsext_sigalg_ecdsa(md) /* */
+#else
+#define tlsext_sigalg_ecdsa(md) md, TLSEXT_signature_ecdsa,
+#endif
+
+#define tlsext_sigalg(md) \
+		tlsext_sigalg_rsa(md) \
+		tlsext_sigalg_dsa(md) \
+		tlsext_sigalg_ecdsa(md)
+
+static unsigned char tls12_sigalgs[] = {
+#ifndef OPENSSL_NO_SHA512
+	tlsext_sigalg(TLSEXT_hash_sha512)
+	tlsext_sigalg(TLSEXT_hash_sha384)
+#endif
+#ifndef OPENSSL_NO_SHA256
+	tlsext_sigalg(TLSEXT_hash_sha256)
+	tlsext_sigalg(TLSEXT_hash_sha224)
+#endif
+#ifndef OPENSSL_NO_SHA
+	tlsext_sigalg(TLSEXT_hash_sha1)
+#endif
+#ifndef OPENSSL_NO_MD5
+	tlsext_sigalg_rsa(TLSEXT_hash_md5)
+#endif
+};
+
+int tls12_get_req_sig_algs(SSL *s, unsigned char *p)
+	{
+	size_t slen = sizeof(tls12_sigalgs);
+#ifdef OPENSSL_FIPS
+	/* If FIPS mode don't include MD5 which is last */
+	if (FIPS_mode())
+		slen -= 2;
+#endif
+	if (p)
+		memcpy(p, tls12_sigalgs, slen);
+	return (int)slen;
+	}
+
 unsigned char *ssl_add_clienthello_tlsext(SSL *s, unsigned char *p, unsigned char *limit)
 	{
 	int extdatalen=0;
@@ -317,7 +407,7 @@ unsigned char *ssl_add_clienthello_tlsext(SSL *s, unsigned char *p, unsigned cha
 		}
 
         /* Add RI if renegotiating */
-        if (s->new_session)
+        if (s->renegotiate)
           {
           int el;
           
@@ -341,6 +431,34 @@ unsigned char *ssl_add_clienthello_tlsext(SSL *s, unsigned char *p, unsigned cha
           ret += el;
         }
 
+#ifndef OPENSSL_NO_SRP
+	/* Add SRP username if there is one */
+	if (s->srp_ctx.login != NULL)
+		{ /* Add TLS extension SRP username to the Client Hello message */
+
+		int login_len = strlen(s->srp_ctx.login);	
+		if (login_len > 255 || login_len == 0)
+			{
+			SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT, ERR_R_INTERNAL_ERROR);
+			return NULL;
+			} 
+
+		/* check for enough space.
+		   4 for the srp type type and entension length
+		   1 for the srp user identity
+		   + srp user identity length 
+		*/
+		if ((limit - ret - 5 - login_len) < 0) return NULL; 
+
+		/* fill in the extension */
+		s2n(TLSEXT_TYPE_srp,ret);
+		s2n(login_len+1,ret);
+		(*ret++) = (unsigned char) login_len;
+		memcpy(ret, s->srp_ctx.login, login_len);
+		ret+=login_len;
+		}
+#endif
+
 #ifndef OPENSSL_NO_EC
 	if (s->tlsext_ecpointformatlist != NULL &&
 	    s->version != DTLS1_VERSION)
@@ -426,6 +544,17 @@ unsigned char *ssl_add_clienthello_tlsext(SSL *s, unsigned char *p, unsigned cha
 		}
 		skip_ext:
 
+	if (TLS1_get_client_version(s) >= TLS1_2_VERSION)
+		{
+		if ((size_t)(limit - ret) < sizeof(tls12_sigalgs) + 6)
+			return NULL; 
+		s2n(TLSEXT_TYPE_signature_algorithms,ret);
+		s2n(sizeof(tls12_sigalgs) + 2, ret);
+		s2n(sizeof(tls12_sigalgs), ret);
+		memcpy(ret, tls12_sigalgs, sizeof(tls12_sigalgs));
+		ret += sizeof(tls12_sigalgs);
+		}
+
 #ifdef TLSEXT_TYPE_opaque_prf_input
 	if (s->s3->client_opaque_prf_input != NULL &&
 	    s->version != DTLS1_VERSION)
@@ -494,6 +623,51 @@ unsigned char *ssl_add_clienthello_tlsext(SSL *s, unsigned char *p, unsigned cha
 			i2d_X509_EXTENSIONS(s->tlsext_ocsp_exts, &ret);
 		}
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* Add Heartbeat extension */
+	s2n(TLSEXT_TYPE_heartbeat,ret);
+	s2n(1,ret);
+	/* Set mode:
+	 * 1: peer may send requests
+	 * 2: peer not allowed to send requests
+	 */
+	if (s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_RECV_REQUESTS)
+		*(ret++) = SSL_TLSEXT_HB_DONT_SEND_REQUESTS;
+	else
+		*(ret++) = SSL_TLSEXT_HB_ENABLED;
+#endif
+
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	if (s->ctx->next_proto_select_cb && !s->s3->tmp.finish_md_len)
+		{
+		/* The client advertises an emtpy extension to indicate its
+		 * support for Next Protocol Negotiation */
+		if (limit - ret - 4 < 0)
+			return NULL;
+		s2n(TLSEXT_TYPE_next_proto_neg,ret);
+		s2n(0,ret);
+		}
+#endif
+
+        if(SSL_get_srtp_profiles(s))
+                {
+                int el;
+
+                ssl_add_clienthello_use_srtp_ext(s, 0, &el, 0);
+                
+                if((limit - p - 4 - el) < 0) return NULL;
+
+                s2n(TLSEXT_TYPE_use_srtp,ret);
+                s2n(el,ret);
+
+                if(ssl_add_clienthello_use_srtp_ext(s, ret, &el, el))
+			{
+			SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT, ERR_R_INTERNAL_ERROR);
+			return NULL;
+			}
+                ret += el;
+                }
+
 	if ((extdatalen = ret-p-2)== 0) 
 		return p;
 
@@ -505,6 +679,9 @@ unsigned char *ssl_add_serverhello_tlsext(SSL *s, unsigned char *p, unsigned cha
 	{
 	int extdatalen=0;
 	unsigned char *ret = p;
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	int next_proto_neg_seen;
+#endif
 
 	/* don't add extensions for SSLv3, unless doing secure renegotiation */
 	if (s->version == SSL3_VERSION && !s->s3->send_connection_binding)
@@ -603,6 +780,26 @@ unsigned char *ssl_add_serverhello_tlsext(SSL *s, unsigned char *p, unsigned cha
 		ret += sol;
 		}
 #endif
+
+        if(s->srtp_profile)
+                {
+                int el;
+
+                ssl_add_serverhello_use_srtp_ext(s, 0, &el, 0);
+                
+                if((limit - p - 4 - el) < 0) return NULL;
+
+                s2n(TLSEXT_TYPE_use_srtp,ret);
+                s2n(el,ret);
+
+                if(ssl_add_serverhello_use_srtp_ext(s, ret, &el, el))
+			{
+			SSLerr(SSL_F_SSL_ADD_SERVERHELLO_TLSEXT, ERR_R_INTERNAL_ERROR);
+			return NULL;
+			}
+                ret+=el;
+                }
+
 	if (((s->s3->tmp.new_cipher->id & 0xFFFF)==0x80 || (s->s3->tmp.new_cipher->id & 0xFFFF)==0x81) 
 		&& (SSL_get_options(s) & SSL_OP_CRYPTOPRO_TLSEXT_BUG))
 		{ const unsigned char cryptopro_ext[36] = {
@@ -618,6 +815,46 @@ unsigned char *ssl_add_serverhello_tlsext(SSL *s, unsigned char *p, unsigned cha
 
 		}
 
+#ifndef OPENSSL_NO_HEARTBEATS
+	/* Add Heartbeat extension if we've received one */
+	if (s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED)
+		{
+		s2n(TLSEXT_TYPE_heartbeat,ret);
+		s2n(1,ret);
+		/* Set mode:
+		 * 1: peer may send requests
+		 * 2: peer not allowed to send requests
+		 */
+		if (s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_RECV_REQUESTS)
+			*(ret++) = SSL_TLSEXT_HB_DONT_SEND_REQUESTS;
+		else
+			*(ret++) = SSL_TLSEXT_HB_ENABLED;
+
+		}
+#endif
+
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	next_proto_neg_seen = s->s3->next_proto_neg_seen;
+	s->s3->next_proto_neg_seen = 0;
+	if (next_proto_neg_seen && s->ctx->next_protos_advertised_cb)
+		{
+		const unsigned char *npa;
+		unsigned int npalen;
+		int r;
+
+		r = s->ctx->next_protos_advertised_cb(s, &npa, &npalen, s->ctx->next_protos_advertised_cb_arg);
+		if (r == SSL_TLSEXT_ERR_OK)
+			{
+			if ((long)(limit - ret - 4 - npalen) < 0) return NULL;
+			s2n(TLSEXT_TYPE_next_proto_neg,ret);
+			s2n(npalen,ret);
+			memcpy(ret, npa, npalen);
+			ret += npalen;
+			s->s3->next_proto_neg_seen = 1;
+			}
+		}
+#endif
+
 	if ((extdatalen = ret-p-2)== 0) 
 		return p;
 
@@ -632,9 +869,18 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in
 	unsigned short len;
 	unsigned char *data = *p;
 	int renegotiate_seen = 0;
+	int sigalg_seen = 0;
 
 	s->servername_done = 0;
 	s->tlsext_status_type = -1;
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	s->s3->next_proto_neg_seen = 0;
+#endif
+
+#ifndef OPENSSL_NO_HEARTBEATS
+	s->tlsext_heartbeat &= ~(SSL_TLSEXT_HB_ENABLED |
+	                       SSL_TLSEXT_HB_DONT_SEND_REQUESTS);
+#endif
 
 	if (data >= (d+n-2))
 		goto ri_check;
@@ -762,6 +1008,31 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in
 				}
 
 			}
+#ifndef OPENSSL_NO_SRP
+		else if (type == TLSEXT_TYPE_srp)
+			{
+			if (size <= 0 || ((len = data[0])) != (size -1))
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			if (s->srp_ctx.login != NULL)
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			if ((s->srp_ctx.login = OPENSSL_malloc(len+1)) == NULL)
+				return -1;
+			memcpy(s->srp_ctx.login, &data[1], len);
+			s->srp_ctx.login[len]='\0';
+  
+			if (strlen(s->srp_ctx.login) != len) 
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			}
+#endif
 
 #ifndef OPENSSL_NO_EC
 		else if (type == TLSEXT_TYPE_ec_point_formats &&
@@ -882,6 +1153,28 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in
 				return 0;
 			renegotiate_seen = 1;
 			}
+		else if (type == TLSEXT_TYPE_signature_algorithms)
+			{
+			int dsize;
+			if (sigalg_seen || size < 2) 
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			sigalg_seen = 1;
+			n2s(data,dsize);
+			size -= 2;
+			if (dsize != size || dsize & 1) 
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			if (!tls1_process_sigalgs(s, data, dsize))
+				{
+				*al = SSL_AD_DECODE_ERROR;
+				return 0;
+				}
+			}
 		else if (type == TLSEXT_TYPE_status_request &&
 		         s->version != DTLS1_VERSION && s->ctx->tlsext_status_cb)
 			{
@@ -994,8 +1287,54 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in
 				else
 					s->tlsext_status_type = -1;
 			}
+#ifndef OPENSSL_NO_HEARTBEATS
+		else if (type == TLSEXT_TYPE_heartbeat)
+			{
+			switch(data[0])
+				{
+				case 0x01:	/* Client allows us to send HB requests */
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED;
+							break;
+				case 0x02:	/* Client doesn't accept HB requests */
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED;
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_DONT_SEND_REQUESTS;
+							break;
+				default:	*al = SSL_AD_ILLEGAL_PARAMETER;
+							return 0;
+				}
+			}
+#endif
+#ifndef OPENSSL_NO_NEXTPROTONEG
+		else if (type == TLSEXT_TYPE_next_proto_neg &&
+			 s->s3->tmp.finish_md_len == 0)
+			{
+			/* We shouldn't accept this extension on a
+			 * renegotiation.
+			 *
+			 * s->new_session will be set on renegotiation, but we
+			 * probably shouldn't rely that it couldn't be set on
+			 * the initial renegotation too in certain cases (when
+			 * there's some other reason to disallow resuming an
+			 * earlier session -- the current code won't be doing
+			 * anything like that, but this might change).
+
+			 * A valid sign that there's been a previous handshake
+			 * in this connection is if s->s3->tmp.finish_md_len >
+			 * 0.  (We are talking about a check that will happen
+			 * in the Hello protocol round, well before a new
+			 * Finished message could have been computed.) */
+			s->s3->next_proto_neg_seen = 1;
+			}
+#endif
 
 		/* session ticket processed earlier */
+		else if (type == TLSEXT_TYPE_use_srtp)
+                        {
+			if(ssl_parse_clienthello_use_srtp_ext(s, data, size,
+							      al))
+				return 0;
+                        }
+
 		data+=size;
 		}
 				
@@ -1005,7 +1344,7 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in
 
 	/* Need RI if renegotiating */
 
-	if (!renegotiate_seen && s->new_session &&
+	if (!renegotiate_seen && s->renegotiate &&
 		!(s->options & SSL_OP_ALLOW_UNSAFE_LEGACY_RENEGOTIATION))
 		{
 		*al = SSL_AD_HANDSHAKE_FAILURE;
@@ -1017,6 +1356,26 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in
 	return 1;
 	}
 
+#ifndef OPENSSL_NO_NEXTPROTONEG
+/* ssl_next_proto_validate validates a Next Protocol Negotiation block. No
+ * elements of zero length are allowed and the set of elements must exactly fill
+ * the length of the block. */
+static char ssl_next_proto_validate(unsigned char *d, unsigned len)
+	{
+	unsigned int off = 0;
+
+	while (off < len)
+		{
+		if (d[off] == 0)
+			return 0;
+		off += d[off];
+		off++;
+		}
+
+	return off == len;
+	}
+#endif
+
 int ssl_parse_serverhello_tlsext(SSL *s, unsigned char **p, unsigned char *d, int n, int *al)
 	{
 	unsigned short length;
@@ -1026,6 +1385,15 @@ int ssl_parse_serverhello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in
 	int tlsext_servername = 0;
 	int renegotiate_seen = 0;
 
+#ifndef OPENSSL_NO_NEXTPROTONEG
+	s->s3->next_proto_neg_seen = 0;
+#endif
+
+#ifndef OPENSSL_NO_HEARTBEATS
+	s->tlsext_heartbeat &= ~(SSL_TLSEXT_HB_ENABLED |
+	                       SSL_TLSEXT_HB_DONT_SEND_REQUESTS);
+#endif
+
 	if (data >= (d+n-2))
 		goto ri_check;
 
@@ -1151,12 +1519,71 @@ int ssl_parse_serverhello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in
 			/* Set flag to expect CertificateStatus message */
 			s->tlsext_status_expected = 1;
 			}
+#ifndef OPENSSL_NO_NEXTPROTONEG
+		else if (type == TLSEXT_TYPE_next_proto_neg &&
+			 s->s3->tmp.finish_md_len == 0)
+			{
+			unsigned char *selected;
+			unsigned char selected_len;
+
+			/* We must have requested it. */
+			if ((s->ctx->next_proto_select_cb == NULL))
+				{
+				*al = TLS1_AD_UNSUPPORTED_EXTENSION;
+				return 0;
+				}
+			/* The data must be valid */
+			if (!ssl_next_proto_validate(data, size))
+				{
+				*al = TLS1_AD_DECODE_ERROR;
+				return 0;
+				}
+			if (s->ctx->next_proto_select_cb(s, &selected, &selected_len, data, size, s->ctx->next_proto_select_cb_arg) != SSL_TLSEXT_ERR_OK)
+				{
+				*al = TLS1_AD_INTERNAL_ERROR;
+				return 0;
+				}
+			s->next_proto_negotiated = OPENSSL_malloc(selected_len);
+			if (!s->next_proto_negotiated)
+				{
+				*al = TLS1_AD_INTERNAL_ERROR;
+				return 0;
+				}
+			memcpy(s->next_proto_negotiated, selected, selected_len);
+			s->next_proto_negotiated_len = selected_len;
+			s->s3->next_proto_neg_seen = 1;
+			}
+#endif
 		else if (type == TLSEXT_TYPE_renegotiate)
 			{
 			if(!ssl_parse_serverhello_renegotiate_ext(s, data, size, al))
 				return 0;
 			renegotiate_seen = 1;
 			}
+#ifndef OPENSSL_NO_HEARTBEATS
+		else if (type == TLSEXT_TYPE_heartbeat)
+			{
+			switch(data[0])
+				{
+				case 0x01:	/* Server allows us to send HB requests */
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED;
+							break;
+				case 0x02:	/* Server doesn't accept HB requests */
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED;
+							s->tlsext_heartbeat |= SSL_TLSEXT_HB_DONT_SEND_REQUESTS;
+							break;
+				default:	*al = SSL_AD_ILLEGAL_PARAMETER;
+							return 0;
+				}
+			}
+#endif
+		else if (type == TLSEXT_TYPE_use_srtp)
+                        {
+                        if(ssl_parse_serverhello_use_srtp_ext(s, data, size,
+							      al))
+                                return 0;
+                        }
+
 		data+=size;		
 		}
 
@@ -1236,7 +1663,7 @@ int ssl_prepare_clienthello_tlsext(SSL *s)
 			break;
 			}
 		}
-	using_ecc = using_ecc && (s->version == TLS1_VERSION);
+	using_ecc = using_ecc && (s->version >= TLS1_VERSION);
 	if (using_ecc)
 		{
 		if (s->tlsext_ecpointformatlist != NULL) OPENSSL_free(s->tlsext_ecpointformatlist);
@@ -1252,16 +1679,19 @@ int ssl_prepare_clienthello_tlsext(SSL *s)
 
 		/* we support all named elliptic curves in draft-ietf-tls-ecc-12 */
 		if (s->tlsext_ellipticcurvelist != NULL) OPENSSL_free(s->tlsext_ellipticcurvelist);
-		s->tlsext_ellipticcurvelist_length = sizeof(nid_list)/sizeof(nid_list[0]) * 2;
+		s->tlsext_ellipticcurvelist_length = sizeof(pref_list)/sizeof(pref_list[0]) * 2;
 		if ((s->tlsext_ellipticcurvelist = OPENSSL_malloc(s->tlsext_ellipticcurvelist_length)) == NULL)
 			{
 			s->tlsext_ellipticcurvelist_length = 0;
 			SSLerr(SSL_F_SSL_PREPARE_CLIENTHELLO_TLSEXT,ERR_R_MALLOC_FAILURE);
 			return -1;
 			}
-		for (i = 1, j = s->tlsext_ellipticcurvelist; (unsigned int)i <=
-				sizeof(nid_list)/sizeof(nid_list[0]); i++)
-			s2n(i,j);
+		for (i = 0, j = s->tlsext_ellipticcurvelist; (unsigned int)i <
+				sizeof(pref_list)/sizeof(pref_list[0]); i++)
+			{
+			int id = tls1_ec_nid2curve_id(pref_list[i]);
+			s2n(id,j);
+			}
 		}
 #endif /* OPENSSL_NO_EC */
 
@@ -1570,26 +2000,56 @@ int ssl_check_serverhello_tlsext(SSL *s)
 		}
 	}
 
-/* Since the server cache lookup is done early on in the processing of client
- * hello and other operations depend on the result we need to handle any TLS
- * session ticket extension at the same time.
+/* Since the server cache lookup is done early on in the processing of the
+ * ClientHello, and other operations depend on the result, we need to handle
+ * any TLS session ticket extension at the same time.
+ *
+ *   session_id: points at the session ID in the ClientHello. This code will
+ *       read past the end of this in order to parse out the session ticket
+ *       extension, if any.
+ *   len: the length of the session ID.
+ *   limit: a pointer to the first byte after the ClientHello.
+ *   ret: (output) on return, if a ticket was decrypted, then this is set to
+ *       point to the resulting session.
+ *
+ * If s->tls_session_secret_cb is set then we are expecting a pre-shared key
+ * ciphersuite, in which case we have no use for session tickets and one will
+ * never be decrypted, nor will s->tlsext_ticket_expected be set to 1.
+ *
+ * Returns:
+ *   -1: fatal error, either from parsing or decrypting the ticket.
+ *    0: no ticket was found (or was ignored, based on settings).
+ *    1: a zero length extension was found, indicating that the client supports
+ *       session tickets but doesn't currently have one to offer.
+ *    2: either s->tls_session_secret_cb was set, or a ticket was offered but
+ *       couldn't be decrypted because of a non-fatal error.
+ *    3: a ticket was successfully decrypted and *ret was set.
+ *
+ * Side effects:
+ *   Sets s->tlsext_ticket_expected to 1 if the server will have to issue
+ *   a new session ticket to the client because the client indicated support
+ *   (and s->tls_session_secret_cb is NULL) but the client either doesn't have
+ *   a session ticket or we couldn't use the one it gave us, or if
+ *   s->ctx->tlsext_ticket_key_cb asked to renew the client's ticket.
+ *   Otherwise, s->tlsext_ticket_expected is set to 0.
  */
-
 int tls1_process_ticket(SSL *s, unsigned char *session_id, int len,
-				const unsigned char *limit, SSL_SESSION **ret)
+			const unsigned char *limit, SSL_SESSION **ret)
 	{
 	/* Point after session ID in client hello */
 	const unsigned char *p = session_id + len;
 	unsigned short i;
 
+	*ret = NULL;
+	s->tlsext_ticket_expected = 0;
+
 	/* If tickets disabled behave as if no ticket present
- 	 * to permit stateful resumption.
- 	 */
+	 * to permit stateful resumption.
+	 */
 	if (SSL_get_options(s) & SSL_OP_NO_TICKET)
-		return 1;
-
+		return 0;
 	if ((s->version <= SSL3_VERSION) || !limit)
-		return 1;
+		return 0;
 	if (p >= limit)
 		return -1;
 	/* Skip past DTLS cookie */
@@ -1612,7 +2072,7 @@ int tls1_process_ticket(SSL *s, unsigned char *session_id, int len,
 		return -1;
 	/* Now at start of extensions */
 	if ((p + 2) >= limit)
-		return 1;
+		return 0;
 	n2s(p, i);
 	while ((p + 4) <= limit)
 		{
@@ -1620,39 +2080,61 @@ int tls1_process_ticket(SSL *s, unsigned char *session_id, int len,
 		n2s(p, type);
 		n2s(p, size);
 		if (p + size > limit)
-			return 1;
+			return 0;
 		if (type == TLSEXT_TYPE_session_ticket)
 			{
-			/* If tickets disabled indicate cache miss which will
- 			 * trigger a full handshake
- 			 */
-			if (SSL_get_options(s) & SSL_OP_NO_TICKET)
-				return 1;
-			/* If zero length note client will accept a ticket
- 			 * and indicate cache miss to trigger full handshake
- 			 */
+			int r;
 			if (size == 0)
 				{
+				/* The client will accept a ticket but doesn't
+				 * currently have one. */
 				s->tlsext_ticket_expected = 1;
-				return 0;	/* Cache miss */
+				return 1;
 				}
 			if (s->tls_session_secret_cb)
 				{
-				/* Indicate cache miss here and instead of
-				 * generating the session from ticket now,
-				 * trigger abbreviated handshake based on
-				 * external mechanism to calculate the master
-				 * secret later. */
-				return 0;
+				/* Indicate that the ticket couldn't be
+				 * decrypted rather than generating the session
+				 * from ticket now, trigger abbreviated
+				 * handshake based on external mechanism to
+				 * calculate the master secret later. */
+				return 2;
+				}
+			r = tls_decrypt_ticket(s, p, size, session_id, len, ret);
+			switch (r)
+				{
+				case 2: /* ticket couldn't be decrypted */
+					s->tlsext_ticket_expected = 1;
+					return 2;
+				case 3: /* ticket was decrypted */
+					return r;
+				case 4: /* ticket decrypted but need to renew */
+					s->tlsext_ticket_expected = 1;
+					return 3;
+				default: /* fatal error */
+					return -1;
 				}
-			return tls_decrypt_ticket(s, p, size, session_id, len,
-									ret);
 			}
 		p += size;
 		}
-	return 1;
+	return 0;
 	}
 
+/* tls_decrypt_ticket attempts to decrypt a session ticket.
+ *
+ *   etick: points to the body of the session ticket extension.
+ *   eticklen: the length of the session tickets extenion.
+ *   sess_id: points at the session ID.
+ *   sesslen: the length of the session ID.
+ *   psess: (output) on return, if a ticket was decrypted, then this is set to
+ *       point to the resulting session.
+ *
+ * Returns:
+ *   -1: fatal error, either from parsing or decrypting the ticket.
+ *    2: the ticket couldn't be decrypted.
+ *    3: a ticket was successfully decrypted and *psess was set.
+ *    4: same as 3, but the ticket needs to be renewed.
+ */
 static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen,
 				const unsigned char *sess_id, int sesslen,
 				SSL_SESSION **psess)
@@ -1667,7 +2149,7 @@ static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen,
 	SSL_CTX *tctx = s->initial_ctx;
 	/* Need at least keyname + iv + some encrypted data */
 	if (eticklen < 48)
-		goto tickerr;
+		return 2;
 	/* Initialize session ticket encryption and HMAC contexts */
 	HMAC_CTX_init(&hctx);
 	EVP_CIPHER_CTX_init(&ctx);
@@ -1679,7 +2161,7 @@ static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen,
 		if (rv < 0)
 			return -1;
 		if (rv == 0)
-			goto tickerr;
+			return 2;
 		if (rv == 2)
 			renew_ticket = 1;
 		}
@@ -1687,15 +2169,15 @@ static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen,
 		{
 		/* Check key name matches */
 		if (memcmp(etick, tctx->tlsext_tick_key_name, 16))
-			goto tickerr;
+			return 2;
 		HMAC_Init_ex(&hctx, tctx->tlsext_tick_hmac_key, 16,
 					tlsext_tick_md(), NULL);
 		EVP_DecryptInit_ex(&ctx, EVP_aes_128_cbc(), NULL,
 				tctx->tlsext_tick_aes_key, etick + 16);
 		}
 	/* Attempt to process session ticket, first conduct sanity and
- 	 * integrity checks on ticket.
- 	 */
+	 * integrity checks on ticket.
+	 */
 	mlen = HMAC_size(&hctx);
 	if (mlen < 0)
 		{
@@ -1708,7 +2190,7 @@ static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen,
 	HMAC_Final(&hctx, tick_hmac, NULL);
 	HMAC_CTX_cleanup(&hctx);
 	if (memcmp(tick_hmac, etick + eticklen, mlen))
-		goto tickerr;
+		return 2;
 	/* Attempt to decrypt session data */
 	/* Move p after IV to start of encrypted ticket, update length */
 	p = etick + 16 + EVP_CIPHER_CTX_iv_length(&ctx);
@@ -1721,33 +2203,376 @@ static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen,
 		}
 	EVP_DecryptUpdate(&ctx, sdec, &slen, p, eticklen);
 	if (EVP_DecryptFinal(&ctx, sdec + slen, &mlen) <= 0)
-		goto tickerr;
+		return 2;
 	slen += mlen;
 	EVP_CIPHER_CTX_cleanup(&ctx);
 	p = sdec;
-		
+
 	sess = d2i_SSL_SESSION(NULL, &p, slen);
 	OPENSSL_free(sdec);
 	if (sess)
 		{
-		/* The session ID if non-empty is used by some clients to
- 		 * detect that the ticket has been accepted. So we copy it to
- 		 * the session structure. If it is empty set length to zero
- 		 * as required by standard.
- 		 */
+		/* The session ID, if non-empty, is used by some clients to
+		 * detect that the ticket has been accepted. So we copy it to
+		 * the session structure. If it is empty set length to zero
+		 * as required by standard.
+		 */
 		if (sesslen)
 			memcpy(sess->session_id, sess_id, sesslen);
 		sess->session_id_length = sesslen;
 		*psess = sess;
-		s->tlsext_ticket_expected = renew_ticket;
+		if (renew_ticket)
+			return 4;
+		else
+			return 3;
+		}
+        ERR_clear_error();
+	/* For session parse failure, indicate that we need to send a new
+	 * ticket. */
+	return 2;
+	}
+
+/* Tables to translate from NIDs to TLS v1.2 ids */
+
+typedef struct 
+	{
+	int nid;
+	int id;
+	} tls12_lookup;
+
+static tls12_lookup tls12_md[] = {
+#ifndef OPENSSL_NO_MD5
+	{NID_md5, TLSEXT_hash_md5},
+#endif
+#ifndef OPENSSL_NO_SHA
+	{NID_sha1, TLSEXT_hash_sha1},
+#endif
+#ifndef OPENSSL_NO_SHA256
+	{NID_sha224, TLSEXT_hash_sha224},
+	{NID_sha256, TLSEXT_hash_sha256},
+#endif
+#ifndef OPENSSL_NO_SHA512
+	{NID_sha384, TLSEXT_hash_sha384},
+	{NID_sha512, TLSEXT_hash_sha512}
+#endif
+};
+
+static tls12_lookup tls12_sig[] = {
+#ifndef OPENSSL_NO_RSA
+	{EVP_PKEY_RSA, TLSEXT_signature_rsa},
+#endif
+#ifndef OPENSSL_NO_DSA
+	{EVP_PKEY_DSA, TLSEXT_signature_dsa},
+#endif
+#ifndef OPENSSL_NO_ECDSA
+	{EVP_PKEY_EC, TLSEXT_signature_ecdsa}
+#endif
+};
+
+static int tls12_find_id(int nid, tls12_lookup *table, size_t tlen)
+	{
+	size_t i;
+	for (i = 0; i < tlen; i++)
+		{
+		if (table[i].nid == nid)
+			return table[i].id;
+		}
+	return -1;
+	}
+#if 0
+static int tls12_find_nid(int id, tls12_lookup *table, size_t tlen)
+	{
+	size_t i;
+	for (i = 0; i < tlen; i++)
+		{
+		if (table[i].id == id)
+			return table[i].nid;
+		}
+	return -1;
+	}
+#endif
+
+int tls12_get_sigandhash(unsigned char *p, const EVP_PKEY *pk, const EVP_MD *md)
+	{
+	int sig_id, md_id;
+	if (!md)
+		return 0;
+	md_id = tls12_find_id(EVP_MD_type(md), tls12_md,
+				sizeof(tls12_md)/sizeof(tls12_lookup));
+	if (md_id == -1)
+		return 0;
+	sig_id = tls12_get_sigid(pk);
+	if (sig_id == -1)
+		return 0;
+	p[0] = (unsigned char)md_id;
+	p[1] = (unsigned char)sig_id;
+	return 1;
+	}
+
+int tls12_get_sigid(const EVP_PKEY *pk)
+	{
+	return tls12_find_id(pk->type, tls12_sig,
+				sizeof(tls12_sig)/sizeof(tls12_lookup));
+	}
+
+const EVP_MD *tls12_get_hash(unsigned char hash_alg)
+	{
+	switch(hash_alg)
+		{
+#ifndef OPENSSL_NO_MD5
+		case TLSEXT_hash_md5:
+#ifdef OPENSSL_FIPS
+		if (FIPS_mode())
+			return NULL;
+#endif
+		return EVP_md5();
+#endif
+#ifndef OPENSSL_NO_SHA
+		case TLSEXT_hash_sha1:
+		return EVP_sha1();
+#endif
+#ifndef OPENSSL_NO_SHA256
+		case TLSEXT_hash_sha224:
+		return EVP_sha224();
+
+		case TLSEXT_hash_sha256:
+		return EVP_sha256();
+#endif
+#ifndef OPENSSL_NO_SHA512
+		case TLSEXT_hash_sha384:
+		return EVP_sha384();
+
+		case TLSEXT_hash_sha512:
+		return EVP_sha512();
+#endif
+		default:
+		return NULL;
+
+		}
+	}
+
+/* Set preferred digest for each key type */
+
+int tls1_process_sigalgs(SSL *s, const unsigned char *data, int dsize)
+	{
+	int i, idx;
+	const EVP_MD *md;
+	CERT *c = s->cert;
+	/* Extension ignored for TLS versions below 1.2 */
+	if (TLS1_get_version(s) < TLS1_2_VERSION)
 		return 1;
+	/* Should never happen */
+	if (!c)
+		return 0;
+
+	c->pkeys[SSL_PKEY_DSA_SIGN].digest = NULL;
+	c->pkeys[SSL_PKEY_RSA_SIGN].digest = NULL;
+	c->pkeys[SSL_PKEY_RSA_ENC].digest = NULL;
+	c->pkeys[SSL_PKEY_ECC].digest = NULL;
+
+	for (i = 0; i < dsize; i += 2)
+		{
+		unsigned char hash_alg = data[i], sig_alg = data[i+1];
+
+		switch(sig_alg)
+			{
+#ifndef OPENSSL_NO_RSA
+			case TLSEXT_signature_rsa:
+			idx = SSL_PKEY_RSA_SIGN;
+			break;
+#endif
+#ifndef OPENSSL_NO_DSA
+			case TLSEXT_signature_dsa:
+			idx = SSL_PKEY_DSA_SIGN;
+			break;
+#endif
+#ifndef OPENSSL_NO_ECDSA
+			case TLSEXT_signature_ecdsa:
+			idx = SSL_PKEY_ECC;
+			break;
+#endif
+			default:
+			continue;
+			}
+
+		if (c->pkeys[idx].digest == NULL)
+			{
+			md = tls12_get_hash(hash_alg);
+			if (md)
+				{
+				c->pkeys[idx].digest = md;
+				if (idx == SSL_PKEY_RSA_SIGN)
+					c->pkeys[SSL_PKEY_RSA_ENC].digest = md;
+				}
+			}
+
 		}
-	/* If session decrypt failure indicate a cache miss and set state to
- 	 * send a new ticket
- 	 */
-	tickerr:	
-	s->tlsext_ticket_expected = 1;
+
+
+	/* Set any remaining keys to default values. NOTE: if alg is not
+	 * supported it stays as NULL.
+	 */
+#ifndef OPENSSL_NO_DSA
+	if (!c->pkeys[SSL_PKEY_DSA_SIGN].digest)
+		c->pkeys[SSL_PKEY_DSA_SIGN].digest = EVP_dss1();
+#endif
+#ifndef OPENSSL_NO_RSA
+	if (!c->pkeys[SSL_PKEY_RSA_SIGN].digest)
+		{
+		c->pkeys[SSL_PKEY_RSA_SIGN].digest = EVP_sha1();
+		c->pkeys[SSL_PKEY_RSA_ENC].digest = EVP_sha1();
+		}
+#endif
+#ifndef OPENSSL_NO_ECDSA
+	if (!c->pkeys[SSL_PKEY_ECC].digest)
+		c->pkeys[SSL_PKEY_ECC].digest = EVP_ecdsa();
+#endif
+	return 1;
+	}
+
+#endif
+
+#ifndef OPENSSL_NO_HEARTBEATS
+int
+tls1_process_heartbeat(SSL *s)
+	{
+	unsigned char *p = &s->s3->rrec.data[0], *pl;
+	unsigned short hbtype;
+	unsigned int payload;
+	unsigned int padding = 16; /* Use minimum padding */
+
+	/* Read type and payload length first */
+	hbtype = *p++;
+	n2s(p, payload);
+	pl = p;
+
+	if (s->msg_callback)
+		s->msg_callback(0, s->version, TLS1_RT_HEARTBEAT,
+			&s->s3->rrec.data[0], s->s3->rrec.length,
+			s, s->msg_callback_arg);
+
+	if (hbtype == TLS1_HB_REQUEST)
+		{
+		unsigned char *buffer, *bp;
+		int r;
+
+		/* Allocate memory for the response, size is 1 bytes
+		 * message type, plus 2 bytes payload length, plus
+		 * payload, plus padding
+		 */
+		buffer = OPENSSL_malloc(1 + 2 + payload + padding);
+		bp = buffer;
+		
+		/* Enter response type, length and copy payload */
+		*bp++ = TLS1_HB_RESPONSE;
+		s2n(payload, bp);
+		memcpy(bp, pl, payload);
+		bp += payload;
+		/* Random padding */
+		RAND_pseudo_bytes(bp, padding);
+
+		r = ssl3_write_bytes(s, TLS1_RT_HEARTBEAT, buffer, 3 + payload + padding);
+
+		if (r >= 0 && s->msg_callback)
+			s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT,
+				buffer, 3 + payload + padding,
+				s, s->msg_callback_arg);
+
+		OPENSSL_free(buffer);
+
+		if (r < 0)
+			return r;
+		}
+	else if (hbtype == TLS1_HB_RESPONSE)
+		{
+		unsigned int seq;
+		
+		/* We only send sequence numbers (2 bytes unsigned int),
+		 * and 16 random bytes, so we just try to read the
+		 * sequence number */
+		n2s(pl, seq);
+		
+		if (payload == 18 && seq == s->tlsext_hb_seq)
+			{
+			s->tlsext_hb_seq++;
+			s->tlsext_hb_pending = 0;
+			}
+		}
+
 	return 0;
 	}
 
+int
+tls1_heartbeat(SSL *s)
+	{
+	unsigned char *buf, *p;
+	int ret;
+	unsigned int payload = 18; /* Sequence number + random bytes */
+	unsigned int padding = 16; /* Use minimum padding */
+
+	/* Only send if peer supports and accepts HB requests... */
+	if (!(s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED) ||
+	    s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_SEND_REQUESTS)
+		{
+		SSLerr(SSL_F_TLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT);
+		return -1;
+		}
+
+	/* ...and there is none in flight yet... */
+	if (s->tlsext_hb_pending)
+		{
+		SSLerr(SSL_F_TLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PENDING);
+		return -1;
+		}
+		
+	/* ...and no handshake in progress. */
+	if (SSL_in_init(s) || s->in_handshake)
+		{
+		SSLerr(SSL_F_TLS1_HEARTBEAT,SSL_R_UNEXPECTED_MESSAGE);
+		return -1;
+		}
+		
+	/* Check if padding is too long, payload and padding
+	 * must not exceed 2^14 - 3 = 16381 bytes in total.
+	 */
+	OPENSSL_assert(payload + padding <= 16381);
+
+	/* Create HeartBeat message, we just use a sequence number
+	 * as payload to distuingish different messages and add
+	 * some random stuff.
+	 *  - Message Type, 1 byte
+	 *  - Payload Length, 2 bytes (unsigned int)
+	 *  - Payload, the sequence number (2 bytes uint)
+	 *  - Payload, random bytes (16 bytes uint)
+	 *  - Padding
+	 */
+	buf = OPENSSL_malloc(1 + 2 + payload + padding);
+	p = buf;
+	/* Message Type */
+	*p++ = TLS1_HB_REQUEST;
+	/* Payload length (18 bytes here) */
+	s2n(payload, p);
+	/* Sequence number */
+	s2n(s->tlsext_hb_seq, p);
+	/* 16 random bytes */
+	RAND_pseudo_bytes(p, 16);
+	p += 16;
+	/* Random padding */
+	RAND_pseudo_bytes(p, padding);
+
+	ret = ssl3_write_bytes(s, TLS1_RT_HEARTBEAT, buf, 3 + payload + padding);
+	if (ret >= 0)
+		{
+		if (s->msg_callback)
+			s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT,
+				buf, 3 + payload + padding,
+				s, s->msg_callback_arg);
+
+		s->tlsext_hb_pending = 1;
+		}
+		
+	OPENSSL_free(buf);
+
+	return ret;
+	}
 #endif
diff --git a/src/lib/libssl/t1_meth.c b/src/lib/libssl/t1_meth.c
index 6ce7c0bbf5..53c807de28 100644
--- a/src/lib/libssl/t1_meth.c
+++ b/src/lib/libssl/t1_meth.c
@@ -60,16 +60,28 @@
 #include <openssl/objects.h>
 #include "ssl_locl.h"
 
-static const SSL_METHOD *tls1_get_method(int ver);
 static const SSL_METHOD *tls1_get_method(int ver)
 	{
+	if (ver == TLS1_2_VERSION)
+		return TLSv1_2_method();
+	if (ver == TLS1_1_VERSION)
+		return TLSv1_1_method();
 	if (ver == TLS1_VERSION)
-		return(TLSv1_method());
-	else
-		return(NULL);
+		return TLSv1_method();
+	return NULL;
 	}
 
-IMPLEMENT_tls1_meth_func(TLSv1_method,
+IMPLEMENT_tls_meth_func(TLS1_2_VERSION, TLSv1_2_method,
+			ssl3_accept,
+			ssl3_connect,
+			tls1_get_method)
+
+IMPLEMENT_tls_meth_func(TLS1_1_VERSION, TLSv1_1_method,
+			ssl3_accept,
+			ssl3_connect,
+			tls1_get_method)
+
+IMPLEMENT_tls_meth_func(TLS1_VERSION, TLSv1_method,
 			ssl3_accept,
 			ssl3_connect,
 			tls1_get_method)
diff --git a/src/lib/libssl/t1_srvr.c b/src/lib/libssl/t1_srvr.c
index 42525e9e89..f1d1565769 100644
--- a/src/lib/libssl/t1_srvr.c
+++ b/src/lib/libssl/t1_srvr.c
@@ -67,13 +67,26 @@
 static const SSL_METHOD *tls1_get_server_method(int ver);
 static const SSL_METHOD *tls1_get_server_method(int ver)
 	{
+	if (ver == TLS1_2_VERSION)
+		return TLSv1_2_server_method();
+	if (ver == TLS1_1_VERSION)
+		return TLSv1_1_server_method();
 	if (ver == TLS1_VERSION)
-		return(TLSv1_server_method());
-	else
-		return(NULL);
+		return TLSv1_server_method();
+	return NULL;
 	}
 
-IMPLEMENT_tls1_meth_func(TLSv1_server_method,
+IMPLEMENT_tls_meth_func(TLS1_2_VERSION, TLSv1_2_server_method,
+			ssl3_accept,
+			ssl_undefined_function,
+			tls1_get_server_method)
+
+IMPLEMENT_tls_meth_func(TLS1_1_VERSION, TLSv1_1_server_method,
+			ssl3_accept,
+			ssl_undefined_function,
+			tls1_get_server_method)
+
+IMPLEMENT_tls_meth_func(TLS1_VERSION, TLSv1_server_method,
 			ssl3_accept,
 			ssl_undefined_function,
 			tls1_get_server_method)
diff --git a/src/lib/libssl/test/CAss.cnf b/src/lib/libssl/test/CAss.cnf
index 20f8f05e3d..109bc8c10b 100644
--- a/src/lib/libssl/test/CAss.cnf
+++ b/src/lib/libssl/test/CAss.cnf
@@ -7,7 +7,7 @@ RANDFILE		= ./.rnd
 
 ####################################################################
 [ req ]
-default_bits		= 512
+default_bits		= 2048
 default_keyfile 	= keySS.pem
 distinguished_name	= req_distinguished_name
 encrypt_rsa_key		= no
diff --git a/src/lib/libssl/test/P1ss.cnf b/src/lib/libssl/test/P1ss.cnf
index 876a0d35f8..326cce2ba8 100644
--- a/src/lib/libssl/test/P1ss.cnf
+++ b/src/lib/libssl/test/P1ss.cnf
@@ -7,7 +7,7 @@ RANDFILE		= ./.rnd
 
 ####################################################################
 [ req ]
-default_bits		= 512
+default_bits		= 1024
 default_keyfile 	= keySS.pem
 distinguished_name	= req_distinguished_name
 encrypt_rsa_key		= no
diff --git a/src/lib/libssl/test/P2ss.cnf b/src/lib/libssl/test/P2ss.cnf
index 373a87e7c2..8b502321b8 100644
--- a/src/lib/libssl/test/P2ss.cnf
+++ b/src/lib/libssl/test/P2ss.cnf
@@ -7,7 +7,7 @@ RANDFILE		= ./.rnd
 
 ####################################################################
 [ req ]
-default_bits		= 512
+default_bits		= 1024
 default_keyfile 	= keySS.pem
 distinguished_name	= req_distinguished_name
 encrypt_rsa_key		= no
diff --git a/src/lib/libssl/test/Uss.cnf b/src/lib/libssl/test/Uss.cnf
index 0c0ebb5f67..58ac0ca54d 100644
--- a/src/lib/libssl/test/Uss.cnf
+++ b/src/lib/libssl/test/Uss.cnf
@@ -7,11 +7,11 @@ RANDFILE		= ./.rnd
 
 ####################################################################
 [ req ]
-default_bits		= 512
+default_bits		= 2048
 default_keyfile 	= keySS.pem
 distinguished_name	= req_distinguished_name
 encrypt_rsa_key		= no
-default_md		= md2
+default_md		= sha256
 
 [ req_distinguished_name ]
 countryName			= Country Name (2 letter code)
diff --git a/src/lib/libssl/test/pkits-test.pl b/src/lib/libssl/test/pkits-test.pl
index 69dffa16f9..5c6b89fcdb 100644
--- a/src/lib/libssl/test/pkits-test.pl
+++ b/src/lib/libssl/test/pkits-test.pl
@@ -784,6 +784,15 @@ my $ossl = "ossl/apps/openssl";
 
 my $ossl_cmd = "$ossl_path cms -verify -verify_retcode ";
 $ossl_cmd .= "-CAfile pkitsta.pem -crl_check_all -x509_strict ";
+
+# Check for expiry of trust anchor
+system "$ossl_path x509 -inform DER -in $pkitsta -checkend 0";
+if ($? == 256)
+	{
+	print STDERR "WARNING: using older expired data\n";
+	$ossl_cmd .= "-attime 1291940972 ";
+	}
+
 $ossl_cmd .= "-policy_check -extended_crl -use_deltas -out /dev/null 2>&1 ";
 
 system "$ossl_path x509 -inform DER -in $pkitsta -out pkitsta.pem";
diff --git a/src/lib/libssl/test/test.cnf b/src/lib/libssl/test/test.cnf
index faad3914a8..10834442a1 100644
--- a/src/lib/libssl/test/test.cnf
+++ b/src/lib/libssl/test/test.cnf
@@ -56,7 +56,7 @@ emailAddress		= optional
 
 ####################################################################
 [ req ]
-default_bits		= 512
+default_bits		= 1024
 default_keyfile 	= testkey.pem
 distinguished_name	= req_distinguished_name
 encrypt_rsa_key		= no
diff --git a/src/lib/libssl/test/testssl b/src/lib/libssl/test/testssl
index b55364ae88..5ae4dc8720 100644
--- a/src/lib/libssl/test/testssl
+++ b/src/lib/libssl/test/testssl
@@ -148,4 +148,14 @@ $ssltest -tls1 -cipher PSK -psk abc123 $extra || exit 1
 echo test tls1 with PSK via BIO pair
 $ssltest -bio_pair -tls1 -cipher PSK -psk abc123 $extra || exit 1
 
+if ../util/shlib_wrap.sh ../apps/openssl no-srp; then
+  echo skipping SRP tests
+else
+  echo test tls1 with SRP
+  $ssltest -tls1 -cipher SRP -srpuser test -srppass abc123
+
+  echo test tls1 with SRP via BIO pair
+  $ssltest -bio_pair -tls1 -cipher SRP -srpuser test -srppass abc123
+fi
+
 exit 0
diff --git a/src/lib/libssl/tls1.h b/src/lib/libssl/tls1.h
index b3cc8f098b..c39c267f0b 100644
--- a/src/lib/libssl/tls1.h
+++ b/src/lib/libssl/tls1.h
@@ -159,10 +159,24 @@ extern "C" {
 
 #define TLS1_ALLOW_EXPERIMENTAL_CIPHERSUITES	0
 
+#define TLS1_2_VERSION			0x0303
+#define TLS1_2_VERSION_MAJOR		0x03
+#define TLS1_2_VERSION_MINOR		0x03
+
+#define TLS1_1_VERSION			0x0302
+#define TLS1_1_VERSION_MAJOR		0x03
+#define TLS1_1_VERSION_MINOR		0x02
+
 #define TLS1_VERSION			0x0301
 #define TLS1_VERSION_MAJOR		0x03
 #define TLS1_VERSION_MINOR		0x01
 
+#define TLS1_get_version(s) \
+		((s->version >> 8) == TLS1_VERSION_MAJOR ? s->version : 0)
+
+#define TLS1_get_client_version(s) \
+		((s->client_version >> 8) == TLS1_VERSION_MAJOR ? s->client_version : 0)
+
 #define TLS1_AD_DECRYPTION_FAILED	21
 #define TLS1_AD_RECORD_OVERFLOW		22
 #define TLS1_AD_UNKNOWN_CA		48	/* fatal */
@@ -183,17 +197,42 @@ extern "C" {
 #define TLS1_AD_BAD_CERTIFICATE_HASH_VALUE 114
 #define TLS1_AD_UNKNOWN_PSK_IDENTITY	115	/* fatal */
 
-/* ExtensionType values from RFC3546 / RFC4366 */
+/* ExtensionType values from RFC3546 / RFC4366 / RFC6066 */
 #define TLSEXT_TYPE_server_name			0
 #define TLSEXT_TYPE_max_fragment_length		1
 #define TLSEXT_TYPE_client_certificate_url	2
 #define TLSEXT_TYPE_trusted_ca_keys		3
 #define TLSEXT_TYPE_truncated_hmac		4
 #define TLSEXT_TYPE_status_request		5
+/* ExtensionType values from RFC4681 */
+#define TLSEXT_TYPE_user_mapping		6
+
+/* ExtensionType values from RFC5878 */
+#define TLSEXT_TYPE_client_authz		7
+#define TLSEXT_TYPE_server_authz		8
+
+/* ExtensionType values from RFC6091 */
+#define TLSEXT_TYPE_cert_type		9
+
 /* ExtensionType values from RFC4492 */
 #define TLSEXT_TYPE_elliptic_curves		10
 #define TLSEXT_TYPE_ec_point_formats		11
+
+/* ExtensionType value from RFC5054 */
+#define TLSEXT_TYPE_srp				12
+
+/* ExtensionType values from RFC5246 */
+#define TLSEXT_TYPE_signature_algorithms	13
+
+/* ExtensionType value from RFC5764 */
+#define TLSEXT_TYPE_use_srtp	14
+
+/* ExtensionType value from RFC5620 */
+#define TLSEXT_TYPE_heartbeat	15
+
+/* ExtensionType value from RFC4507 */
 #define TLSEXT_TYPE_session_ticket		35
+
 /* ExtensionType value from draft-rescorla-tls-opaque-prf-input-00.txt */
 #if 0 /* will have to be provided externally for now ,
        * i.e. build with -DTLSEXT_TYPE_opaque_prf_input=38183
@@ -204,6 +243,11 @@ extern "C" {
 /* Temporary extension type */
 #define TLSEXT_TYPE_renegotiate                 0xff01
 
+#ifndef OPENSSL_NO_NEXTPROTONEG
+/* This is not an IANA defined extension number */
+#define TLSEXT_TYPE_next_proto_neg		13172
+#endif
+
 /* NameType value from RFC 3546 */
 #define TLSEXT_NAMETYPE_host_name 0
 /* status request value from RFC 3546 */
@@ -216,12 +260,37 @@ extern "C" {
 #define TLSEXT_ECPOINTFORMAT_ansiX962_compressed_char2	2
 #define TLSEXT_ECPOINTFORMAT_last			2
 
+/* Signature and hash algorithms from RFC 5246 */
+
+#define TLSEXT_signature_anonymous			0
+#define TLSEXT_signature_rsa				1
+#define TLSEXT_signature_dsa				2
+#define TLSEXT_signature_ecdsa				3
+
+#define TLSEXT_hash_none				0
+#define TLSEXT_hash_md5					1
+#define TLSEXT_hash_sha1				2
+#define TLSEXT_hash_sha224				3
+#define TLSEXT_hash_sha256				4
+#define TLSEXT_hash_sha384				5
+#define TLSEXT_hash_sha512				6
+
 #ifndef OPENSSL_NO_TLSEXT
 
 #define TLSEXT_MAXLEN_host_name 255
 
-const char *SSL_get_servername(const SSL *s, const int type) ;
-int SSL_get_servername_type(const SSL *s) ;
+const char *SSL_get_servername(const SSL *s, const int type);
+int SSL_get_servername_type(const SSL *s);
+/* SSL_export_keying_material exports a value derived from the master secret,
+ * as specified in RFC 5705. It writes |olen| bytes to |out| given a label and
+ * optional context. (Since a zero length context is allowed, the |use_context|
+ * flag controls whether a context is included.)
+ *
+ * It returns 1 on success and zero otherwise.
+ */
+int SSL_export_keying_material(SSL *s, unsigned char *out, size_t olen,
+	const char *label, size_t llen, const unsigned char *p, size_t plen,
+	int use_context);
 
 #define SSL_set_tlsext_host_name(s,name) \
 SSL_ctrl(s,SSL_CTRL_SET_TLSEXT_HOSTNAME,TLSEXT_NAMETYPE_host_name,(char *)name)
@@ -285,6 +354,16 @@ SSL_CTX_ctrl(ctx,SSL_CTRL_SET_TLSEXT_OPAQUE_PRF_INPUT_CB_ARG, 0, arg)
 #define SSL_CTX_set_tlsext_ticket_key_cb(ssl, cb) \
 SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 
+#ifndef OPENSSL_NO_HEARTBEATS
+#define SSL_TLSEXT_HB_ENABLED				0x01
+#define SSL_TLSEXT_HB_DONT_SEND_REQUESTS	0x02
+#define SSL_TLSEXT_HB_DONT_RECV_REQUESTS	0x04
+
+#define SSL_get_tlsext_heartbeat_pending(ssl) \
+        SSL_ctrl((ssl),SSL_CTRL_GET_TLS_EXT_HEARTBEAT_PENDING,0,NULL)
+#define SSL_set_tlsext_heartbeat_no_requests(ssl, arg) \
+        SSL_ctrl((ssl),SSL_CTRL_SET_TLS_EXT_HEARTBEAT_NO_REQUESTS,arg,NULL)
+#endif
 #endif
 
 /* PSK ciphersuites from 4279 */
@@ -322,6 +401,14 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 #define TLS1_CK_DHE_RSA_WITH_AES_256_SHA		0x03000039
 #define TLS1_CK_ADH_WITH_AES_256_SHA			0x0300003A
 
+/* TLS v1.2 ciphersuites */
+#define TLS1_CK_RSA_WITH_NULL_SHA256			0x0300003B
+#define TLS1_CK_RSA_WITH_AES_128_SHA256			0x0300003C
+#define TLS1_CK_RSA_WITH_AES_256_SHA256			0x0300003D
+#define TLS1_CK_DH_DSS_WITH_AES_128_SHA256		0x0300003E
+#define TLS1_CK_DH_RSA_WITH_AES_128_SHA256		0x0300003F
+#define TLS1_CK_DHE_DSS_WITH_AES_128_SHA256		0x03000040
+
 /* Camellia ciphersuites from RFC4132 */
 #define TLS1_CK_RSA_WITH_CAMELLIA_128_CBC_SHA		0x03000041
 #define TLS1_CK_DH_DSS_WITH_CAMELLIA_128_CBC_SHA	0x03000042
@@ -330,6 +417,16 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 #define TLS1_CK_DHE_RSA_WITH_CAMELLIA_128_CBC_SHA	0x03000045
 #define TLS1_CK_ADH_WITH_CAMELLIA_128_CBC_SHA		0x03000046
 
+/* TLS v1.2 ciphersuites */
+#define TLS1_CK_DHE_RSA_WITH_AES_128_SHA256		0x03000067
+#define TLS1_CK_DH_DSS_WITH_AES_256_SHA256		0x03000068
+#define TLS1_CK_DH_RSA_WITH_AES_256_SHA256		0x03000069
+#define TLS1_CK_DHE_DSS_WITH_AES_256_SHA256		0x0300006A
+#define TLS1_CK_DHE_RSA_WITH_AES_256_SHA256		0x0300006B
+#define TLS1_CK_ADH_WITH_AES_128_SHA256			0x0300006C
+#define TLS1_CK_ADH_WITH_AES_256_SHA256			0x0300006D
+
+/* Camellia ciphersuites from RFC4132 */
 #define TLS1_CK_RSA_WITH_CAMELLIA_256_CBC_SHA		0x03000084
 #define TLS1_CK_DH_DSS_WITH_CAMELLIA_256_CBC_SHA	0x03000085
 #define TLS1_CK_DH_RSA_WITH_CAMELLIA_256_CBC_SHA	0x03000086
@@ -345,6 +442,20 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 #define TLS1_CK_DHE_RSA_WITH_SEED_SHA                   0x0300009A
 #define TLS1_CK_ADH_WITH_SEED_SHA                	0x0300009B
 
+/* TLS v1.2 GCM ciphersuites from RFC5288 */
+#define TLS1_CK_RSA_WITH_AES_128_GCM_SHA256		0x0300009C
+#define TLS1_CK_RSA_WITH_AES_256_GCM_SHA384		0x0300009D
+#define TLS1_CK_DHE_RSA_WITH_AES_128_GCM_SHA256		0x0300009E
+#define TLS1_CK_DHE_RSA_WITH_AES_256_GCM_SHA384		0x0300009F
+#define TLS1_CK_DH_RSA_WITH_AES_128_GCM_SHA256		0x030000A0
+#define TLS1_CK_DH_RSA_WITH_AES_256_GCM_SHA384		0x030000A1
+#define TLS1_CK_DHE_DSS_WITH_AES_128_GCM_SHA256		0x030000A2
+#define TLS1_CK_DHE_DSS_WITH_AES_256_GCM_SHA384		0x030000A3
+#define TLS1_CK_DH_DSS_WITH_AES_128_GCM_SHA256		0x030000A4
+#define TLS1_CK_DH_DSS_WITH_AES_256_GCM_SHA384		0x030000A5
+#define TLS1_CK_ADH_WITH_AES_128_GCM_SHA256		0x030000A6
+#define TLS1_CK_ADH_WITH_AES_256_GCM_SHA384		0x030000A7
+
 /* ECC ciphersuites from draft-ietf-tls-ecc-12.txt with changes soon to be in draft 13 */
 #define TLS1_CK_ECDH_ECDSA_WITH_NULL_SHA                0x0300C001
 #define TLS1_CK_ECDH_ECDSA_WITH_RC4_128_SHA             0x0300C002
@@ -376,6 +487,38 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 #define TLS1_CK_ECDH_anon_WITH_AES_128_CBC_SHA          0x0300C018
 #define TLS1_CK_ECDH_anon_WITH_AES_256_CBC_SHA          0x0300C019
 
+/* SRP ciphersuites from RFC 5054 */
+#define TLS1_CK_SRP_SHA_WITH_3DES_EDE_CBC_SHA		0x0300C01A
+#define TLS1_CK_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA	0x0300C01B
+#define TLS1_CK_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA	0x0300C01C
+#define TLS1_CK_SRP_SHA_WITH_AES_128_CBC_SHA		0x0300C01D
+#define TLS1_CK_SRP_SHA_RSA_WITH_AES_128_CBC_SHA	0x0300C01E
+#define TLS1_CK_SRP_SHA_DSS_WITH_AES_128_CBC_SHA	0x0300C01F
+#define TLS1_CK_SRP_SHA_WITH_AES_256_CBC_SHA		0x0300C020
+#define TLS1_CK_SRP_SHA_RSA_WITH_AES_256_CBC_SHA	0x0300C021
+#define TLS1_CK_SRP_SHA_DSS_WITH_AES_256_CBC_SHA	0x0300C022
+
+/* ECDH HMAC based ciphersuites from RFC5289 */
+
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_128_SHA256         0x0300C023
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_256_SHA384         0x0300C024
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_128_SHA256          0x0300C025
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_256_SHA384          0x0300C026
+#define TLS1_CK_ECDHE_RSA_WITH_AES_128_SHA256           0x0300C027
+#define TLS1_CK_ECDHE_RSA_WITH_AES_256_SHA384           0x0300C028
+#define TLS1_CK_ECDH_RSA_WITH_AES_128_SHA256            0x0300C029
+#define TLS1_CK_ECDH_RSA_WITH_AES_256_SHA384            0x0300C02A
+
+/* ECDH GCM based ciphersuites from RFC5289 */
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256	0x0300C02B
+#define TLS1_CK_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384	0x0300C02C
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_128_GCM_SHA256      0x0300C02D
+#define TLS1_CK_ECDH_ECDSA_WITH_AES_256_GCM_SHA384      0x0300C02E
+#define TLS1_CK_ECDHE_RSA_WITH_AES_128_GCM_SHA256       0x0300C02F
+#define TLS1_CK_ECDHE_RSA_WITH_AES_256_GCM_SHA384       0x0300C030
+#define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256        0x0300C031
+#define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384        0x0300C032
+
 /* XXX
  * Inconsistency alert:
  * The OpenSSL names of ciphers with ephemeral DH here include the string
@@ -443,6 +586,17 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 #define TLS1_TXT_PSK_WITH_AES_128_CBC_SHA		"PSK-AES128-CBC-SHA"
 #define TLS1_TXT_PSK_WITH_AES_256_CBC_SHA		"PSK-AES256-CBC-SHA"
 
+/* SRP ciphersuite from RFC 5054 */
+#define TLS1_TXT_SRP_SHA_WITH_3DES_EDE_CBC_SHA		"SRP-3DES-EDE-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA	"SRP-RSA-3DES-EDE-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA	"SRP-DSS-3DES-EDE-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_WITH_AES_128_CBC_SHA		"SRP-AES-128-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_RSA_WITH_AES_128_CBC_SHA	"SRP-RSA-AES-128-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_DSS_WITH_AES_128_CBC_SHA	"SRP-DSS-AES-128-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_WITH_AES_256_CBC_SHA		"SRP-AES-256-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_RSA_WITH_AES_256_CBC_SHA	"SRP-RSA-AES-256-CBC-SHA"
+#define TLS1_TXT_SRP_SHA_DSS_WITH_AES_256_CBC_SHA	"SRP-DSS-AES-256-CBC-SHA"
+
 /* Camellia ciphersuites from RFC4132 */
 #define TLS1_TXT_RSA_WITH_CAMELLIA_128_CBC_SHA		"CAMELLIA128-SHA"
 #define TLS1_TXT_DH_DSS_WITH_CAMELLIA_128_CBC_SHA	"DH-DSS-CAMELLIA128-SHA"
@@ -466,6 +620,55 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb)
 #define TLS1_TXT_DHE_RSA_WITH_SEED_SHA                  "DHE-RSA-SEED-SHA"
 #define TLS1_TXT_ADH_WITH_SEED_SHA                      "ADH-SEED-SHA"
 
+/* TLS v1.2 ciphersuites */
+#define TLS1_TXT_RSA_WITH_NULL_SHA256			"NULL-SHA256"
+#define TLS1_TXT_RSA_WITH_AES_128_SHA256		"AES128-SHA256"
+#define TLS1_TXT_RSA_WITH_AES_256_SHA256		"AES256-SHA256"
+#define TLS1_TXT_DH_DSS_WITH_AES_128_SHA256		"DH-DSS-AES128-SHA256"
+#define TLS1_TXT_DH_RSA_WITH_AES_128_SHA256		"DH-RSA-AES128-SHA256"
+#define TLS1_TXT_DHE_DSS_WITH_AES_128_SHA256		"DHE-DSS-AES128-SHA256"
+#define TLS1_TXT_DHE_RSA_WITH_AES_128_SHA256		"DHE-RSA-AES128-SHA256"
+#define TLS1_TXT_DH_DSS_WITH_AES_256_SHA256		"DH-DSS-AES256-SHA256"
+#define TLS1_TXT_DH_RSA_WITH_AES_256_SHA256		"DH-RSA-AES256-SHA256"
+#define TLS1_TXT_DHE_DSS_WITH_AES_256_SHA256		"DHE-DSS-AES256-SHA256"
+#define TLS1_TXT_DHE_RSA_WITH_AES_256_SHA256		"DHE-RSA-AES256-SHA256"
+#define TLS1_TXT_ADH_WITH_AES_128_SHA256		"ADH-AES128-SHA256"
+#define TLS1_TXT_ADH_WITH_AES_256_SHA256		"ADH-AES256-SHA256"
+
+/* TLS v1.2 GCM ciphersuites from RFC5288 */
+#define TLS1_TXT_RSA_WITH_AES_128_GCM_SHA256		"AES128-GCM-SHA256"
+#define TLS1_TXT_RSA_WITH_AES_256_GCM_SHA384		"AES256-GCM-SHA384"
+#define TLS1_TXT_DHE_RSA_WITH_AES_128_GCM_SHA256	"DHE-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_DHE_RSA_WITH_AES_256_GCM_SHA384	"DHE-RSA-AES256-GCM-SHA384"
+#define TLS1_TXT_DH_RSA_WITH_AES_128_GCM_SHA256		"DH-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_DH_RSA_WITH_AES_256_GCM_SHA384		"DH-RSA-AES256-GCM-SHA384"
+#define TLS1_TXT_DHE_DSS_WITH_AES_128_GCM_SHA256	"DHE-DSS-AES128-GCM-SHA256"
+#define TLS1_TXT_DHE_DSS_WITH_AES_256_GCM_SHA384	"DHE-DSS-AES256-GCM-SHA384"
+#define TLS1_TXT_DH_DSS_WITH_AES_128_GCM_SHA256		"DH-DSS-AES128-GCM-SHA256"
+#define TLS1_TXT_DH_DSS_WITH_AES_256_GCM_SHA384		"DH-DSS-AES256-GCM-SHA384"
+#define TLS1_TXT_ADH_WITH_AES_128_GCM_SHA256		"ADH-AES128-GCM-SHA256"
+#define TLS1_TXT_ADH_WITH_AES_256_GCM_SHA384		"ADH-AES256-GCM-SHA384"
+
+/* ECDH HMAC based ciphersuites from RFC5289 */
+
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_SHA256    "ECDHE-ECDSA-AES128-SHA256"
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_SHA384    "ECDHE-ECDSA-AES256-SHA384"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_128_SHA256     "ECDH-ECDSA-AES128-SHA256"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_256_SHA384     "ECDH-ECDSA-AES256-SHA384"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_128_SHA256      "ECDHE-RSA-AES128-SHA256"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_256_SHA384      "ECDHE-RSA-AES256-SHA384"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_128_SHA256       "ECDH-RSA-AES128-SHA256"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_256_SHA384       "ECDH-RSA-AES256-SHA384"
+
+/* ECDH GCM based ciphersuites from RFC5289 */
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256    "ECDHE-ECDSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384    "ECDHE-ECDSA-AES256-GCM-SHA384"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_128_GCM_SHA256     "ECDH-ECDSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDH_ECDSA_WITH_AES_256_GCM_SHA384     "ECDH-ECDSA-AES256-GCM-SHA384"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_128_GCM_SHA256      "ECDHE-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDHE_RSA_WITH_AES_256_GCM_SHA384      "ECDHE-RSA-AES256-GCM-SHA384"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256       "ECDH-RSA-AES128-GCM-SHA256"
+#define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384       "ECDH-RSA-AES256-GCM-SHA384"
 
 #define TLS_CT_RSA_SIGN			1
 #define TLS_CT_DSS_SIGN			2
-- 
cgit v1.2.3-55-g6feb